1"""
2Module contains tools for processing files into DataFrames or other objects
3"""
4
5from collections import abc, defaultdict
6import csv
7import datetime
8from io import StringIO
9import itertools
10import re
11import sys
12from textwrap import fill
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Set,
    Type,
    Union,
    cast,
)
import warnings

import numpy as np

import pandas._libs.lib as lib
import pandas._libs.ops as libops
import pandas._libs.parsers as parsers
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas._typing import FilePathOrBuffer, StorageOptions
from pandas.errors import (
    AbstractMethodError,
    EmptyDataError,
    ParserError,
    ParserWarning,
)
from pandas.util._decorators import Appender

from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
    ensure_object,
    ensure_str,
    is_bool_dtype,
    is_categorical_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_file_like,
    is_float,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna

from pandas.core import algorithms, generic
from pandas.core.arrays import Categorical
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    RangeIndex,
    ensure_index_from_sequences,
)
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import IOHandles, get_handle, validate_header_arg
from pandas.io.date_converters import generic_parser

# BOM character (byte order mark)
# This exists at the beginning of a file to indicate endianness
# of a file (stream). Unfortunately, this marker screws up parsing,
# so we need to remove it if we see it.
_BOM = "\ufeff"

_doc_read_csv_and_table = (
    r"""
{summary}

Also supports optionally iterating or breaking the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Delimiter to use. If sep is None, the C engine cannot automatically detect
    the separator, but the Python parsing engine can, meaning the latter will
    be used and the separator will be detected automatically by Python's
    builtin sniffer tool, ``csv.Sniffer``. In addition, separators longer than
    1 character and different from ``'\s+'`` will be interpreted as regular
    expressions and will also force the use of the Python parsing engine. Note
    that regex delimiters are prone to ignoring quoted data. Regex example:
    ``'\r\t'``.
delimiter : str, default ``None``
    Alias for sep.
header : int, list of int, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file; if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, default ``None``
    Column(s) to use as the row labels of the ``DataFrame``, either given as
    string name or column index. If a sequence of int / str is given, a
    MultiIndex is used.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g. when you have a malformed file with delimiters at
    the end of each line.
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. ``{{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}``. Use `str` or `object` together with suitable `na_values`
    settings to preserve and not interpret dtype. If converters are specified,
    they will be applied INSTEAD of dtype conversion.
engine : {{'c', 'python'}}, optional
    Parser engine to use. The C engine is faster while the python engine is
    currently more feature-complete.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can either
    be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values.  By default the following values are interpreted as
    NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified in `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If True, use a cache of unique, converted dates to apply the datetime
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

    .. versionadded:: 0.25.0
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
    For on-the-fly decompression of on-disk data. If 'infer' and
    `filepath_or_buffer` is path-like, then detect compression from the
    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
    decompression). If using 'zip', the ZIP file must contain only one data
    file to be read in. Set to None for no decompression.
thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_.

    .. versionchanged:: 1.2

       When ``encoding`` is ``None``, ``errors="replace"`` is passed to
       ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
       This behavior was previously only the case for ``engine="python"``.
dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
error_bad_lines : bool, default True
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.
warn_bad_lines : bool, default True
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.
delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless;
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or 'high' for the ordinary converter,
    'legacy' for the original lower precision pandas converter, and
    'round_trip' for the round-trip converter.

    .. versionchanged:: 1.2

{storage_options}

    .. versionadded:: 1.2

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as a two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)


def validate_integer(name, val, min_val=0):
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : string
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
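
    Examples
    --------
    A float that is exactly representable as an integer is accepted and
    cast; an integer below ``min_val`` raises:

    >>> validate_integer("nrows", 3.0)
    3
    >>> validate_integer("chunksize", 0, min_val=1)
    Traceback (most recent call last):
        ...
    ValueError: 'chunksize' must be an integer >=1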
409    """
410    msg = f"'{name:s}' must be an integer >={min_val:d}"
411
412    if val is not None:
413        if is_float(val):
414            if int(val) != val:
415                raise ValueError(msg)
416            val = int(val)
417        elif not (is_integer(val) and val >= min_val):
418            raise ValueError(msg)
419
420    return val
421
422
def _validate_names(names):
    """
    Raise ValueError if the `names` parameter contains duplicates or has an
    invalid data type.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Raises
    ------
    ValueError
        If names are not unique or are not ordered (e.g. set).
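
    Examples
    --------
    >>> _validate_names(["a", "b", "a"])
    Traceback (most recent call last):
        ...
    ValueError: Duplicate names are not allowed.
    >>> _validate_names({"a", "b"})
    Traceback (most recent call last):
        ...
    ValueError: Names should be an ordered collection.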
437    """
438    if names is not None:
439        if len(names) != len(set(names)):
440            raise ValueError("Duplicate names are not allowed.")
441        if not (
442            is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
443        ):
444            raise ValueError("Names should be an ordered collection.")
445
446
447def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
448    """Generic reader of line files."""
449    if kwds.get("date_parser", None) is not None:
450        if isinstance(kwds["parse_dates"], bool):
451            kwds["parse_dates"] = True
452
453    # Extract some of the arguments (pass chunksize on).
454    iterator = kwds.get("iterator", False)
455    chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1)
456    nrows = kwds.get("nrows", None)
457
458    # Check for duplicates in names.
459    _validate_names(kwds.get("names", None))
460
461    # Create the parser.
462    parser = TextFileReader(filepath_or_buffer, **kwds)
463
464    if chunksize or iterator:
465        return parser
466
467    with parser:
468        return parser.read(nrows)
469
470
_parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "prefix": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "squeeze": False,
    "compression": None,
    "mangle_dupe_cols": True,
    "infer_datetime_format": False,
    "skip_blank_lines": True,
}


_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "error_bad_lines": True,
    "warn_bad_lines": True,
    "float_precision": None,
}

_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}

_deprecated_defaults: Dict[str, Any] = {}
_deprecated_args: Set[str] = set()


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=generic._shared_docs["storage_options"],
    )
)
def read_csv(
    filepath_or_buffer: FilePathOrBuffer,
    sep=lib.no_default,
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal: str = ".",
    lineterminator=None,
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    # Error Handling
    error_bad_lines=True,
    warn_bad_lines=True,
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map=False,
    float_precision=None,
    storage_options: StorageOptions = None,
):
    kwds = locals()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","}
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=generic._shared_docs["storage_options"],
    )
)
def read_table(
    filepath_or_buffer: FilePathOrBuffer,
    sep=lib.no_default,
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal: str = ".",
    lineterminator=None,
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    # Error Handling
    error_bad_lines=True,
    warn_bad_lines=True,
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map=False,
    float_precision=None,
):
    kwds = locals()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"}
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)


def read_fwf(
    filepath_or_buffer: FilePathOrBuffer,
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., ``[from, to)``).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.

        .. versionadded:: 0.24.0
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as a two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    elif colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
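    # For example, widths=[3, 5] is translated here to colspecs=[(0, 3), (3, 8)].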
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    return _read(filepath_or_buffer, kwds)


class TextFileReader(abc.Iterator):
    """
    Iterator for reading a file in chunks, returned by the ``read_*``
    functions when ``iterator=True`` or ``chunksize`` is set.

    A dialect passed via ``dialect`` overrides any of the related parser
    options (``delimiter``, ``doublequote``, ``escapechar``,
    ``skipinitialspace``, ``quotechar``, and ``quoting``).
    """

    def __init__(self, f, engine=None, **kwds):

        self.f = f

        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            kwds = _merge_with_dialect_properties(dialect, kwds)

        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)
        self.squeeze = options.pop("squeeze", False)

        self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self._engine = self._make_engine(self.engine)

    def close(self):
        self._engine.close()

    def _get_options_with_defaults(self, engine):
        kwds = self.orig_options

        options = {}

        for argname, default in _parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if argname == "mangle_dupe_cols" and not value:
                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
            else:
                options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif value == _deprecated_defaults.get(argname, default):
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = _deprecated_defaults.get(argname, default)
            options[argname] = value

        if engine == "python-fwf":
            # pandas\io\parsers.py:907: error: Incompatible types in assignment
            # (expression has type "object", variable has type "Union[int, str,
            # None]")  [assignment]
            for argname, default in _fwf_defaults.items():  # type: ignore[assignment]
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine):
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__next__"):
            # The C engine doesn't need the file-like to have the "__next__"
            # attribute. However, the Python engine explicitly calls
            # "__next__(...)" when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(self, options, engine):
        result = options.copy()

        fallback_reason = None

        # 'skipfooter' is not supported by the C engine
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine == "c":
                fallback_reason = (
                    "the 'c' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    "the 'c' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    "is > 1 char long, and the 'c' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    "and the 'c' engine does not support such quotechars"
                )
                engine = "python"

        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=5,
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        for arg in _deprecated_args:
            parser_default = _c_parser_defaults[arg]
            depr_default = _deprecated_defaults[arg]
            if result.get(arg, depr_default) != depr_default:
                msg = (
                    f"The {arg} argument has been deprecated and will be "
                    "removed in a future version.\n\n"
                )
                warnings.warn(msg, FutureWarning, stacklevel=2)
            else:
                result[arg] = parser_default

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if _is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python parsers
        if engine != "c":
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self):
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(self, engine="c"):
        mapping: Dict[str, Type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        # error: Too many arguments for "ParserBase"
        return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]

    def _failover_to_python(self):
        raise AbstractMethodError(self)

    def read(self, nrows=None):
        nrows = validate_integer("nrows", nrows)
        index, columns, col_dict = self._engine.read(nrows)

        if index is None:
            if col_dict:
                # Any column is actually fine:
                new_rows = len(next(iter(col_dict.values())))
                index = RangeIndex(self._currow, self._currow + new_rows)
            else:
                new_rows = 0
        else:
            new_rows = len(index)

        df = DataFrame(col_dict, columns=columns, index=index)

        self._currow += new_rows

        if self.squeeze and len(df.columns) == 1:
            return df[df.columns[0]].copy()
        return df

    def get_chunk(self, size=None):
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


def _is_index_col(col):
    return col is not None and col is not False


def _is_potential_multi_index(
    columns, index_col: Optional[Union[bool, Sequence[int]]] = None
):
    """
    Check whether or not the `columns` parameter
    could be converted into a MultiIndex.

    Parameters
    ----------
    columns : array-like
        Object which may or may not be convertible into a MultiIndex
    index_col : None, bool or list, optional
        Column or columns to use as the (possibly hierarchical) index

    Returns
    -------
    bool
        Whether or not columns could become a MultiIndex
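
    Examples
    --------
    Tuples throughout (outside of any index columns) signal a potential
    MultiIndex:

    >>> _is_potential_multi_index([("a", "b"), ("a", "c")])
    True
    >>> _is_potential_multi_index(["a", "b"])
    False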
1114    """
1115    if index_col is None or isinstance(index_col, bool):
1116        index_col = []
1117
1118    return (
1119        len(columns)
1120        and not isinstance(columns, MultiIndex)
1121        and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
1122    )
1123
1124
1125def _evaluate_usecols(usecols, names):
1126    """
1127    Check whether or not the 'usecols' parameter
1128    is a callable.  If so, enumerates the 'names'
1129    parameter and returns a set of indices for
1130    each entry in 'names' that evaluates to True.
1131    If not a callable, returns 'usecols'.
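
    Examples
    --------
    >>> _evaluate_usecols(lambda x: x.upper() in ["A", "C"], ["a", "b", "c"])
    {0, 2}
    >>> _evaluate_usecols([0, 2], ["a", "b", "c"])
    [0, 2]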
1132    """
1133    if callable(usecols):
1134        return {i for i, name in enumerate(names) if usecols(name)}
1135    return usecols
1136
1137
1138def _validate_usecols_names(usecols, names):
1139    """
1140    Validates that all usecols are present in a given
1141    list of names. If not, raise a ValueError that
1142    shows what usecols are missing.
1143
1144    Parameters
1145    ----------
1146    usecols : iterable of usecols
1147        The columns to validate are present in names.
1148    names : iterable of names
1149        The column names to check against.
1150
1151    Returns
1152    -------
1153    usecols : iterable of usecols
1154        The `usecols` parameter if the validation succeeds.
1155
1156    Raises
1157    ------
1158    ValueError : Columns were missing. Error message will list them.
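
    Examples
    --------
    >>> _validate_usecols_names(["a", "b"], ["a", "b", "c"])
    ['a', 'b']
    >>> _validate_usecols_names(["a", "d"], ["a", "b", "c"])
    Traceback (most recent call last):
        ...
    ValueError: Usecols do not match columns, columns expected but not found: ['d']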
1159    """
1160    missing = [c for c in usecols if c not in names]
1161    if len(missing) > 0:
1162        raise ValueError(
1163            f"Usecols do not match columns, columns expected but not found: {missing}"
1164        )
1165
1166    return usecols
1167
1168
1169def _validate_skipfooter_arg(skipfooter):
1170    """
1171    Validate the 'skipfooter' parameter.
1172
1173    Checks whether 'skipfooter' is a non-negative integer.
1174    Raises a ValueError if that is not the case.
1175
1176    Parameters
1177    ----------
1178    skipfooter : non-negative integer
1179        The number of rows to skip at the end of the file.
1180
1181    Returns
1182    -------
1183    validated_skipfooter : non-negative integer
1184        The original input if the validation succeeds.
1185
1186    Raises
1187    ------
1188    ValueError : 'skipfooter' was not a non-negative integer.
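
    Examples
    --------
    >>> _validate_skipfooter_arg(2)
    2
    >>> _validate_skipfooter_arg(-1)
    Traceback (most recent call last):
        ...
    ValueError: skipfooter cannot be negative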
1189    """
1190    if not is_integer(skipfooter):
1191        raise ValueError("skipfooter must be an integer")
1192
1193    if skipfooter < 0:
1194        raise ValueError("skipfooter cannot be negative")
1195
1196    return skipfooter
1197
1198
1199def _validate_usecols_arg(usecols):
1200    """
1201    Validate the 'usecols' parameter.
1202
1203    Checks whether or not the 'usecols' parameter contains all integers
1204    (column selection by index), strings (column by name) or is a callable.
1205    Raises a ValueError if that is not the case.
1206
1207    Parameters
1208    ----------
1209    usecols : list-like, callable, or None
1210        List of columns to use when parsing or a callable that can be used
1211        to filter a list of table columns.
1212
1213    Returns
1214    -------
1215    usecols_tuple : tuple
1216        A tuple of (verified_usecols, usecols_dtype).
1217
1218        'verified_usecols' is either a set if an array-like is passed in or
1219        'usecols' if a callable or None is passed in.
1220
1221        'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
1222        is passed in or None if a callable or None is passed in.
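
    Examples
    --------
    >>> _validate_usecols_arg([0, 1, 2])
    ({0, 1, 2}, 'integer')
    >>> _validate_usecols_arg(None)
    (None, None)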
1223    """
1224    msg = (
1225        "'usecols' must either be list-like of all strings, all unicode, "
1226        "all integers or a callable."
1227    )
1228    if usecols is not None:
1229        if callable(usecols):
1230            return usecols, None
1231
1232        if not is_list_like(usecols):
1233            # see gh-20529
1234            #
1235            # Ensure it is iterable container but not string.
1236            raise ValueError(msg)
1237
1238        usecols_dtype = lib.infer_dtype(usecols, skipna=False)
1239
1240        if usecols_dtype not in ("empty", "integer", "string"):
1241            raise ValueError(msg)
1242
1243        usecols = set(usecols)
1244
1245        return usecols, usecols_dtype
1246    return usecols, None
1247
1248
1249def _validate_parse_dates_arg(parse_dates):
1250    """
1251    Check whether or not the 'parse_dates' parameter
1252    is a non-boolean scalar. Raises a ValueError if
1253    that is the case.
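
    Examples
    --------
    >>> _validate_parse_dates_arg([0, 1])
    [0, 1]
    >>> _validate_parse_dates_arg("a")
    Traceback (most recent call last):
        ...
    TypeError: Only booleans, lists, and dictionaries are accepted for the 'parse_dates' parameter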
1254    """
1255    msg = (
1256        "Only booleans, lists, and dictionaries are accepted "
1257        "for the 'parse_dates' parameter"
1258    )
1259
1260    if parse_dates is not None:
1261        if is_scalar(parse_dates):
1262            if not lib.is_bool(parse_dates):
1263                raise TypeError(msg)
1264
1265        elif not isinstance(parse_dates, (list, dict)):
1266            raise TypeError(msg)
1267
1268    return parse_dates
1269
1270
class ParserBase:
    def __init__(self, kwds):

        self.names = kwds.get("names")
        self.orig_names: Optional[List] = None
        self.prefix = kwds.pop("prefix", None)

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: Set = set()
        self.index_names: Optional[List] = None
        self.col_names = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self.date_parser = kwds.pop("date_parser", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
        self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            dayfirst=self.dayfirst,
            infer_datetime_format=self.infer_datetime_format,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if isinstance(self.header, (list, tuple, np.ndarray)):
            if not all(map(is_integer, self.header)):
                raise ValueError("header must be integer or list of integers")
            if any(i < 0 for i in self.header):
                raise ValueError(
                    "cannot specify multi-index header with negative integers"
                )
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

            # validate index_col that only contains integers
            if self.index_col is not None:
                is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
                if not (
                    is_sequence
                    and all(map(is_integer, self.index_col))
                    or is_integer(self.index_col)
                ):
                    raise ValueError(
                        "index_col must only contain row numbers "
                        "when specifying a multi-index header"
                    )
        elif self.header is not None:
            # GH 27394
            if self.prefix is not None:
                raise ValueError(
                    "Argument prefix must be None if argument header is not None"
                )
            # GH 16338
            elif not is_integer(self.header):
                raise ValueError("header must be integer or list of integers")
            # GH 27779
            elif self.header < 0:
                raise ValueError(
                    "Passing negative integer to header is invalid. "
                    "For no header, use header=None instead"
                )

        self._name_processed = False

        self._first_chunk = True

        self.handles: Optional[IOHandles] = None

    def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
        """
        Let the readers open IOHandles after they are done with their potential raises.
        """
        self.handles = get_handle(
            src,
            "r",
            encoding=kwds.get("encoding", None),
            compression=kwds.get("compression", None),
            memory_map=kwds.get("memory_map", False),
            storage_options=kwds.get("storage_options", None),
        )

    def _validate_parse_dates_presence(self, columns: List[str]) -> None:
        """
        Check if parse_dates are in columns.

        If user has provided names for parse_dates, check if those columns
        are available.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Raises
        ------
        ValueError
            If a column referenced in parse_dates is not in the dataframe.

        """
        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #     Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) else [col] for col in self.parse_dates
            )
        else:
            cols_needed = []

        # get only columns that are referenced using names (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )

    def close(self):
        if self.handles is not None:
            self.handles.close()

    @property
    def _has_complex_date_col(self):
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    def _should_parse_dates(self, i):
        if isinstance(self.parse_dates, bool):
            return self.parse_dates
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = self.index_col[i]

            if is_scalar(self.parse_dates):
                return (j == self.parse_dates) or (
                    name is not None and name == self.parse_dates
                )
            else:
                return (j in self.parse_dates) or (
                    name is not None and name in self.parse_dates
                )

    def _extract_multi_indexer_columns(
        self, header, index_names, col_names, passed_names=False
    ):
        """
        Extract and return the names, index_names, and col_names.

        `header` is a list of lists returned from the parsers, one list per
        header row, with the row of index names (if any) appended last; that
        final row is popped off below.
        """
1457        if len(header) < 2:
1458            return header[0], index_names, col_names, passed_names
1459
1460        # the names are the tuples of the header that are not the index cols
1461        # 0 is the name of the index, assuming index_col is a list of column
1462        # numbers
1463        ic = self.index_col
1464        if ic is None:
1465            ic = []
1466
1467        if not isinstance(ic, (list, tuple, np.ndarray)):
1468            ic = [ic]
1469        sic = set(ic)
1470
1471        # clean the index_names
1472        index_names = header.pop(-1)
1473        index_names, names, index_col = _clean_index_names(
1474            index_names, self.index_col, self.unnamed_cols
1475        )
1476
1477        # extract the columns
1478        field_count = len(header[0])
1479
1480        def extract(r):
1481            return tuple(r[i] for i in range(field_count) if i not in sic)
1482
1483        columns = list(zip(*(extract(r) for r in header)))
1484        names = ic + columns
1485
1486        # If we find unnamed columns all in a single
1487        # level, then our header was too long.
1488        for n in range(len(columns[0])):
1489            if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
1490                header = ",".join(str(x) for x in self.header)
                raise ParserError(
                    f"Passed header=[{header}] specifies too many rows "
                    "for this multi-index of columns"
                )
1495
1496        # Clean the column names (if we have an index_col).
1497        if len(ic):
1498            col_names = [
1499                r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols) else None
1500                for r in header
1501            ]
1502        else:
1503            col_names = [None] * len(header)
1504
1505        passed_names = True
1506
1507        return names, index_names, col_names, passed_names
1508
1509    def _maybe_dedup_names(self, names):
1510        # see gh-7160 and gh-9424: this helps to provide
1511        # immediate alleviation of the duplicate names
1512        # issue and appears to be satisfactory to users,
1513        # but ultimately, not needing to butcher the names
1514        # would be nice!
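        # Illustrative (assuming mangle_dupe_cols=True):
        #   ["a", "a", "b", "a"] -> ["a", "a.1", "b", "a.2"]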
1515        if self.mangle_dupe_cols:
1516            names = list(names)  # so we can index
1517            # pandas\io\parsers.py:1559: error: Need type annotation for
1518            # 'counts'  [var-annotated]
1519            counts = defaultdict(int)  # type: ignore[var-annotated]
1520            is_potential_mi = _is_potential_multi_index(names, self.index_col)
1521
1522            for i, col in enumerate(names):
1523                cur_count = counts[col]
1524
1525                while cur_count > 0:
1526                    counts[col] = cur_count + 1
1527
1528                    if is_potential_mi:
1529                        col = col[:-1] + (f"{col[-1]}.{cur_count}",)
1530                    else:
1531                        col = f"{col}.{cur_count}"
1532                    cur_count = counts[col]
1533
1534                names[i] = col
1535                counts[col] = cur_count + 1
1536
1537        return names
1538
1539    def _maybe_make_multi_index_columns(self, columns, col_names=None):
1540        # possibly create a column mi here
1541        if _is_potential_multi_index(columns):
1542            columns = MultiIndex.from_tuples(columns, names=col_names)
1543        return columns
1544
1545    def _make_index(self, data, alldata, columns, indexnamerow=False):
1546        if not _is_index_col(self.index_col) or not self.index_col:
1547            index = None
1548
1549        elif not self._has_complex_date_col:
1550            index = self._get_simple_index(alldata, columns)
1551            index = self._agg_index(index)
1552        elif self._has_complex_date_col:
1553            if not self._name_processed:
1554                (self.index_names, _, self.index_col) = _clean_index_names(
1555                    list(columns), self.index_col, self.unnamed_cols
1556                )
1557                self._name_processed = True
1558            index = self._get_complex_date_index(data, columns)
1559            index = self._agg_index(index, try_parse_dates=False)
1560
1561        # add names for the index
1562        if indexnamerow:
1563            coffset = len(indexnamerow) - len(columns)
1564            # pandas\io\parsers.py:1604: error: Item "None" of "Optional[Any]"
1565            # has no attribute "set_names"  [union-attr]
1566            index = index.set_names(indexnamerow[:coffset])  # type: ignore[union-attr]
1567
1568        # maybe create a mi on the columns
1569        columns = self._maybe_make_multi_index_columns(columns, self.col_names)
1570
1571        return index, columns
1572
1573    _implicit_index = False
1574
1575    def _get_simple_index(self, data, columns):
1576        def ix(col):
1577            if not isinstance(col, str):
1578                return col
1579            raise ValueError(f"Index {col} invalid")
1580
1581        to_remove = []
1582        index = []
1583        for idx in self.index_col:
1584            i = ix(idx)
1585            to_remove.append(i)
1586            index.append(data[i])
1587
1588        # remove index items from content and columns, don't pop in
1589        # loop
1590        for i in sorted(to_remove, reverse=True):
1591            data.pop(i)
1592            if not self._implicit_index:
1593                columns.pop(i)
1594
1595        return index
1596
1597    def _get_complex_date_index(self, data, col_names):
1598        def _get_name(icol):
1599            if isinstance(icol, str):
1600                return icol
1601
1602            if col_names is None:
1603                raise ValueError(f"Must supply column order to use {icol!s} as index")
1604
1605            for i, c in enumerate(col_names):
1606                if i == icol:
1607                    return c
1608
1609        to_remove = []
1610        index = []
1611        for idx in self.index_col:
1612            name = _get_name(idx)
1613            to_remove.append(name)
1614            index.append(data[name])
1615
1616        # remove index items from content and columns, don't pop in
1617        # loop
1618        for c in sorted(to_remove, reverse=True):
1619            data.pop(c)
1620            col_names.remove(c)
1621
1622        return index
1623
1624    def _agg_index(self, index, try_parse_dates=True) -> Index:
1625        arrays = []
1626
1627        for i, arr in enumerate(index):
1628
1629            if try_parse_dates and self._should_parse_dates(i):
1630                arr = self._date_conv(arr)
1631
1632            if self.na_filter:
1633                col_na_values = self.na_values
1634                col_na_fvalues = self.na_fvalues
1635            else:
1636                col_na_values = set()
1637                col_na_fvalues = set()
1638
1639            if isinstance(self.na_values, dict):
1640                # pandas\io\parsers.py:1678: error: Value of type
1641                # "Optional[Any]" is not indexable  [index]
1642                col_name = self.index_names[i]  # type: ignore[index]
1643                if col_name is not None:
1644                    col_na_values, col_na_fvalues = _get_na_values(
1645                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
1646                    )
1647
1648            arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
1649            arrays.append(arr)
1650
1651        names = self.index_names
1652        index = ensure_index_from_sequences(arrays, names)
1653
1654        return index
1655
1656    def _convert_to_ndarrays(
1657        self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
1658    ):
1659        result = {}
1660        for c, values in dct.items():
1661            conv_f = None if converters is None else converters.get(c, None)
1662            if isinstance(dtypes, dict):
1663                cast_type = dtypes.get(c, None)
1664            else:
1665                # single dtype or None
1666                cast_type = dtypes
1667
1668            if self.na_filter:
1669                col_na_values, col_na_fvalues = _get_na_values(
1670                    c, na_values, na_fvalues, self.keep_default_na
1671                )
1672            else:
1673                col_na_values, col_na_fvalues = set(), set()
1674
1675            if conv_f is not None:
1676                # conv_f applied to data before inference
1677                if cast_type is not None:
1678                    warnings.warn(
1679                        (
1680                            "Both a converter and dtype were specified "
1681                            f"for column {c} - only the converter will be used"
1682                        ),
1683                        ParserWarning,
1684                        stacklevel=7,
1685                    )
1686
1687                try:
1688                    values = lib.map_infer(values, conv_f)
1689                except ValueError:
1690                    mask = algorithms.isin(values, list(na_values)).view(np.uint8)
1691                    values = lib.map_infer_mask(values, conv_f, mask)
1692
1693                cvals, na_count = self._infer_types(
1694                    values, set(col_na_values) | col_na_fvalues, try_num_bool=False
1695                )
1696            else:
1697                is_ea = is_extension_array_dtype(cast_type)
1698                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
1699                # skip inference if specified dtype is object
1700                # or casting to an EA
1701                try_num_bool = not (cast_type and is_str_or_ea_dtype)
1702
1703                # general type inference and conversion
1704                cvals, na_count = self._infer_types(
1705                    values, set(col_na_values) | col_na_fvalues, try_num_bool
1706                )
1707
1708                # type specified in dtype param or cast_type is an EA
1709                if cast_type and (
1710                    not is_dtype_equal(cvals, cast_type)
1711                    or is_extension_array_dtype(cast_type)
1712                ):
1713                    if not is_ea and na_count > 0:
1714                        try:
1715                            if is_bool_dtype(cast_type):
1716                                raise ValueError(
1717                                    f"Bool column has NA values in column {c}"
1718                                )
1719                        except (AttributeError, TypeError):
1720                            # invalid input to is_bool_dtype
1721                            pass
1722                    cvals = self._cast_types(cvals, cast_type, c)
1723
1724            result[c] = cvals
1725            if verbose and na_count:
1726                print(f"Filled {na_count} NA values in column {c!s}")
1727        return result
1728
1729    def _infer_types(self, values, na_values, try_num_bool=True):
1730        """
1731        Infer types of values, possibly casting
1732
1733        Parameters
1734        ----------
1735        values : ndarray
1736        na_values : set
        try_num_bool : bool, default True
           Try to cast values to numeric (first preference) or boolean.
1739
1740        Returns
1741        -------
1742        converted : ndarray
1743        na_count : int
1744        """
1745        na_count = 0
1746        if issubclass(values.dtype.type, (np.number, np.bool_)):
1747            mask = algorithms.isin(values, list(na_values))
1748            na_count = mask.sum()
1749            if na_count > 0:
1750                if is_integer_dtype(values):
1751                    values = values.astype(np.float64)
1752                np.putmask(values, mask, np.nan)
1753            return values, na_count
1754
1755        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g. DatetimeIndex here
1757            try:
1758                result = lib.maybe_convert_numeric(values, na_values, False)
1759            except (ValueError, TypeError):
1760                # e.g. encountering datetime string gets ValueError
1761                #  TypeError can be raised in floatify
1762                result = values
1763                na_count = parsers.sanitize_objects(result, na_values, False)
1764            else:
1765                na_count = isna(result).sum()
1766        else:
1767            result = values
1768            if values.dtype == np.object_:
1769                na_count = parsers.sanitize_objects(values, na_values, False)
1770
1771        if result.dtype == np.object_ and try_num_bool:
1772            result = libops.maybe_convert_bool(
1773                np.asarray(values),
1774                true_values=self.true_values,
1775                false_values=self.false_values,
1776            )
1777
1778        return result, na_count
1779
1780    def _cast_types(self, values, cast_type, column):
1781        """
1782        Cast values to specified type
1783
1784        Parameters
1785        ----------
1786        values : ndarray
1787        cast_type : string or np.dtype
1788           dtype to cast values to
1789        column : string
1790            column name - used only for error reporting
1791
1792        Returns
1793        -------
1794        converted : ndarray
1795        """
1796        if is_categorical_dtype(cast_type):
1797            known_cats = (
1798                isinstance(cast_type, CategoricalDtype)
1799                and cast_type.categories is not None
1800            )
1801
1802            if not is_object_dtype(values) and not known_cats:
1803                # TODO: this is for consistency with
1804                # c-parser which parses all categories
1805                # as strings
1806                values = astype_nansafe(values, str)
1807
1808            cats = Index(values).unique().dropna()
1809            values = Categorical._from_inferred_categories(
1810                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
1811            )
1812
1813        # use the EA's implementation of casting
1814        elif is_extension_array_dtype(cast_type):
1815            # ensure cast_type is an actual dtype and not a string
1816            cast_type = pandas_dtype(cast_type)
1817            array_type = cast_type.construct_array_type()
1818            try:
1819                return array_type._from_sequence_of_strings(values, dtype=cast_type)
1820            except NotImplementedError as err:
1821                raise NotImplementedError(
1822                    f"Extension Array: {array_type} must implement "
1823                    "_from_sequence_of_strings in order to be used in parser methods"
1824                ) from err
1825
1826        else:
1827            try:
1828                values = astype_nansafe(values, cast_type, copy=True, skipna=True)
1829            except ValueError as err:
1830                raise ValueError(
1831                    f"Unable to convert column {column} to type {cast_type}"
1832                ) from err
1833        return values
1834
1835    def _do_date_conversions(self, names, data):
        # Returns (names, data), with any parse_dates columns converted.
1837
1838        if self.parse_dates is not None:
1839            data, names = _process_date_conversion(
1840                data,
1841                self._date_conv,
1842                self.parse_dates,
1843                self.index_col,
1844                self.index_names,
1845                names,
1846                keep_date_col=self.keep_date_col,
1847            )
1848
1849        return names, data
1850
1851
1852class CParserWrapper(ParserBase):
1853    def __init__(self, src: FilePathOrBuffer, **kwds):
1854        self.kwds = kwds
1855        kwds = kwds.copy()
1856
1857        ParserBase.__init__(self, kwds)
1858
1859        # #2442
1860        kwds["allow_leading_cols"] = self.index_col is not False
1861
1862        # GH20529, validate usecol arg before TextReader
1863        self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
1864        kwds["usecols"] = self.usecols
1865
1866        # open handles
1867        self._open_handles(src, kwds)
1868        assert self.handles is not None
1869        for key in ("storage_options", "encoding", "memory_map", "compression"):
1870            kwds.pop(key, None)
1871
1872        try:
1873            self._reader = parsers.TextReader(self.handles.handle, **kwds)
1874        except Exception:
1875            self.handles.close()
1876            raise
1877        self.unnamed_cols = self._reader.unnamed_cols
1878
1879        passed_names = self.names is None
1880
1881        if self._reader.header is None:
1882            self.names = None
1883        else:
1884            if len(self._reader.header) > 1:
1885                # we have a multi index in the columns
1886                (
1887                    self.names,
1888                    self.index_names,
1889                    self.col_names,
1890                    passed_names,
1891                ) = self._extract_multi_indexer_columns(
1892                    self._reader.header, self.index_names, self.col_names, passed_names
1893                )
1894            else:
1895                self.names = list(self._reader.header[0])
1896
1897        if self.names is None:
1898            if self.prefix:
1899                self.names = [
1900                    f"{self.prefix}{i}" for i in range(self._reader.table_width)
1901                ]
1902            else:
1903                self.names = list(range(self._reader.table_width))
1904
1905        # gh-9755
1906        #
1907        # need to set orig_names here first
1908        # so that proper indexing can be done
1909        # with _set_noconvert_columns
1910        #
1911        # once names has been filtered, we will
1912        # then set orig_names again to names
1913        self.orig_names = self.names[:]
1914
1915        if self.usecols:
1916            usecols = _evaluate_usecols(self.usecols, self.orig_names)
1917
1918            # GH 14671
1919            # assert for mypy, orig_names is List or None, None would error in issubset
1920            assert self.orig_names is not None
1921            if self.usecols_dtype == "string" and not set(usecols).issubset(
1922                self.orig_names
1923            ):
1924                _validate_usecols_names(usecols, self.orig_names)
1925
1926            if len(self.names) > len(usecols):
1927                self.names = [
1928                    n
1929                    for i, n in enumerate(self.names)
1930                    if (i in usecols or n in usecols)
1931                ]
1932
1933            if len(self.names) < len(usecols):
1934                _validate_usecols_names(usecols, self.names)
1935
1936        self._validate_parse_dates_presence(self.names)
1937        self._set_noconvert_columns()
1938
1939        self.orig_names = self.names
1940
1941        if not self._has_complex_date_col:
1942            if self._reader.leading_cols == 0 and _is_index_col(self.index_col):
1943
1944                self._name_processed = True
1945                (index_names, self.names, self.index_col) = _clean_index_names(
1946                    self.names, self.index_col, self.unnamed_cols
1947                )
1948
1949                if self.index_names is None:
1950                    self.index_names = index_names
1951
1952            if self._reader.header is None and not passed_names:
1953                # pandas\io\parsers.py:1997: error: Argument 1 to "len" has
1954                # incompatible type "Optional[Any]"; expected "Sized"
1955                # [arg-type]
1956                self.index_names = [None] * len(
1957                    self.index_names  # type: ignore[arg-type]
1958                )
1959
1960        self._implicit_index = self._reader.leading_cols > 0
1961
1962    def close(self) -> None:
1963        super().close()
1964
1965        # close additional handles opened by C parser
1966        try:
1967            self._reader.close()
1968        except ValueError:
1969            pass
1970
1971    def _set_noconvert_columns(self):
1972        """
1973        Set the columns that should not undergo dtype conversions.
1974
1975        Currently, any column that is involved with date parsing will not
1976        undergo such conversions.
1977        """
1978        names = self.orig_names
1979        if self.usecols_dtype == "integer":
1980            # A set of integers will be converted to a list in
1981            # the correct order every single time.
1982            usecols = list(self.usecols)
1983            usecols.sort()
1984        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
1985            # The names attribute should have the correct columns
1986            # in the proper order for indexing with parse_dates.
1987            usecols = self.names[:]
1988        else:
1989            # Usecols is empty.
1990
1991            # pandas\io\parsers.py:2030: error: Incompatible types in
1992            # assignment (expression has type "None", variable has type
1993            # "List[Any]")  [assignment]
1994            usecols = None  # type: ignore[assignment]
1995
1996        def _set(x):
1997            if usecols is not None and is_integer(x):
1998                x = usecols[x]
1999
2000            if not is_integer(x):
2001                # assert for mypy, names is List or None, None would error when calling
2002                # .index()
2003                assert names is not None
2004                x = names.index(x)
2005
2006            self._reader.set_noconvert(x)
2007
2008        if isinstance(self.parse_dates, list):
2009            for val in self.parse_dates:
2010                if isinstance(val, list):
2011                    for k in val:
2012                        _set(k)
2013                else:
2014                    _set(val)
2015
2016        elif isinstance(self.parse_dates, dict):
2017            for val in self.parse_dates.values():
2018                if isinstance(val, list):
2019                    for k in val:
2020                        _set(k)
2021                else:
2022                    _set(val)
2023
2024        elif self.parse_dates:
2025            if isinstance(self.index_col, list):
2026                for k in self.index_col:
2027                    _set(k)
2028            elif self.index_col is not None:
2029                _set(self.index_col)
2030
2031    def set_error_bad_lines(self, status):
2032        self._reader.set_error_bad_lines(int(status))
2033
2034    def read(self, nrows=None):
2035        try:
2036            data = self._reader.read(nrows)
2037        except StopIteration:
2038            if self._first_chunk:
2039                self._first_chunk = False
2040                names = self._maybe_dedup_names(self.orig_names)
2041                index, columns, col_dict = _get_empty_meta(
2042                    names,
2043                    self.index_col,
2044                    self.index_names,
2045                    dtype=self.kwds.get("dtype"),
2046                )
2047                columns = self._maybe_make_multi_index_columns(columns, self.col_names)
2048
2049                if self.usecols is not None:
2050                    columns = self._filter_usecols(columns)
2051
2052                col_dict = {k: v for k, v in col_dict.items() if k in columns}
2053
2054                return index, columns, col_dict
2055
2056            else:
2057                self.close()
2058                raise
2059
2060        # Done with first read, next time raise StopIteration
2061        self._first_chunk = False
2062
2063        names = self.names
2064
2065        if self._reader.leading_cols:
2066            if self._has_complex_date_col:
2067                raise NotImplementedError("file structure not yet supported")
2068
2069            # implicit index, no index names
2070            arrays = []
2071
2072            for i in range(self._reader.leading_cols):
2073                if self.index_col is None:
2074                    values = data.pop(i)
2075                else:
2076                    values = data.pop(self.index_col[i])
2077
2078                values = self._maybe_parse_dates(values, i, try_parse_dates=True)
2079                arrays.append(values)
2080
2081            index = ensure_index_from_sequences(arrays)
2082
2083            if self.usecols is not None:
2084                names = self._filter_usecols(names)
2085
2086            names = self._maybe_dedup_names(names)
2087
2088            # rename dict keys
2089            data = sorted(data.items())
2090            data = {k: v for k, (i, v) in zip(names, data)}
2091
2092            names, data = self._do_date_conversions(names, data)
2093
2094        else:
2095            # rename dict keys
2096            data = sorted(data.items())
2097
2098            # ugh, mutation
2099
2100            # assert for mypy, orig_names is List or None, None would error in list(...)
2101            assert self.orig_names is not None
2102            names = list(self.orig_names)
2103            names = self._maybe_dedup_names(names)
2104
2105            if self.usecols is not None:
2106                names = self._filter_usecols(names)
2107
2108            # columns as list
2109            alldata = [x[1] for x in data]
2110
2111            data = {k: v for k, (i, v) in zip(names, data)}
2112
2113            names, data = self._do_date_conversions(names, data)
2114            index, names = self._make_index(data, alldata, names)
2115
2116        # maybe create a mi on the columns
2117        names = self._maybe_make_multi_index_columns(names, self.col_names)
2118
2119        return index, names, data
2120
2121    def _filter_usecols(self, names):
2122        # hackish
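        # Illustrative: names=["a", "b", "c"] with usecols={"a", "c"}
        # -> ["a", "c"]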
2123        usecols = _evaluate_usecols(self.usecols, names)
2124        if usecols is not None and len(names) != len(usecols):
2125            names = [
2126                name for i, name in enumerate(names) if i in usecols or name in usecols
2127            ]
2128        return names
2129
2130    def _get_index_names(self):
2131        names = list(self._reader.header[0])
2132        idx_names = None
2133
2134        if self._reader.leading_cols == 0 and self.index_col is not None:
2135            (idx_names, names, self.index_col) = _clean_index_names(
2136                names, self.index_col, self.unnamed_cols
2137            )
2138
2139        return names, idx_names
2140
2141    def _maybe_parse_dates(self, values, index, try_parse_dates=True):
2142        if try_parse_dates and self._should_parse_dates(index):
2143            values = self._date_conv(values)
2144        return values
2145
2146
2147def TextParser(*args, **kwds):
2148    """
2149    Converts lists of lists/tuples into DataFrames with proper type inference
2150    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files.
2152
2153    Parameters
2154    ----------
2155    data : file-like object or list
    delimiter : str, optional
        Separator character to use.
2157    dialect : str or csv.Dialect instance, optional
2158        Ignored if delimiter is longer than 1 character
    names : sequence, default None
2160    header : int, default 0
2161        Row to use to parse column labels. Defaults to the first row. Prior
2162        rows will be discarded
2163    index_col : int or list, optional
2164        Column or columns to use as the (possibly hierarchical) index
    has_index_names : bool, default False
2166        True if the cols defined in index_col have an index name and are
2167        not in the header.
2168    na_values : scalar, str, list-like, or dict, optional
2169        Additional strings to recognize as NA/NaN.
2170    keep_default_na : bool, default True
2171    thousands : str, optional
2172        Thousands separator
2173    comment : str, optional
2174        Comment out remainder of line
2175    parse_dates : bool, default False
2176    keep_date_col : bool, default False
2177    date_parser : function, optional
2178    skiprows : list of integers
2179        Row numbers to skip
2180    skipfooter : int
        Number of lines at bottom of file to skip.
2182    converters : dict, optional
2183        Dict of functions for converting values in certain columns. Keys can
2184        either be integers or column labels, values are functions that take one
2185        input argument, the cell (not column) content, and return the
2186        transformed content.
2187    encoding : str, optional
        Encoding to use when reading/writing (e.g. 'utf-8').
2189    squeeze : bool, default False
        Return a Series if the parsed data contains only one column.
    infer_datetime_format : bool, default False
2192        If True and `parse_dates` is True for a column, try to infer the
2193        datetime format based on the first datetime string. If the format
2194        can be inferred, there often will be a large parsing speed-up.
2195    float_precision : str, optional
2196        Specifies which converter the C engine should use for floating-point
2197        values. The options are `None` or `high` for the ordinary converter,
2198        `legacy` for the original lower precision pandas converter, and
2199        `round_trip` for the round-trip converter.
2200
2201        .. versionchanged:: 1.2
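
    Examples
    --------
    A minimal, illustrative sketch (inputs and shapes are assumed):

    >>> reader = TextParser([["a", "b"], ["1", "2"]], header=0)
    >>> df = reader.read()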
2202    """
2203    kwds["engine"] = "python"
2204    return TextFileReader(*args, **kwds)
2205
2206
2207def count_empty_vals(vals) -> int:
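    # e.g. (illustrative): count_empty_vals(["a", "", None, "b"]) -> 2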
2208    return sum(1 for v in vals if v == "" or v is None)
2209
2210
2211class PythonParser(ParserBase):
2212    def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
2213        """
        Workhorse class for processing nested lists into DataFrames.
2215        """
2216        ParserBase.__init__(self, kwds)
2217
2218        self.data: Optional[Iterator[str]] = None
2219        self.buf: List = []
2220        self.pos = 0
2221        self.line_pos = 0
2222
2223        self.skiprows = kwds["skiprows"]
2224
2225        if callable(self.skiprows):
2226            self.skipfunc = self.skiprows
2227        else:
2228            self.skipfunc = lambda x: x in self.skiprows
2229
2230        self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
2231        self.delimiter = kwds["delimiter"]
2232
2233        self.quotechar = kwds["quotechar"]
2234        if isinstance(self.quotechar, str):
2235            self.quotechar = str(self.quotechar)
2236
2237        self.escapechar = kwds["escapechar"]
2238        self.doublequote = kwds["doublequote"]
2239        self.skipinitialspace = kwds["skipinitialspace"]
2240        self.lineterminator = kwds["lineterminator"]
2241        self.quoting = kwds["quoting"]
2242        self.usecols, _ = _validate_usecols_arg(kwds["usecols"])
2243        self.skip_blank_lines = kwds["skip_blank_lines"]
2244
2245        self.warn_bad_lines = kwds["warn_bad_lines"]
2246        self.error_bad_lines = kwds["error_bad_lines"]
2247
2248        self.names_passed = kwds["names"] or None
2249
2250        self.has_index_names = False
2251        if "has_index_names" in kwds:
2252            self.has_index_names = kwds["has_index_names"]
2253
2254        self.verbose = kwds["verbose"]
2255        self.converters = kwds["converters"]
2256
2257        self.dtype = kwds["dtype"]
2258        self.thousands = kwds["thousands"]
2259        self.decimal = kwds["decimal"]
2260
2261        self.comment = kwds["comment"]
2262
2263        # Set self.data to something that can read lines.
2264        if isinstance(f, list):
2265            # read_excel: f is a list
2266            self.data = cast(Iterator[str], f)
2267        else:
2268            self._open_handles(f, kwds)
2269            assert self.handles is not None
2270            assert hasattr(self.handles.handle, "readline")
2271            try:
2272                self._make_reader(self.handles.handle)
2273            except (csv.Error, UnicodeDecodeError):
2274                self.close()
2275                raise
2276
2277        # Get columns in two steps: infer from data, then
2278        # infer column indices from self.usecols if it is specified.
2279        self._col_indices = None
2280        try:
2281            (
2282                self.columns,
2283                self.num_original_columns,
2284                self.unnamed_cols,
2285            ) = self._infer_columns()
2286        except (TypeError, ValueError):
2287            self.close()
2288            raise
2289
2290        # Now self.columns has the set of columns that we will process.
2291        # The original set is stored in self.original_columns.
2292        if len(self.columns) > 1:
            # we are processing multi-index columns
2294            (
2295                self.columns,
2296                self.index_names,
2297                self.col_names,
2298                _,
2299            ) = self._extract_multi_indexer_columns(
2300                self.columns, self.index_names, self.col_names
2301            )
2302            # Update list of original names to include all indices.
2303            self.num_original_columns = len(self.columns)
2304        else:
2305            self.columns = self.columns[0]
2306
2307        # get popped off for index
2308        self.orig_names = list(self.columns)
2309
2310        # needs to be cleaned/refactored
2311        # multiple date column thing turning into a real spaghetti factory
2312
2313        if not self._has_complex_date_col:
2314            (index_names, self.orig_names, self.columns) = self._get_index_name(
2315                self.columns
2316            )
2317            self._name_processed = True
2318            if self.index_names is None:
2319                self.index_names = index_names
2320
2321        self._validate_parse_dates_presence(self.columns)
2322        if self.parse_dates:
2323            self._no_thousands_columns = self._set_no_thousands_columns()
2324        else:
2325            self._no_thousands_columns = None
2326
2327        if len(self.decimal) != 1:
2328            raise ValueError("Only length-1 decimal markers supported")
2329
2330        if self.thousands is None:
2331            self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+")
2332        else:
2333            self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+")
2334
2335    def _set_no_thousands_columns(self):
        # Create a set of column ids that are not to be stripped of thousands
        # separators.
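        # Illustrative: with parse_dates=["date"] and thousands=",", the
        # "date" column keeps its commas so date parsing sees the raw text.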
2338        noconvert_columns = set()
2339
2340        def _set(x):
2341            if is_integer(x):
2342                noconvert_columns.add(x)
2343            else:
2344                noconvert_columns.add(self.columns.index(x))
2345
2346        if isinstance(self.parse_dates, list):
2347            for val in self.parse_dates:
2348                if isinstance(val, list):
2349                    for k in val:
2350                        _set(k)
2351                else:
2352                    _set(val)
2353
2354        elif isinstance(self.parse_dates, dict):
2355            for val in self.parse_dates.values():
2356                if isinstance(val, list):
2357                    for k in val:
2358                        _set(k)
2359                else:
2360                    _set(val)
2361
2362        elif self.parse_dates:
2363            if isinstance(self.index_col, list):
2364                for k in self.index_col:
2365                    _set(k)
2366            elif self.index_col is not None:
2367                _set(self.index_col)
2368
2369        return noconvert_columns
2370
2371    def _make_reader(self, f):
2372        sep = self.delimiter
2373
2374        if sep is None or len(sep) == 1:
2375            if self.lineterminator:
2376                raise ValueError(
2377                    "Custom line terminators not supported in python parser (yet)"
2378                )
2379
2380            class MyDialect(csv.Dialect):
2381                delimiter = self.delimiter
2382                quotechar = self.quotechar
2383                escapechar = self.escapechar
2384                doublequote = self.doublequote
2385                skipinitialspace = self.skipinitialspace
2386                quoting = self.quoting
2387                lineterminator = "\n"
2388
2389            dia = MyDialect
2390
2391            if sep is not None:
2392                dia.delimiter = sep
2393            else:
2394                # attempt to sniff the delimiter from the first valid line,
2395                # i.e. no comment line and not in skiprows
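                # e.g. (illustrative): a first valid line like "a;b;c" leads
                # csv.Sniffer().sniff(...) to report ';' as the delimiter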
2396                line = f.readline()
2397                lines = self._check_comments([[line]])[0]
2398                while self.skipfunc(self.pos) or not lines:
2399                    self.pos += 1
2400                    line = f.readline()
2401                    lines = self._check_comments([[line]])[0]
2402
2403                # since `line` was a string, lines will be a list containing
2404                # only a single string
2405                line = lines[0]
2406
2407                self.pos += 1
2408                self.line_pos += 1
2409                sniffed = csv.Sniffer().sniff(line)
2410                dia.delimiter = sniffed.delimiter
2411
2412                # Note: encoding is irrelevant here
2413                line_rdr = csv.reader(StringIO(line), dialect=dia)
2414                self.buf.extend(list(line_rdr))
2415
2416            # Note: encoding is irrelevant here
2417            reader = csv.reader(f, dialect=dia, strict=True)
2418
2419        else:
2420
2421            def _read():
2422                line = f.readline()
2423                pat = re.compile(sep)
2424
2425                yield pat.split(line.strip())
2426
2427                for line in f:
2428                    yield pat.split(line.strip())
2429
2430            reader = _read()
2431
2432        # pandas\io\parsers.py:2427: error: Incompatible types in assignment
2433        # (expression has type "_reader", variable has type "Union[IO[Any],
2434        # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap, None]")
2435        # [assignment]
2436        self.data = reader  # type: ignore[assignment]
2437
2438    def read(self, rows=None):
2439        try:
2440            content = self._get_lines(rows)
2441        except StopIteration:
2442            if self._first_chunk:
2443                content = []
2444            else:
2445                self.close()
2446                raise
2447
2448        # done with first read, next time raise StopIteration
2449        self._first_chunk = False
2450
2451        # pandas\io\parsers.py:2480: error: Argument 1 to "list" has
2452        # incompatible type "Optional[Any]"; expected "Iterable[Any]"
2453        # [arg-type]
2454        columns = list(self.orig_names)  # type: ignore[arg-type]
2455        if not len(content):  # pragma: no cover
2456            # DataFrame with the right metadata, even though it's length 0
2457            names = self._maybe_dedup_names(self.orig_names)
2458            index, columns, col_dict = _get_empty_meta(
2459                names, self.index_col, self.index_names, self.dtype
2460            )
2461            columns = self._maybe_make_multi_index_columns(columns, self.col_names)
2462            return index, columns, col_dict
2463
2464        # handle new style for names in index
2465        count_empty_content_vals = count_empty_vals(content[0])
2466        indexnamerow = None
2467        if self.has_index_names and count_empty_content_vals == len(columns):
2468            indexnamerow = content[0]
2469            content = content[1:]
2470
2471        alldata = self._rows_to_cols(content)
2472        data = self._exclude_implicit_index(alldata)
2473
2474        columns = self._maybe_dedup_names(self.columns)
2475        columns, data = self._do_date_conversions(columns, data)
2476
2477        data = self._convert_data(data)
2478        index, columns = self._make_index(data, alldata, columns, indexnamerow)
2479
2480        return index, columns, data
2481
2482    def _exclude_implicit_index(self, alldata):
2483        names = self._maybe_dedup_names(self.orig_names)
2484
2485        if self._implicit_index:
2486            excl_indices = self.index_col
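            # Illustrative: index_col=[0], names=["a", "b"],
            # alldata=[idx, col_a, col_b] -> {"a": col_a, "b": col_b}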
2487
2488            data = {}
2489            offset = 0
2490            for i, col in enumerate(names):
2491                while i + offset in excl_indices:
2492                    offset += 1
2493                data[col] = alldata[i + offset]
2494        else:
2495            data = {k: v for k, v in zip(names, alldata)}
2496
2497        return data
2498
2499    # legacy
2500    def get_chunk(self, size=None):
2501        if size is None:
2502            # pandas\io\parsers.py:2528: error: "PythonParser" has no attribute
2503            # "chunksize"  [attr-defined]
2504            size = self.chunksize  # type: ignore[attr-defined]
2505        return self.read(rows=size)
2506
2507    def _convert_data(self, data):
2508        # apply converters
2509        def _clean_mapping(mapping):
2510            """converts col numbers to names"""
2511            clean = {}
2512            for col, v in mapping.items():
2513                # pandas\io\parsers.py:2537: error: Unsupported right operand
2514                # type for in ("Optional[Any]")  [operator]
2515                if (
2516                    isinstance(col, int)
2517                    and col not in self.orig_names  # type: ignore[operator]
2518                ):
2519                    # pandas\io\parsers.py:2538: error: Value of type
2520                    # "Optional[Any]" is not indexable  [index]
2521                    col = self.orig_names[col]  # type: ignore[index]
2522                clean[col] = v
2523            return clean
2524
2525        clean_conv = _clean_mapping(self.converters)
2526        if not isinstance(self.dtype, dict):
2527            # handles single dtype applied to all columns
2528            clean_dtypes = self.dtype
2529        else:
2530            clean_dtypes = _clean_mapping(self.dtype)
2531
2532        # Apply NA values.
2533        clean_na_values = {}
2534        clean_na_fvalues = {}
2535
2536        if isinstance(self.na_values, dict):
2537            for col in self.na_values:
2538                na_value = self.na_values[col]
2539                na_fvalue = self.na_fvalues[col]
2540
2541                # pandas\io\parsers.py:2558: error: Unsupported right operand
2542                # type for in ("Optional[Any]")  [operator]
2543                if (
2544                    isinstance(col, int)
2545                    and col not in self.orig_names  # type: ignore[operator]
2546                ):
2547                    # pandas\io\parsers.py:2559: error: Value of type
2548                    # "Optional[Any]" is not indexable  [index]
2549                    col = self.orig_names[col]  # type: ignore[index]
2550
2551                clean_na_values[col] = na_value
2552                clean_na_fvalues[col] = na_fvalue
2553        else:
2554            clean_na_values = self.na_values
2555            clean_na_fvalues = self.na_fvalues
2556
2557        return self._convert_to_ndarrays(
2558            data,
2559            clean_na_values,
2560            clean_na_fvalues,
2561            self.verbose,
2562            clean_conv,
2563            clean_dtypes,
2564        )
2565
2566    def _infer_columns(self):
2567        names = self.names
2568        num_original_columns = 0
2569        clear_buffer = True
2570        # pandas\io\parsers.py:2580: error: Need type annotation for
2571        # 'unnamed_cols' (hint: "unnamed_cols: Set[<type>] = ...")
2572        # [var-annotated]
2573        unnamed_cols = set()  # type: ignore[var-annotated]
2574
2575        if self.header is not None:
2576            header = self.header
2577
2578            if isinstance(header, (list, tuple, np.ndarray)):
2579                have_mi_columns = len(header) > 1
                # we have multi-index columns, so read an extra line
2581                if have_mi_columns:
2582                    header = list(header) + [header[-1] + 1]
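                    # e.g. (illustrative) header=[0, 1] becomes [0, 1, 2]: row
                    # 2 is read in case it holds index name(s) rather than data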
2583            else:
2584                have_mi_columns = False
2585                header = [header]
2586
2587            # pandas\io\parsers.py:2594: error: Need type annotation for
2588            # 'columns' (hint: "columns: List[<type>] = ...")  [var-annotated]
2589            columns = []  # type: ignore[var-annotated]
2590            for level, hr in enumerate(header):
2591                try:
2592                    line = self._buffered_line()
2593
2594                    while self.line_pos <= hr:
2595                        line = self._next_line()
2596
2597                except StopIteration as err:
2598                    if self.line_pos < hr:
2599                        raise ValueError(
2600                            f"Passed header={hr} but only {self.line_pos + 1} lines in "
2601                            "file"
2602                        ) from err
2603
2604                    # We have an empty file, so check
2605                    # if columns are provided. That will
2606                    # serve as the 'line' for parsing
2607                    if have_mi_columns and hr > 0:
2608                        if clear_buffer:
2609                            self._clear_buffer()
2610                        columns.append([None] * len(columns[-1]))
2611                        return columns, num_original_columns, unnamed_cols
2612
2613                    if not self.names:
2614                        raise EmptyDataError("No columns to parse from file") from err
2615
2616                    line = self.names[:]
2617
2618                this_columns = []
2619                this_unnamed_cols = []
2620
2621                for i, c in enumerate(line):
2622                    if c == "":
2623                        if have_mi_columns:
2624                            col_name = f"Unnamed: {i}_level_{level}"
2625                        else:
2626                            col_name = f"Unnamed: {i}"
2627
2628                        this_unnamed_cols.append(i)
2629                        this_columns.append(col_name)
2630                    else:
2631                        this_columns.append(c)
2632
2633                if not have_mi_columns and self.mangle_dupe_cols:
2634                    # pandas\io\parsers.py:2639: error: Need type annotation
2635                    # for 'counts'  [var-annotated]
2636                    counts = defaultdict(int)  # type: ignore[var-annotated]
2637
2638                    for i, col in enumerate(this_columns):
2639                        cur_count = counts[col]
2640
2641                        while cur_count > 0:
2642                            counts[col] = cur_count + 1
2643                            col = f"{col}.{cur_count}"
2644                            cur_count = counts[col]
2645
2646                        this_columns[i] = col
2647                        counts[col] = cur_count + 1
2648                elif have_mi_columns:
2649
                    # if we have grabbed an extra line, but it's not in our
                    # format, save it in the buffer and create a blank extra
                    # line for the rest of the parsing code
2653                    if hr == header[-1]:
2654                        lc = len(this_columns)
2655                        ic = len(self.index_col) if self.index_col is not None else 0
2656                        unnamed_count = len(this_unnamed_cols)
2657
2658                        if lc != unnamed_count and lc - ic > unnamed_count:
2659                            clear_buffer = False
2660                            # pandas\io\parsers.py:2663: error: List item 0 has
2661                            # incompatible type "None"; expected "str"
2662                            # [list-item]
2663                            this_columns = [None] * lc  # type: ignore[list-item]
2664                            self.buf = [self.buf[-1]]
2665
2666                # pandas\io\parsers.py:2666: error: Argument 1 to "append" of
2667                # "list" has incompatible type "List[str]"; expected
2668                # "List[None]"  [arg-type]
2669                columns.append(this_columns)  # type: ignore[arg-type]
2670                unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})
2671
2672                if len(columns) == 1:
2673                    num_original_columns = len(this_columns)
2674
2675            if clear_buffer:
2676                self._clear_buffer()
2677
2678            if names is not None:
2679                if (self.usecols is not None and len(names) != len(self.usecols)) or (
2680                    self.usecols is None and len(names) != len(columns[0])
2681                ):
2682                    raise ValueError(
2683                        "Number of passed names did not match "
2684                        "number of header fields in the file"
2685                    )
2686                if len(columns) > 1:
2687                    raise TypeError("Cannot pass names with multi-index columns")
2688
2689                if self.usecols is not None:
                    # Set _col_indices. We don't store columns because they
                    # are overwritten.
2692                    self._handle_usecols(columns, names)
2693                else:
2694                    self._col_indices = None
2695                    num_original_columns = len(names)
2696                columns = [names]
2697            else:
2698                columns = self._handle_usecols(columns, columns[0])
2699        else:
2700            try:
2701                line = self._buffered_line()
2702
2703            except StopIteration as err:
2704                if not names:
2705                    raise EmptyDataError("No columns to parse from file") from err
2706
2707                line = names[:]
2708
2709            ncols = len(line)
2710            num_original_columns = ncols
2711
2712            if not names:
2713                if self.prefix:
2714                    # pandas\io\parsers.py:2711: error: List comprehension has
2715                    # incompatible type List[str]; expected List[None]  [misc]
2716                    columns = [
2717                        [
2718                            f"{self.prefix}{i}"  # type: ignore[misc]
2719                            for i in range(ncols)
2720                        ]
2721                    ]
2722                else:
2723                    # pandas\io\parsers.py:2713: error: Argument 1 to "list"
2724                    # has incompatible type "range"; expected "Iterable[None]"
2725                    # [arg-type]
2726                    columns = [list(range(ncols))]  # type: ignore[arg-type]
2727                columns = self._handle_usecols(columns, columns[0])
2728            else:
2729                if self.usecols is None or len(names) >= num_original_columns:
2730                    columns = self._handle_usecols([names], names)
2731                    num_original_columns = len(names)
2732                else:
2733                    if not callable(self.usecols) and len(names) != len(self.usecols):
2734                        raise ValueError(
2735                            "Number of passed names did not match number of "
2736                            "header fields in the file"
2737                        )
2738                    # Ignore output but set used columns.
2739                    self._handle_usecols([names], names)
2740                    columns = [names]
2741                    num_original_columns = ncols
2742
2743        return columns, num_original_columns, unnamed_cols
2744
2745    def _handle_usecols(self, columns, usecols_key):
2746        """
2747        Sets self._col_indices
2748
2749        usecols_key is used if there are string usecols.
2750        """
2751        if self.usecols is not None:
2752            if callable(self.usecols):
2753                col_indices = _evaluate_usecols(self.usecols, usecols_key)
2754            elif any(isinstance(u, str) for u in self.usecols):
2755                if len(columns) > 1:
2756                    raise ValueError(
2757                        "If using multiple headers, usecols must be integers."
2758                    )
2759                col_indices = []
2760
2761                for col in self.usecols:
2762                    if isinstance(col, str):
2763                        try:
2764                            col_indices.append(usecols_key.index(col))
2765                        except ValueError:
2766                            _validate_usecols_names(self.usecols, usecols_key)
2767                    else:
2768                        col_indices.append(col)
2769            else:
2770                col_indices = self.usecols
2771
2772            columns = [
2773                [n for i, n in enumerate(column) if i in col_indices]
2774                for column in columns
2775            ]
2776            self._col_indices = col_indices
2777        return columns
2778
2779    def _buffered_line(self):
2780        """
2781        Return a line from buffer, filling buffer if required.
2782        """
2783        if len(self.buf) > 0:
2784            return self.buf[0]
2785        else:
2786            return self._next_line()
2787
2788    def _check_for_bom(self, first_row):
2789        """
2790        Checks whether the file begins with the BOM character.
2791        If it does, remove it. In addition, if there is quoting
2792        in the field subsequent to the BOM, remove it as well
2793        because it technically takes place at the beginning of
2794        the name, not the middle of it.
2795        """
        # first_row will be a list, so check
        # that it is not empty before proceeding.
2798        if not first_row:
2799            return first_row
2800
2801        # The first element of this row is the one that could have the
2802        # BOM that we want to remove. Check that the first element is a
2803        # string before proceeding.
2804        if not isinstance(first_row[0], str):
2805            return first_row
2806
2807        # Check that the string is not empty, as that would
2808        # obviously not have a BOM at the start of it.
2809        if not first_row[0]:
2810            return first_row
2811
2812        # Since the string is non-empty, check that it does
2813        # in fact begin with a BOM.
2814        first_elt = first_row[0][0]
2815        if first_elt != _BOM:
2816            return first_row
2817
2818        first_row_bom = first_row[0]
2819
2820        if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
2821            start = 2
2822            quote = first_row_bom[1]
2823            end = first_row_bom[2:].index(quote) + 2
2824
2825            # Extract the data between the quotation marks
2826            new_row = first_row_bom[start:end]
2827
2828            # Extract any remaining data after the second
2829            # quotation mark.
2830            if len(first_row_bom) > end + 1:
2831                new_row += first_row_bom[end + 1 :]
2832
2833        else:
2834
2835            # No quotation so just remove BOM from first element
2836            new_row = first_row_bom[1:]
2837        return [new_row] + first_row[1:]
2838
2839    def _is_line_empty(self, line):
2840        """
2841        Check if a line is empty or not.
2842
2843        Parameters
2844        ----------
2845        line : str, array-like
2846            The line of data to check.
2847
2848        Returns
2849        -------
        bool
            Whether or not the line is empty.
        """
        return not line or all(not x for x in line)

    def _next_line(self):
        if isinstance(self.data, list):
            while self.skipfunc(self.pos):
                self.pos += 1

            while True:
                try:
                    line = self._check_comments([self.data[self.pos]])[0]
                    self.pos += 1
                    # either uncommented or blank to begin with
                    if not self.skip_blank_lines and (
                        self._is_line_empty(self.data[self.pos - 1]) or line
                    ):
                        break
                    elif self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])
                        if ret:
                            line = ret[0]
                            break
                except IndexError:
                    raise StopIteration
        else:
            while self.skipfunc(self.pos):
                self.pos += 1
                # assert for mypy, data is Iterator[str] or None, would error in next
                assert self.data is not None
                next(self.data)

            while True:
                orig_line = self._next_iter_line(row_num=self.pos + 1)
                self.pos += 1

                if orig_line is not None:
                    line = self._check_comments([orig_line])[0]

                    if self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])

                        if ret:
                            line = ret[0]
                            break
                    elif self._is_line_empty(orig_line) or line:
                        break

        # This was the first line of the file,
        # which could contain the BOM at the
        # beginning of it.
        if self.pos == 1:
            line = self._check_for_bom(line)

        self.line_pos += 1
        self.buf.append(line)
        return line

    def _alert_malformed(self, msg, row_num):
        """
        Alert a user about a malformed row.

        If `self.error_bad_lines` is True, the alert will be a `ParserError`.
        If `self.warn_bad_lines` is True, the alert will be printed out.

        Parameters
        ----------
        msg : str
            The error message to display.
        row_num : int
            The row number where the parsing error occurred.
            Because this row number is displayed, we 1-index,
            even though we 0-index internally.
        """
        if self.error_bad_lines:
            raise ParserError(msg)
        elif self.warn_bad_lines:
            base = f"Skipping line {row_num}: "
            sys.stderr.write(base + msg + "\n")

    def _next_iter_line(self, row_num):
        """
        Wrapper around iterating through `self.data` (CSV source).

        When a CSV error is raised, we check for specific
        error messages that allow us to customize the
        error message displayed to the user.

        Parameters
        ----------
        row_num : int
            The row number of the line being parsed.
        """
        try:
            # assert for mypy, data is Iterator[str] or None, would error in next
            assert self.data is not None
            return next(self.data)
        except csv.Error as e:
            if self.warn_bad_lines or self.error_bad_lines:
                msg = str(e)

                if "NULL byte" in msg or "line contains NUL" in msg:
                    msg = (
                        "NULL byte detected. This byte "
                        "cannot be processed in Python's "
                        "native csv library at the moment, "
                        "so please pass in engine='c' instead"
                    )

                if self.skipfooter > 0:
                    reason = (
                        "Error could possibly be due to "
                        "parsing errors in the skipped footer rows "
                        "(the skipfooter keyword is only applied "
                        "after Python's csv library has parsed "
                        "all rows)."
                    )
                    msg += ". " + reason

                self._alert_malformed(msg, row_num)
            return None

    def _check_comments(self, lines):
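        """
        Strip comments from the given rows: fields before the comment
        character pass through, the field containing it is truncated at
        that character, and any remaining fields are dropped. For example,
        with ``comment="#"``, the row ``["1", "2 # note", "3"]`` becomes
        ``["1", "2 "]``.
        """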
        if self.comment is None:
            return lines
        ret = []
        for line in lines:
            rl = []
            for x in line:
                if not isinstance(x, str) or self.comment not in x:
                    rl.append(x)
                else:
                    x = x[: x.find(self.comment)]
                    if len(x) > 0:
                        rl.append(x)
                    break
            ret.append(rl)
        return ret

    def _remove_empty_lines(self, lines):
        """
        Iterate through the lines and remove any that are
        either empty or contain only one whitespace value.

        Parameters
        ----------
        lines : array-like
            The array of lines that we are to filter.

        Returns
        -------
        filtered_lines : array-like
            The same array of lines with the "empty" ones removed.
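
        Examples
        --------
        A row whose only field is whitespace is dropped, while rows with
        more than one field survive even if all fields are empty
        (illustrative; ``parser`` stands for any constructed
        ``PythonParser``):

        >>> parser._remove_empty_lines([["   "], ["a", ""], []])  # doctest: +SKIP
        [['a', '']]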
        """
        ret = []
        for line in lines:
            # Remove empty lines and lines with only one whitespace value
            if len(line) > 1 or (
                len(line) == 1
                and (not isinstance(line[0], str) or line[0].strip())
            ):
                ret.append(line)
        return ret

    def _check_thousands(self, lines):
        if self.thousands is None:
            return lines

        return self._search_replace_num_columns(
            lines=lines, search=self.thousands, replace=""
        )

    def _search_replace_num_columns(self, lines, search, replace):
        ret = []
        for line in lines:
            rl = []
            for i, x in enumerate(line):
                if (
                    not isinstance(x, str)
                    or search not in x
                    or (self._no_thousands_columns and i in self._no_thousands_columns)
                    or self.nonnum.search(x.strip())
                ):
                    rl.append(x)
                else:
                    rl.append(x.replace(search, replace))
            ret.append(rl)
        return ret

    def _check_decimal(self, lines):
        if self.decimal == _parser_defaults["decimal"]:
            return lines

        return self._search_replace_num_columns(
            lines=lines, search=self.decimal, replace="."
        )

    def _clear_buffer(self):
        self.buf = []

    _implicit_index = False

    def _get_index_name(self, columns):
        """
        Try several cases to get lines:

        0) There are headers on row 0 and row 1 and their
        total summed lengths equal the length of the next line.
        Treat row 0 as columns and row 1 as indices.
        1) Look for implicit index: there are more columns
        on row 1 than row 0. If this is true, assume that row
        1 lists index columns and row 0 lists normal columns.
        2) Get index from the columns if it was listed.
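
        For example, Case 0 corresponds to data laid out like this
        (column names on the first row, index names on the second)::

            A,B
            idx
            x,1,2
            y,3,4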
        """
        orig_names = list(columns)
        columns = list(columns)

        try:
            line = self._next_line()
        except StopIteration:
            line = None

        try:
            next_line = self._next_line()
        except StopIteration:
            next_line = None

        # implicitly index_col=0 b/c 1 fewer column names
        implicit_first_cols = 0
        if line is not None:
            # leave it 0, #2442
            # Case 1
            if self.index_col is not False:
                implicit_first_cols = len(line) - self.num_original_columns

            # Case 0
            if next_line is not None:
                if len(next_line) == len(line) + self.num_original_columns:
                    # column and index names on diff rows
                    self.index_col = list(range(len(line)))
                    self.buf = self.buf[1:]

                    for c in reversed(line):
                        columns.insert(0, c)

                    # Update list of original names to include all indices.
                    orig_names = list(columns)
                    self.num_original_columns = len(columns)
                    return line, orig_names, columns

        if implicit_first_cols > 0:
            # Case 1
            self._implicit_index = True
            if self.index_col is None:
                self.index_col = list(range(implicit_first_cols))

            index_name = None

        else:
            # Case 2
            (index_name, columns_, self.index_col) = _clean_index_names(
                columns, self.index_col, self.unnamed_cols
            )

        return index_name, orig_names, columns

    def _rows_to_cols(self, content):
        col_len = self.num_original_columns

        if self._implicit_index:
            col_len += len(self.index_col)

        max_len = max(len(row) for row in content)

        # Check that there are no rows with too many
        # elements in their row (rows with too few
        # elements are padded with NaN).
        if max_len > col_len and self.index_col is not False and self.usecols is None:

            footers = self.skipfooter if self.skipfooter else 0
            bad_lines = []

            iter_content = enumerate(content)
            content_len = len(content)
            content = []

            for (i, l) in iter_content:
                actual_len = len(l)

                if actual_len > col_len:
                    if self.error_bad_lines or self.warn_bad_lines:
                        row_num = self.pos - (content_len - i + footers)
                        bad_lines.append((row_num, actual_len))

                        if self.error_bad_lines:
                            break
                else:
                    content.append(l)

            for row_num, actual_len in bad_lines:
                msg = (
                    f"Expected {col_len} fields in line {row_num + 1}, saw "
                    f"{actual_len}"
                )
                if (
                    self.delimiter
                    and len(self.delimiter) > 1
                    and self.quoting != csv.QUOTE_NONE
                ):
                    # see gh-13374
                    reason = (
                        "Error could possibly be due to quotes being "
                        "ignored when a multi-char delimiter is used."
                    )
                    msg += ". " + reason

                self._alert_malformed(msg, row_num + 1)

        # see gh-13320
        zipped_content = list(lib.to_object_array(content, min_width=col_len).T)

        if self.usecols:
            if self._implicit_index:
                zipped_content = [
                    a
                    for i, a in enumerate(zipped_content)
                    if (
                        i < len(self.index_col)
                        # pandas\io\parsers.py:3159: error: Unsupported right
                        # operand type for in ("Optional[Any]")  [operator]
                        or i - len(self.index_col)  # type: ignore[operator]
                        in self._col_indices
                    )
                ]
            else:
                zipped_content = [
                    # pandas\io\parsers.py:3164: error: Unsupported right
                    # operand type for in ("Optional[Any]")  [operator]
                    a
                    for i, a in enumerate(zipped_content)
                    if i in self._col_indices  # type: ignore[operator]
                ]
        return zipped_content

    def _get_lines(self, rows=None):
        lines = self.buf
        new_rows = None

        # already fetched some number
        if rows is not None:
            # we already have the lines in the buffer
            if len(self.buf) >= rows:
                new_rows, self.buf = self.buf[:rows], self.buf[rows:]

            # need some lines
            else:
                rows -= len(self.buf)

        if new_rows is None:
            if isinstance(self.data, list):
                if self.pos > len(self.data):
                    raise StopIteration
                if rows is None:
                    new_rows = self.data[self.pos :]
                    new_pos = len(self.data)
                else:
                    new_rows = self.data[self.pos : self.pos + rows]
                    new_pos = self.pos + rows

                # Check for stop rows. n.b.: self.skiprows is a set.
                if self.skiprows:
                    new_rows = [
                        row
                        for i, row in enumerate(new_rows)
                        if not self.skipfunc(i + self.pos)
                    ]

                lines.extend(new_rows)
                self.pos = new_pos

            else:
                new_rows = []
                try:
                    if rows is not None:
                        for _ in range(rows):
                            # assert for mypy, data is Iterator[str] or None, would
                            # error in next
                            assert self.data is not None
                            new_rows.append(next(self.data))
                        lines.extend(new_rows)
                    else:
                        rows = 0

                        while True:
                            new_row = self._next_iter_line(row_num=self.pos + rows + 1)
                            rows += 1

                            if new_row is not None:
                                new_rows.append(new_row)

                except StopIteration:
                    if self.skiprows:
                        new_rows = [
                            row
                            for i, row in enumerate(new_rows)
                            if not self.skipfunc(i + self.pos)
                        ]
                    lines.extend(new_rows)
                    if len(lines) == 0:
                        raise
                self.pos += len(new_rows)

            self.buf = []
        else:
            lines = new_rows

        if self.skipfooter:
            lines = lines[: -self.skipfooter]

        lines = self._check_comments(lines)
        if self.skip_blank_lines:
            lines = self._remove_empty_lines(lines)
        lines = self._check_thousands(lines)
        return self._check_decimal(lines)


def _make_date_converter(
    date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True
):
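    """
    Build the function used to convert one or more raw date columns into
    datetimes, honoring any user-supplied ``date_parser``.

    A minimal sketch of the default behavior (illustrative only):

    >>> conv = _make_date_converter()
    >>> conv(np.array(["2020-01-01 12:00"]))  # doctest: +SKIP
    array(['2020-01-01T12:00:00.000000000'], dtype='datetime64[ns]')
    """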
    def converter(*date_cols):
        if date_parser is None:
            strs = parsing.concat_date_cols(date_cols)

            try:
                return tools.to_datetime(
                    ensure_object(strs),
                    utc=None,
                    dayfirst=dayfirst,
                    errors="ignore",
                    infer_datetime_format=infer_datetime_format,
                    cache=cache_dates,
                ).to_numpy()

            except ValueError:
                return tools.to_datetime(
                    parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates
                )
        else:
            try:
                result = tools.to_datetime(
                    date_parser(*date_cols), errors="ignore", cache=cache_dates
                )
                if isinstance(result, datetime.datetime):
                    raise Exception("scalar parser")
                return result
            except Exception:
                try:
                    return tools.to_datetime(
                        parsing.try_parse_dates(
                            parsing.concat_date_cols(date_cols),
                            parser=date_parser,
                            dayfirst=dayfirst,
                        ),
                        errors="ignore",
                    )
                except Exception:
                    return generic_parser(date_parser, *date_cols)

    return converter


def _process_date_conversion(
    data_dict,
    converter,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col=False,
):
    def _isindex(colspec):
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                data_dict[colspec] = converter(data_dict[colspec])
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
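        # e.g. parse_dates={"datetime": ["date", "time"]} combines the
        # existing "date" and "time" columns into a single new column
        # named "datetime".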
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter, colspec, data_dict, orig_names
            )

            new_data[new_name] = col
            new_cols.append(new_name)
            date_cols.update(old_names)

    data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols


def _try_convert_dates(parser, colspec, data_dict, columns):
    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name = "_".join(str(x) for x in colnames)
    to_parse = [data_dict[c] for c in colnames if c in data_dict]

    new_col = parser(*to_parse)
    return new_name, new_col, colnames


def _clean_na_values(na_values, keep_default_na=True):
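    """
    Normalize the user-provided ``na_values`` into parallel string and
    float collections. A small illustration (hypothetical marker):

    >>> na_vals, na_fvals = _clean_na_values(["-999"], keep_default_na=False)
    >>> "-999" in na_vals and -999.0 in na_fvals
    True
    """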

    if na_values is None:
        if keep_default_na:
            na_values = STR_NA_VALUES
        else:
            na_values = set()
        # pandas\io\parsers.py:3387: error: Need type annotation for
        # 'na_fvalues' (hint: "na_fvalues: Set[<type>] = ...")  [var-annotated]
        na_fvalues = set()  # type: ignore[var-annotated]
    elif isinstance(na_values, dict):
        old_na_values = na_values.copy()
        na_values = {}  # Prevent aliasing.

        # Convert the values in the na_values dictionary
        # into array-likes for further use. This is also
        # where we append the default NaN values, provided
        # that `keep_default_na=True`.
        for k, v in old_na_values.items():
            if not is_list_like(v):
                v = [v]

            if keep_default_na:
                v = set(v) | STR_NA_VALUES

            na_values[k] = v
        # pandas\io\parsers.py:3404: error: Incompatible types in assignment
        # (expression has type "Dict[Any, Any]", variable has type "Set[Any]")
        # [assignment]
        na_fvalues = {  # type: ignore[assignment]
            k: _floatify_na_values(v) for k, v in na_values.items()
        }
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues


def _clean_index_names(columns, index_col, unnamed_cols):
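    """
    Resolve string entries in ``index_col`` to positional indices and
    split the index names out of ``columns``. A small sketch
    (hypothetical names):

    >>> _clean_index_names(["idx", "a", "b"], ["idx"], set())
    (['idx'], ['a', 'b'], [0])
    """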
    if not _is_index_col(index_col):
        return None, columns, index_col

    columns = list(columns)

    cp_cols = list(columns)
    index_names = []

    # don't mutate
    index_col = list(index_col)

    for i, c in enumerate(index_col):
        if isinstance(c, str):
            index_names.append(c)
            for j, name in enumerate(cp_cols):
                if name == c:
                    index_col[i] = j
                    columns.remove(name)
                    break
        else:
            name = cp_cols[c]
            columns.remove(name)
            index_names.append(name)

    # Only clean index names that were placeholders.
    for i, name in enumerate(index_names):
        if isinstance(name, str) and name in unnamed_cols:
            # pandas\io\parsers.py:3445: error: No overload variant of
            # "__setitem__" of "list" matches argument types "int", "None"
            # [call-overload]
            index_names[i] = None  # type: ignore[call-overload]

    return index_names, columns, index_col


def _get_empty_meta(columns, index_col, index_names, dtype=None):
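    """
    Construct the (index, columns, col_dict) triple for a parse that
    produced no data rows. Illustrative sketch:

    >>> index, cols, col_dict = _get_empty_meta(["a", "b"], None, None)
    >>> cols
    ['a', 'b']
    >>> len(index)
    0
    """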
    columns = list(columns)

    # Convert `dtype` to a defaultdict of some kind.
    # This will enable us to write `dtype[col_name]`
    # without worrying about KeyError issues later on.
    if not isinstance(dtype, dict):
        # if dtype is None, default will be object.
        default_dtype = dtype or object
        dtype = defaultdict(lambda: default_dtype)
    else:
        # Save a copy of the dictionary.
        _dtype = dtype.copy()
        dtype = defaultdict(lambda: object)

        # Convert column indexes to column names.
        for k, v in _dtype.items():
            col = columns[k] if is_integer(k) else k
            dtype[col] = v

    # Even though we have no data, the "index" of the empty DataFrame
    # could for example still be an empty MultiIndex. Thus, we need to
    # check whether we have any index columns specified, via either:
    #
    # 1) index_col (column indices)
    # 2) index_names (column names)
    #
    # Both must be non-null to ensure a successful construction. Otherwise,
    # we have to create a generic empty Index.
    if (index_col is None or index_col is False) or index_names is None:
        index = Index([])
    else:
        data = [Series([], dtype=dtype[name]) for name in index_names]
        index = ensure_index_from_sequences(data, names=index_names)
        index_col.sort()

        for i, n in enumerate(index_col):
            columns.pop(n - i)

    col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns}

    return index, columns, col_dict


def _floatify_na_values(na_values):
    # create float versions of the na_values
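    # e.g. _floatify_na_values(["1.5", "n/a"]) -> {1.5}: "n/a" cannot be
    # parsed as a float, and NaN itself is excluded by the isnan check.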
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result


def _stringify_na_values(na_values):
    """Return the stringified and numeric versions of these values."""
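    # e.g. for x = 999 the result contains 999, "999", and "999.0", so an
    # integer-like NA marker matches however the text happens to render it.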
    result = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # the value is integer-like, e.g. 999
            if v == int(v):
                v = int(v)
                result.append(f"{v}.0")
                result.append(str(v))

            # pandas\io\parsers.py:3522: error: Argument 1 to "append" of
            # "list" has incompatible type "float"; expected "str"  [arg-type]
            result.append(v)  # type: ignore[arg-type]
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            # pandas\io\parsers.py:3526: error: Argument 1 to "append" of
            # "list" has incompatible type "int"; expected "str"  [arg-type]
            result.append(int(x))  # type: ignore[arg-type]
        except (TypeError, ValueError, OverflowError):
            pass
    return set(result)


def _get_na_values(col, na_values, na_fvalues, keep_default_na):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
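
    Examples
    --------
    A small sketch with dict-style inputs (hypothetical column names):

    >>> _get_na_values("a", {"a": {"x"}}, {"a": set()}, True)
    ({'x'}, set())
    >>> _get_na_values("b", {"a": {"x"}}, {"a": set()}, False)
    (set(), set())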
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues


def _get_col_names(colspec, columns):
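    """
    Map a mixed list of column names and positions to names only, e.g.
    (illustrative) ``_get_col_names(["a", 2], ["a", "b", "c"])`` returns
    ``["a", "c"]``.
    """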
    colset = set(columns)
    colnames = []
    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int):
            colnames.append(columns[c])
    return colnames


class FixedWidthReader(abc.Iterator):
    """
    A reader of fixed-width lines.
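
    Examples
    --------
    A minimal sketch of column-width inference (illustrative input):

    >>> from io import StringIO
    >>> reader = FixedWidthReader(StringIO("12  34"), "infer", None, None)
    >>> [(int(a), int(b)) for a, b in reader.colspecs]
    [(0, 2), (4, 6)]
    >>> next(reader)
    ['12', '34']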
    """

    def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100):
        self.f = f
        self.buffer = None
        self.delimiter = ("\r\n" + delimiter) if delimiter else "\n\r\t "
        self.comment = comment
        if colspecs == "infer":
            self.colspecs = self.detect_colspecs(
                infer_nrows=infer_nrows, skiprows=skiprows
            )
        else:
            self.colspecs = colspecs

        if not isinstance(self.colspecs, (tuple, list)):
            raise TypeError(
                "column specifications must be a list or tuple, "
                f"input was a {type(colspecs).__name__}"
            )

        for colspec in self.colspecs:
            if not (
                isinstance(colspec, (tuple, list))
                and len(colspec) == 2
                and isinstance(colspec[0], (int, np.integer, type(None)))
                and isinstance(colspec[1], (int, np.integer, type(None)))
            ):
                raise TypeError(
                    "Each column specification must be "
                    "2 element tuple or list of integers"
                )

    def get_rows(self, infer_nrows, skiprows=None):
        """
        Read rows from self.f, skipping as specified.

        We distinguish buffer_rows (the first <= infer_nrows
        lines) from the rows returned to detect_colspecs
        because it's simpler to leave the other locations
        with skiprows logic alone than to modify them to
        deal with the fact we skipped some rows here as
        well.

        Parameters
        ----------
        infer_nrows : int
            Number of rows to read from self.f, not counting
            rows that are skipped.
        skiprows : set, optional
            Indices of rows to skip.

        Returns
        -------
        detect_rows : list of str
            A list containing the rows to read.
        """
        if skiprows is None:
            skiprows = set()
        buffer_rows = []
        detect_rows = []
        for i, row in enumerate(self.f):
            if i not in skiprows:
                detect_rows.append(row)
            buffer_rows.append(row)
            if len(detect_rows) >= infer_nrows:
                break
        self.buffer = iter(buffer_rows)
        return detect_rows

    def detect_colspecs(self, infer_nrows=100, skiprows=None):
        # Regex escape the delimiters
        delimiters = "".join(fr"\{x}" for x in self.delimiter)
        pattern = re.compile(f"([^{delimiters}]+)")
        rows = self.get_rows(infer_nrows, skiprows)
        if not rows:
            raise EmptyDataError("No rows from which to infer column width")
        max_len = max(map(len, rows))
        mask = np.zeros(max_len + 1, dtype=int)
        if self.comment is not None:
            rows = [row.partition(self.comment)[0] for row in rows]
        for row in rows:
            for m in pattern.finditer(row):
                mask[m.start() : m.end()] = 1
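        # The mask now flags every character position covered by data in
        # some row; XOR against a one-step shifted copy marks the
        # transition points, which pair up into half-open (start, end)
        # column specs.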
        shifted = np.roll(mask, 1)
        shifted[0] = 0
        edges = np.where((mask ^ shifted) == 1)[0]
        edge_pairs = list(zip(edges[::2], edges[1::2]))
        return edge_pairs

    def __next__(self):
        if self.buffer is not None:
            try:
                line = next(self.buffer)
            except StopIteration:
                self.buffer = None
                line = next(self.f)
        else:
            line = next(self.f)
        # Note: 'colspecs' is a sequence of half-open intervals.
        return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]


class FixedWidthFieldParser(PythonParser):
    """
    Specialization that converts fixed-width fields into DataFrames.
    See PythonParser for details.
    """

    def __init__(self, f, **kwds):
        # Support iterators, convert to a list.
        self.colspecs = kwds.pop("colspecs")
        self.infer_nrows = kwds.pop("infer_nrows")
        PythonParser.__init__(self, f, **kwds)

    def _make_reader(self, f):
        self.data = FixedWidthReader(
            f,
            self.colspecs,
            self.delimiter,
            self.comment,
            self.skiprows,
            self.infer_nrows,
        )

    def _remove_empty_lines(self, lines) -> List:
        """
        Returns the list of lines without the empty ones. With fixed-width
        fields, empty lines become arrays of empty strings.

        See PythonParser._remove_empty_lines.
        """
        return [
            line
            for line in lines
            if any(not isinstance(e, str) or e.strip() for e in line)
        ]


def _refine_defaults_read(
    dialect: Union[str, csv.Dialect],
    delimiter: Union[str, object],
    delim_whitespace: bool,
    engine: str,
    sep: Union[str, object],
    defaults: Dict[str, Any],
):
    """Validate/refine default values of input parameters of read_csv, read_table.

    Parameters
    ----------
    dialect : str or csv.Dialect
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    delimiter : str or object
        Alias for sep.
    delim_whitespace : bool
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.
    engine : {'c', 'python'}
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    sep : str or object
        A delimiter provided by the user (str) or a sentinel value, i.e.
        pandas._libs.lib.no_default.
    defaults : dict
        Default values of input parameters.

    Returns
    -------
    kwds : dict
        Input parameters with correct values.

    Raises
    ------
    ValueError
        If a delimiter was specified with ``sep`` (or ``delimiter``) and
        ``delim_whitespace=True``.
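
    Examples
    --------
    A minimal sketch with all-default inputs (``lib.no_default`` is the
    sentinel ``read_csv`` passes for ``sep``):

    >>> _refine_defaults_read(
    ...     None, None, False, None, lib.no_default, {"delimiter": ","}
    ... )
    {'delimiter': ',', 'engine': 'c', 'engine_specified': False}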
    """
    # fix types for sep, delimiter to Union[str, Any]
    delim_default = defaults["delimiter"]
    kwds: Dict[str, Any] = {}
    # gh-23761
    #
    # When a dialect is passed, it overrides any of the overlapping
    # parameters passed in directly. We don't want to warn if the
    # default parameters were passed in (since it probably means
    # that the user didn't pass them in explicitly in the first place).
    #
    # "delimiter" is the annoying corner case because we alias it to
    # "sep" before doing comparison to the dialect values later on.
    # Thus, we need a flag to indicate that we need to "override"
    # the comparison to dialect values by checking if default values
    # for BOTH "delimiter" and "sep" were provided.
    if dialect is not None:
        kwds["sep_override"] = delimiter is None and (
            sep is lib.no_default or sep == delim_default
        )

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if delim_whitespace and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            "delim_whitespace=True; you can only specify one."
        )

    if delimiter is lib.no_default:
        # assign default separator value
        kwds["delimiter"] = delim_default
    else:
        kwds["delimiter"] = delimiter

    if engine is not None:
        kwds["engine_specified"] = True
    else:
        kwds["engine"] = "c"
        kwds["engine_specified"] = False

    return kwds


def _extract_dialect(kwds: Dict[str, Any]) -> Optional[csv.Dialect]:
    """
    Extract concrete csv dialect instance.

    Returns
    -------
    csv.Dialect or None
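
    Examples
    --------
    A named stdlib dialect resolves to its concrete instance:

    >>> _extract_dialect({"dialect": "excel"}).delimiter
    ','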
    """
    if kwds.get("dialect") is None:
        return None

    dialect = kwds["dialect"]
    if dialect in csv.list_dialects():
        dialect = csv.get_dialect(dialect)

    _validate_dialect(dialect)

    return dialect


MANDATORY_DIALECT_ATTRS = (
    "delimiter",
    "doublequote",
    "escapechar",
    "skipinitialspace",
    "quotechar",
    "quoting",
)


def _validate_dialect(dialect: csv.Dialect) -> None:
    """
    Validate csv dialect instance.

    Raises
    ------
    ValueError
        If incorrect dialect is provided.
    """
    for param in MANDATORY_DIALECT_ATTRS:
        if not hasattr(dialect, param):
            raise ValueError(f"Invalid dialect {dialect} provided")


def _merge_with_dialect_properties(
    dialect: csv.Dialect,
    defaults: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Merge default kwargs in TextFileReader with dialect parameters.

    Parameters
    ----------
    dialect : csv.Dialect
        Concrete csv dialect. See csv.Dialect documentation for more details.
    defaults : dict
        Keyword arguments passed to TextFileReader.

    Returns
    -------
    kwds : dict
        Updated keyword arguments, merged with dialect parameters.
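
    Examples
    --------
    Dialect attributes win over parser defaults (a minimal sketch using
    the stdlib "excel" dialect):

    >>> dialect = csv.get_dialect("excel")
    >>> _merge_with_dialect_properties(dialect, {})["delimiter"]
    ','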
    """
    kwds = defaults.copy()

    for param in MANDATORY_DIALECT_ATTRS:
        dialect_val = getattr(dialect, param)

        parser_default = _parser_defaults[param]
        provided = kwds.get(param, parser_default)

        # Messages for conflicting values between the dialect
        # instance and the actual parameters provided.
        conflict_msgs = []

        # Don't warn if the default parameter was passed in,
        # even if it conflicts with the dialect (gh-23761).
        if provided != parser_default and provided != dialect_val:
            msg = (
                f"Conflicting values for '{param}': '{provided}' was "
                f"provided, but the dialect specifies '{dialect_val}'. "
                "Using the dialect-specified value."
            )

            # Annoying corner case for not warning about
            # conflicts between dialect and delimiter parameter.
            # Refer to the outer "_read_" function for more info.
            if not (param == "delimiter" and kwds.pop("sep_override", False)):
                conflict_msgs.append(msg)

        if conflict_msgs:
            warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2)
        kwds[param] = dialect_val
    return kwds


def _validate_skipfooter(kwds: Dict[str, Any]) -> None:
    """
    Check whether skipfooter is compatible with other kwargs in TextFileReader.

    Parameters
    ----------
    kwds : dict
        Keyword arguments passed to TextFileReader.

    Raises
    ------
    ValueError
        If skipfooter is not compatible with other parameters.
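
    Examples
    --------
    >>> _validate_skipfooter({"skipfooter": 1, "nrows": 5})
    Traceback (most recent call last):
        ...
    ValueError: 'skipfooter' not supported with 'nrows'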
    """
    if kwds.get("skipfooter"):
        if kwds.get("iterator") or kwds.get("chunksize"):
            raise ValueError("'skipfooter' not supported for iteration")
        if kwds.get("nrows"):
            raise ValueError("'skipfooter' not supported with 'nrows'")
