1""" 2Module contains tools for processing files into DataFrames or other objects 3""" 4 5from collections import abc, defaultdict 6import csv 7import datetime 8from io import StringIO 9import itertools 10import re 11import sys 12from textwrap import fill 13from typing import ( 14 Any, 15 Dict, 16 Iterable, 17 Iterator, 18 List, 19 Optional, 20 Sequence, 21 Set, 22 Type, 23 cast, 24) 25import warnings 26 27import numpy as np 28 29import pandas._libs.lib as lib 30import pandas._libs.ops as libops 31import pandas._libs.parsers as parsers 32from pandas._libs.parsers import STR_NA_VALUES 33from pandas._libs.tslibs import parsing 34from pandas._typing import FilePathOrBuffer, StorageOptions, Union 35from pandas.errors import ( 36 AbstractMethodError, 37 EmptyDataError, 38 ParserError, 39 ParserWarning, 40) 41from pandas.util._decorators import Appender 42 43from pandas.core.dtypes.cast import astype_nansafe 44from pandas.core.dtypes.common import ( 45 ensure_object, 46 ensure_str, 47 is_bool_dtype, 48 is_categorical_dtype, 49 is_dict_like, 50 is_dtype_equal, 51 is_extension_array_dtype, 52 is_file_like, 53 is_float, 54 is_integer, 55 is_integer_dtype, 56 is_list_like, 57 is_object_dtype, 58 is_scalar, 59 is_string_dtype, 60 pandas_dtype, 61) 62from pandas.core.dtypes.dtypes import CategoricalDtype 63from pandas.core.dtypes.missing import isna 64 65from pandas.core import algorithms, generic 66from pandas.core.arrays import Categorical 67from pandas.core.frame import DataFrame 68from pandas.core.indexes.api import ( 69 Index, 70 MultiIndex, 71 RangeIndex, 72 ensure_index_from_sequences, 73) 74from pandas.core.series import Series 75from pandas.core.tools import datetimes as tools 76 77from pandas.io.common import IOHandles, get_handle, validate_header_arg 78from pandas.io.date_converters import generic_parser 79 80# BOM character (byte order mark) 81# This exists at the beginning of a file to indicate endianness 82# of a file (stream). Unfortunately, this marker screws up parsing, 83# so we need to remove it if we see it. 84_BOM = "\ufeff" 85 86_doc_read_csv_and_table = ( 87 r""" 88{summary} 89 90Also supports optionally iterating or breaking of the file 91into chunks. 92 93Additional help can be found in the online docs for 94`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_. 95 96Parameters 97---------- 98filepath_or_buffer : str, path object or file-like object 99 Any valid string path is acceptable. The string could be a URL. Valid 100 URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is 101 expected. A local file could be: file://localhost/path/to/table.csv. 102 103 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 104 105 By file-like object, we refer to objects with a ``read()`` method, such as 106 a file handle (e.g. via builtin ``open`` function) or ``StringIO``. 107sep : str, default {_default_sep} 108 Delimiter to use. If sep is None, the C engine cannot automatically detect 109 the separator, but the Python parsing engine can, meaning the latter will 110 be used and automatically detect the separator by Python's builtin sniffer 111 tool, ``csv.Sniffer``. In addition, separators longer than 1 character and 112 different from ``'\s+'`` will be interpreted as regular expressions and 113 will also force the use of the Python parsing engine. Note that regex 114 delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. 115delimiter : str, default ``None`` 116 Alias for sep. 
header : int, list of int, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file; if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, default ``None``
    Column(s) to use as the row labels of the ``DataFrame``, either given as
    string name or column index. If a sequence of int / str is given, a
    MultiIndex is used.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g. when you have a malformed file with delimiters at
    the end of each line.
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}
    Use `str` or `object` together with suitable `na_values` settings
    to preserve and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.
engine : {{'c', 'python'}}, optional
    Parser engine to use. The C engine is faster while the python engine is
    currently more feature-complete.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can either
    be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted as
    NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified in `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If True, use a cache of unique, converted dates to apply the datetime
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

    .. versionadded:: 0.25.0
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
    For on-the-fly decompression of on-disk data. If 'infer' and
    `filepath_or_buffer` is path-like, then detect compression from the
    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
    decompression). If using 'zip', the ZIP file must contain only one data
    file to be read in. Set to None for no decompression.
thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .

    .. versionchanged:: 1.2

       When ``encoding`` is ``None``, ``errors="replace"`` is passed to
       ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
       This behavior was previously only the case for ``engine="python"``.
dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
error_bad_lines : bool, default True
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.
warn_bad_lines : bool, default True
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.
delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless,
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or 'high' for the ordinary converter,
    'legacy' for the original lower precision pandas converter, and
    'round_trip' for the round-trip converter.

    .. versionchanged:: 1.2

{storage_options}

    .. versionadded:: 1.2

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as a two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)


def validate_integer(name, val, min_val=0):
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : string
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
    """
    msg = f"'{name:s}' must be an integer >={min_val:d}"

    if val is not None:
        if is_float(val):
            if int(val) != val:
                raise ValueError(msg)
            val = int(val)
        # check the (possibly converted) value against min_val so that
        # floats such as 0.0 do not bypass the minimum-value validation
        if not (is_integer(val) and val >= min_val):
            raise ValueError(msg)

    return val


def _validate_names(names):
    """
    Raise ValueError if the `names` parameter contains duplicates or has an
    invalid data type.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Raises
    ------
    ValueError
        If names are not unique or are not ordered (e.g. set).
    """
    if names is not None:
        if len(names) != len(set(names)):
            raise ValueError("Duplicate names are not allowed.")
        if not (
            is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
        ):
            raise ValueError("Names should be an ordered collection.")


def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
    """Generic reader of line files."""
    if kwds.get("date_parser", None) is not None:
        if isinstance(kwds["parse_dates"], bool):
            kwds["parse_dates"] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get("iterator", False)
    chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1)
    nrows = kwds.get("nrows", None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
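    # When iterator=True or a chunksize is given, the TextFileReader itself
    # is returned so the caller can pull rows lazily; otherwise it is used
    # as a context manager and fully consumed below.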
    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        return parser

    with parser:
        return parser.read(nrows)


_parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "prefix": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "squeeze": False,
    "compression": None,
    "mangle_dupe_cols": True,
    "infer_datetime_format": False,
    "skip_blank_lines": True,
}


_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "error_bad_lines": True,
    "warn_bad_lines": True,
    "float_precision": None,
}

_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}

_deprecated_defaults: Dict[str, Any] = {}
_deprecated_args: Set[str] = set()


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=generic._shared_docs["storage_options"],
    )
)
def read_csv(
    filepath_or_buffer: FilePathOrBuffer,
    sep=lib.no_default,
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal: str = ".",
    lineterminator=None,
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    # Error Handling
    error_bad_lines=True,
    warn_bad_lines=True,
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map=False,
    float_precision=None,
    storage_options: StorageOptions = None,
):
    kwds = locals()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","}
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
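

# Example (illustrative only, not executed on import): reading a large file
# in chunks yields DataFrames lazily from the TextFileReader that read_csv
# returns when ``chunksize`` is given.
#
#     with read_csv("big.csv", chunksize=100_000) as reader:
#         for chunk in reader:
#             process(chunk)  # "process" is a placeholder for user code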


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=generic._shared_docs["storage_options"],
    )
)
def read_table(
    filepath_or_buffer: FilePathOrBuffer,
    sep=lib.no_default,
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal: str = ".",
    lineterminator=None,
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    # Error Handling
    error_bad_lines=True,
    warn_bad_lines=True,
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map=False,
    float_precision=None,
):
    kwds = locals()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"}
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
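

# Example (illustrative only): read_table is read_csv with a tab default, so
# these two calls are equivalent:
#
#     read_table("data.tsv")
#     read_csv("data.tsv", sep="\t")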


def read_fwf(
    filepath_or_buffer: FilePathOrBuffer,
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to[ ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.

        .. versionadded:: 0.24.0

    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextParser
        A table of fixed-width formatted lines is returned as a
        two-dimensional data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    elif colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    return _read(filepath_or_buffer, kwds)


class TextFileReader(abc.Iterator):
    """
    Passed dialect overrides any of the related parser options.
    """

    def __init__(self, f, engine=None, **kwds):

        self.f = f

        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            kwds = _merge_with_dialect_properties(dialect, kwds)

        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)
        self.squeeze = options.pop("squeeze", False)

        self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self._engine = self._make_engine(self.engine)

    def close(self):
        self._engine.close()

    def _get_options_with_defaults(self, engine):
        kwds = self.orig_options

        options = {}

        for argname, default in _parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if argname == "mangle_dupe_cols" and not value:
                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
            else:
                options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif value == _deprecated_defaults.get(argname, default):
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = _deprecated_defaults.get(argname, default)
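
            # whichever value survived the checks above becomes the
            # effective option for this engine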
            options[argname] = value

        if engine == "python-fwf":
            # pandas\io\parsers.py:907: error: Incompatible types in assignment
            # (expression has type "object", variable has type "Union[int, str,
            # None]")  [assignment]
            for argname, default in _fwf_defaults.items():  # type: ignore[assignment]
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine):
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__next__"):
            # The C engine doesn't need the file-like to have the "__next__"
            # attribute. However, the Python engine explicitly calls
            # "__next__(...)" when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(self, options, engine):
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine == "c":
                fallback_reason = (
                    "the 'c' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    "the 'c' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    "is > 1 char long, and the 'c' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    "and the 'c' engine does not support such quotechars"
                )
                engine = "python"

        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=5,
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        for arg in _deprecated_args:
            parser_default = _c_parser_defaults[arg]
            depr_default = _deprecated_defaults[arg]
            if result.get(arg, depr_default) != depr_default:
                msg = (
                    f"The {arg} argument has been deprecated and will be "
                    "removed in a future version.\n\n"
                )
                warnings.warn(msg, FutureWarning, stacklevel=2)
            else:
                result[arg] = parser_default

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if _is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
            result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python parsers
        if engine != "c":
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self):
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(self, engine="c"):
        mapping: Dict[str, Type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        # error: Too many arguments for "ParserBase"
        return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]

    def _failover_to_python(self):
        raise AbstractMethodError(self)

    def read(self, nrows=None):
        nrows = validate_integer("nrows", nrows)
        index, columns, col_dict = self._engine.read(nrows)

        if index is None:
            if col_dict:
                # Any column is actually fine:
                new_rows = len(next(iter(col_dict.values())))
                index = RangeIndex(self._currow, self._currow + new_rows)
            else:
                new_rows = 0
        else:
            new_rows = len(index)

        df = DataFrame(col_dict, columns=columns, index=index)

        self._currow += new_rows

        if self.squeeze and len(df.columns) == 1:
            return df[df.columns[0]].copy()
        return df

    def get_chunk(self, size=None):
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


def _is_index_col(col):
    return col is not None and col is not False


def _is_potential_multi_index(
    columns, index_col: Optional[Union[bool, Sequence[int]]] = None
):
    """
    Check whether or not the `columns` parameter
    could be converted into a MultiIndex.

    Parameters
    ----------
    columns : array-like
        Object which may or may not be convertible into a MultiIndex
    index_col : None, bool or list, optional
        Column or columns to use as the (possibly hierarchical) index

    Returns
    -------
    boolean : Whether or not columns could become a MultiIndex
    """
    if index_col is None or isinstance(index_col, bool):
        index_col = []

    return (
        len(columns)
        and not isinstance(columns, MultiIndex)
        and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
    )


def _evaluate_usecols(usecols, names):
    """
    Check whether or not the 'usecols' parameter
    is a callable. If so, enumerates the 'names'
    parameter and returns a set of indices for
    each entry in 'names' that evaluates to True.
    If not a callable, returns 'usecols'.
    """
    if callable(usecols):
        return {i for i, name in enumerate(names) if usecols(name)}
    return usecols


def _validate_usecols_names(usecols, names):
    """
    Validates that all usecols are present in a given
    list of names. If not, raise a ValueError that
    shows what usecols are missing.

    Parameters
    ----------
    usecols : iterable of usecols
        The columns to validate are present in names.
    names : iterable of names
        The column names to check against.

    Returns
    -------
    usecols : iterable of usecols
        The `usecols` parameter if the validation succeeds.

    Raises
    ------
    ValueError : Columns were missing. Error message will list them.
    """
    missing = [c for c in usecols if c not in names]
    if len(missing) > 0:
        raise ValueError(
            f"Usecols do not match columns, columns expected but not found: {missing}"
        )

    return usecols


def _validate_skipfooter_arg(skipfooter):
    """
    Validate the 'skipfooter' parameter.

    Checks whether 'skipfooter' is a non-negative integer.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    skipfooter : non-negative integer
        The number of rows to skip at the end of the file.

    Returns
    -------
    validated_skipfooter : non-negative integer
        The original input if the validation succeeds.

    Raises
    ------
    ValueError : 'skipfooter' was not a non-negative integer.
    """
    if not is_integer(skipfooter):
        raise ValueError("skipfooter must be an integer")

    if skipfooter < 0:
        raise ValueError("skipfooter cannot be negative")

    return skipfooter


def _validate_usecols_arg(usecols):
    """
    Validate the 'usecols' parameter.

    Checks whether or not the 'usecols' parameter contains all integers
    (column selection by index), strings (column by name) or is a callable.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    usecols : list-like, callable, or None
        List of columns to use when parsing or a callable that can be used
        to filter a list of table columns.

    Returns
    -------
    usecols_tuple : tuple
        A tuple of (verified_usecols, usecols_dtype).

        'verified_usecols' is either a set if an array-like is passed in or
        'usecols' if a callable or None is passed in.

        'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
        is passed in or None if a callable or None is passed in.
    """
    msg = (
        "'usecols' must either be list-like of all strings, all unicode, "
        "all integers or a callable."
    )
    if usecols is not None:
        if callable(usecols):
            return usecols, None

        if not is_list_like(usecols):
            # see gh-20529
            #
            # Ensure it is iterable container but not string.
            raise ValueError(msg)

        usecols_dtype = lib.infer_dtype(usecols, skipna=False)

        if usecols_dtype not in ("empty", "integer", "string"):
            raise ValueError(msg)

        usecols = set(usecols)

        return usecols, usecols_dtype
    return usecols, None


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the 'parse_dates' parameter
    is a non-boolean scalar. Raises a TypeError if
    that is the case.
    """
    msg = (
        "Only booleans, lists, and dictionaries are accepted "
        "for the 'parse_dates' parameter"
    )

    if parse_dates is not None:
        if is_scalar(parse_dates):
            if not lib.is_bool(parse_dates):
                raise TypeError(msg)

        elif not isinstance(parse_dates, (list, dict)):
            raise TypeError(msg)

    return parse_dates


class ParserBase:
    def __init__(self, kwds):

        self.names = kwds.get("names")
        self.orig_names: Optional[List] = None
        self.prefix = kwds.pop("prefix", None)

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: Set = set()
        self.index_names: Optional[List] = None
        self.col_names = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self.date_parser = kwds.pop("date_parser", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
        self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            dayfirst=self.dayfirst,
            infer_datetime_format=self.infer_datetime_format,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if isinstance(self.header, (list, tuple, np.ndarray)):
            if not all(map(is_integer, self.header)):
                raise ValueError("header must be integer or list of integers")
            if any(i < 0 for i in self.header):
                raise ValueError(
                    "cannot specify multi-index header with negative integers"
                )
            if kwds.get("usecols"):
kwds.get("usecols"): 1316 raise ValueError( 1317 "cannot specify usecols when specifying a multi-index header" 1318 ) 1319 if kwds.get("names"): 1320 raise ValueError( 1321 "cannot specify names when specifying a multi-index header" 1322 ) 1323 1324 # validate index_col that only contains integers 1325 if self.index_col is not None: 1326 is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) 1327 if not ( 1328 is_sequence 1329 and all(map(is_integer, self.index_col)) 1330 or is_integer(self.index_col) 1331 ): 1332 raise ValueError( 1333 "index_col must only contain row numbers " 1334 "when specifying a multi-index header" 1335 ) 1336 elif self.header is not None: 1337 # GH 27394 1338 if self.prefix is not None: 1339 raise ValueError( 1340 "Argument prefix must be None if argument header is not None" 1341 ) 1342 # GH 16338 1343 elif not is_integer(self.header): 1344 raise ValueError("header must be integer or list of integers") 1345 # GH 27779 1346 elif self.header < 0: 1347 raise ValueError( 1348 "Passing negative integer to header is invalid. " 1349 "For no header, use header=None instead" 1350 ) 1351 1352 self._name_processed = False 1353 1354 self._first_chunk = True 1355 1356 self.handles: Optional[IOHandles] = None 1357 1358 def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: 1359 """ 1360 Let the readers open IOHanldes after they are done with their potential raises. 1361 """ 1362 self.handles = get_handle( 1363 src, 1364 "r", 1365 encoding=kwds.get("encoding", None), 1366 compression=kwds.get("compression", None), 1367 memory_map=kwds.get("memory_map", False), 1368 storage_options=kwds.get("storage_options", None), 1369 ) 1370 1371 def _validate_parse_dates_presence(self, columns: List[str]) -> None: 1372 """ 1373 Check if parse_dates are in columns. 1374 1375 If user has provided names for parse_dates, check if those columns 1376 are available. 1377 1378 Parameters 1379 ---------- 1380 columns : list 1381 List of names of the dataframe. 1382 1383 Raises 1384 ------ 1385 ValueError 1386 If column to parse_date is not in dataframe. 
        """
        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #   Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) else [col] for col in self.parse_dates
            )
        else:
            cols_needed = []

        # get only columns that are references using names (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )

    def close(self):
        if self.handles is not None:
            self.handles.close()

    @property
    def _has_complex_date_col(self):
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    def _should_parse_dates(self, i):
        if isinstance(self.parse_dates, bool):
            return self.parse_dates
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = self.index_col[i]

            if is_scalar(self.parse_dates):
                return (j == self.parse_dates) or (
                    name is not None and name == self.parse_dates
                )
            else:
                return (j in self.parse_dates) or (
                    name is not None and name in self.parse_dates
                )

    def _extract_multi_indexer_columns(
        self, header, index_names, col_names, passed_names=False
    ):
        """
        extract and return the names, index_names, col_names
        header is a list-of-lists returned from the parsers
        """
        if len(header) < 2:
            return header[0], index_names, col_names, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, names, index_col = _clean_index_names(
            index_names, self.index_col, self.unnamed_cols
        )

        # extract the columns
        field_count = len(header[0])

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = ic + columns

        # If we find unnamed columns all in a single
        # level, then our header was too long.
        for n in range(len(columns[0])):
            if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
                header = ",".join(str(x) for x in self.header)
                raise ParserError(
                    f"Passed header=[{header}] are too many rows "
                    "for this multi_index of columns"
                )

        # Clean the column names (if we have an index_col).
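        # col_names keeps, per header row, the label found in the first
        # (index) slot, or None when that slot held an unnamed placeholder.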
        if len(ic):
            col_names = [
                r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols) else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

    def _maybe_dedup_names(self, names):
        # see gh-7160 and gh-9424: this helps to provide
        # immediate alleviation of the duplicate names
        # issue and appears to be satisfactory to users,
        # but ultimately, not needing to butcher the names
        # would be nice!
        if self.mangle_dupe_cols:
            names = list(names)  # so we can index
            # pandas\io\parsers.py:1559: error: Need type annotation for
            # 'counts'  [var-annotated]
            counts = defaultdict(int)  # type: ignore[var-annotated]
            is_potential_mi = _is_potential_multi_index(names, self.index_col)

            for i, col in enumerate(names):
                cur_count = counts[col]

                while cur_count > 0:
                    counts[col] = cur_count + 1

                    if is_potential_mi:
                        col = col[:-1] + (f"{col[-1]}.{cur_count}",)
                    else:
                        col = f"{col}.{cur_count}"
                    cur_count = counts[col]

                names[i] = col
                counts[col] = cur_count + 1

        return names

    def _maybe_make_multi_index_columns(self, columns, col_names=None):
        # possibly create a column mi here
        if _is_potential_multi_index(columns):
            columns = MultiIndex.from_tuples(columns, names=col_names)
        return columns

    def _make_index(self, data, alldata, columns, indexnamerow=False):
        if not _is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            index = self._get_simple_index(alldata, columns)
            index = self._agg_index(index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = _clean_index_names(
                    list(columns), self.index_col, self.unnamed_cols
                )
                self._name_processed = True
            index = self._get_complex_date_index(data, columns)
            index = self._agg_index(index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            # pandas\io\parsers.py:1604: error: Item "None" of "Optional[Any]"
            # has no attribute "set_names"  [union-attr]
            index = index.set_names(indexnamerow[:coffset])  # type: ignore[union-attr]

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    _implicit_index = False

    def _get_simple_index(self, data, columns):
        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    def _get_complex_date_index(self, data, col_names):
        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
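            # resolve a positional or named index_col entry to a column name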
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    def _agg_index(self, index, try_parse_dates=True) -> Index:
        arrays = []

        for i, arr in enumerate(index):

            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(arr)

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                # pandas\io\parsers.py:1678: error: Value of type
                # "Optional[Any]" is not indexable  [index]
                col_name = self.index_names[i]  # type: ignore[index]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    def _convert_to_ndarrays(
        self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
    ):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used"
                        ),
                        ParserWarning,
                        stacklevel=7,
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    mask = algorithms.isin(values, list(na_values)).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool=False
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type and (
                    not is_dtype_equal(cvals, cast_type)
                    or is_extension_array_dtype(cast_type)
                ):
                    if not is_ea and na_count > 0:
                        try:
                            if is_bool_dtype(cast_type):
                                raise ValueError(
                                    f"Bool column has NA values in column {c}"
                                )
                        except (AttributeError, TypeError):
                            # invalid input to is_bool_dtype
                            pass
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result

    def _infer_types(self, values, na_values, try_num_bool=True):
    def _infer_types(self, values, na_values, try_num_bool=True):
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            mask = algorithms.isin(values, list(na_values))
            na_count = mask.sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g. DatetimeIndex here
            try:
                result = lib.maybe_convert_numeric(values, na_values, False)
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                result = values
                na_count = parsers.sanitize_objects(result, na_values, False)
            else:
                na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values, False)

        if result.dtype == np.object_ and try_num_bool:
            result = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
            )

        return result, na_count

    def _cast_types(self, values, cast_type, column):
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray
        cast_type : string or np.dtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray
        """
        if is_categorical_dtype(cast_type):
            known_cats = (
                isinstance(cast_type, CategoricalDtype)
                and cast_type.categories is not None
            )

            if not is_object_dtype(values) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = astype_nansafe(values, str)

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif is_extension_array_dtype(cast_type):
            # ensure cast_type is an actual dtype and not a string
            cast_type = pandas_dtype(cast_type)
            array_type = cast_type.construct_array_type()
            try:
                return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        else:
            try:
                values = astype_nansafe(values, cast_type, copy=True, skipna=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values

    def _do_date_conversions(self, names, data):
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
            )

        return names, data

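# Illustrative parse_dates specs handled by _process_date_conversion further
# below (example values, not from the original source):
#
#     parse_dates=["date"]         # parse column "date" in place
#     parse_dates=[["date", "t"]]  # combine two columns into "date_t"
#     parse_dates={"dt": [0, 1]}   # combine columns 0 and 1, naming it "dt"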
class CParserWrapper(ParserBase):
    def __init__(self, src: FilePathOrBuffer, **kwds):
        self.kwds = kwds
        kwds = kwds.copy()

        ParserBase.__init__(self, kwds)

        # #2442
        kwds["allow_leading_cols"] = self.index_col is not False

        # GH20529, validate usecol arg before TextReader
        self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
        kwds["usecols"] = self.usecols

        # open handles
        self._open_handles(src, kwds)
        assert self.handles is not None
        for key in ("storage_options", "encoding", "memory_map", "compression"):
            kwds.pop(key, None)

        try:
            self._reader = parsers.TextReader(self.handles.handle, **kwds)
        except Exception:
            self.handles.close()
            raise
        self.unnamed_cols = self._reader.unnamed_cols

        passed_names = self.names is None

        if self._reader.header is None:
            self.names = None
        else:
            if len(self._reader.header) > 1:
                # we have a multi index in the columns
                (
                    self.names,
                    self.index_names,
                    self.col_names,
                    passed_names,
                ) = self._extract_multi_indexer_columns(
                    self._reader.header, self.index_names, self.col_names, passed_names
                )
            else:
                self.names = list(self._reader.header[0])

        if self.names is None:
            if self.prefix:
                self.names = [
                    f"{self.prefix}{i}" for i in range(self._reader.table_width)
                ]
            else:
                self.names = list(range(self._reader.table_width))

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        self.orig_names = self.names[:]

        if self.usecols:
            usecols = _evaluate_usecols(self.usecols, self.orig_names)

            # GH 14671
            # assert for mypy, orig_names is List or None, None would error in issubset
            assert self.orig_names is not None
            if self.usecols_dtype == "string" and not set(usecols).issubset(
                self.orig_names
            ):
                _validate_usecols_names(usecols, self.orig_names)

            if len(self.names) > len(usecols):
                self.names = [
                    n
                    for i, n in enumerate(self.names)
                    if (i in usecols or n in usecols)
                ]

            if len(self.names) < len(usecols):
                _validate_usecols_names(usecols, self.names)

        self._validate_parse_dates_presence(self.names)
        self._set_noconvert_columns()

        self.orig_names = self.names

        if not self._has_complex_date_col:
            if self._reader.leading_cols == 0 and _is_index_col(self.index_col):

                self._name_processed = True
                (index_names, self.names, self.index_col) = _clean_index_names(
                    self.names, self.index_col, self.unnamed_cols
                )

                if self.index_names is None:
                    self.index_names = index_names

            if self._reader.header is None and not passed_names:
                # pandas\io\parsers.py:1997: error: Argument 1 to "len" has
                # incompatible type "Optional[Any]"; expected "Sized"
                # [arg-type]
                self.index_names = [None] * len(
                    self.index_names  # type: ignore[arg-type]
                )

        self._implicit_index = self._reader.leading_cols > 0

    def close(self) -> None:
        super().close()

        # close additional handles opened by C parser
        try:
            self._reader.close()
        except ValueError:
            pass
    def _set_noconvert_columns(self):
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions.
        """
        names = self.orig_names
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = list(self.usecols)
            usecols.sort()
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = self.names[:]
        else:
            # Usecols is empty.

            # pandas\io\parsers.py:2030: error: Incompatible types in
            # assignment (expression has type "None", variable has type
            # "List[Any]") [assignment]
            usecols = None  # type: ignore[assignment]

        def _set(x):
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                # assert for mypy, names is List or None, None would error when calling
                # .index()
                assert names is not None
                x = names.index(x)

            self._reader.set_noconvert(x)

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    _set(k)
            elif self.index_col is not None:
                _set(self.index_col)

    def set_error_bad_lines(self, status):
        self._reader.set_error_bad_lines(int(status))

    def read(self, nrows=None):
        try:
            data = self._reader.read(nrows)
        except StopIteration:
            if self._first_chunk:
                self._first_chunk = False
                names = self._maybe_dedup_names(self.orig_names)
                index, columns, col_dict = _get_empty_meta(
                    names,
                    self.index_col,
                    self.index_names,
                    dtype=self.kwds.get("dtype"),
                )
                columns = self._maybe_make_multi_index_columns(columns, self.col_names)

                if self.usecols is not None:
                    columns = self._filter_usecols(columns)

                col_dict = {k: v for k, v in col_dict.items() if k in columns}

                return index, columns, col_dict

            else:
                self.close()
                raise

        # Done with first read, next time raise StopIteration
        self._first_chunk = False

        names = self.names

        if self._reader.leading_cols:
            if self._has_complex_date_col:
                raise NotImplementedError("file structure not yet supported")

            # implicit index, no index names
            arrays = []

            for i in range(self._reader.leading_cols):
                if self.index_col is None:
                    values = data.pop(i)
                else:
                    values = data.pop(self.index_col[i])

                values = self._maybe_parse_dates(values, i, try_parse_dates=True)
                arrays.append(values)

            index = ensure_index_from_sequences(arrays)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            names = self._maybe_dedup_names(names)

            # rename dict keys
            data = sorted(data.items())
            data = {k: v for k, (i, v) in zip(names, data)}

            names, data = self._do_date_conversions(names, data)

        else:
            # rename dict keys
            data = sorted(data.items())

            # ugh, mutation

            # assert for mypy, orig_names is List or None, None would error in list(...)
            assert self.orig_names is not None
            names = list(self.orig_names)
            names = self._maybe_dedup_names(names)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            # columns as list
            alldata = [x[1] for x in data]

            data = {k: v for k, (i, v) in zip(names, data)}

            names, data = self._do_date_conversions(names, data)
            index, names = self._make_index(data, alldata, names)

            # maybe create a mi on the columns
            names = self._maybe_make_multi_index_columns(names, self.col_names)

        return index, names, data

    def _filter_usecols(self, names):
        # hackish
        usecols = _evaluate_usecols(self.usecols, names)
        if usecols is not None and len(names) != len(usecols):
            names = [
                name for i, name in enumerate(names) if i in usecols or name in usecols
            ]
        return names

    def _get_index_names(self):
        names = list(self._reader.header[0])
        idx_names = None

        if self._reader.leading_cols == 0 and self.index_col is not None:
            (idx_names, names, self.index_col) = _clean_index_names(
                names, self.index_col, self.unnamed_cols
            )

        return names, idx_names

    def _maybe_parse_dates(self, values, index, try_parse_dates=True):
        if try_parse_dates and self._should_parse_dates(index):
            values = self._date_conv(values)
        return values

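# Minimal illustrative usage of TextParser below (a sketch, not from the
# original source):
#
#     >>> reader = TextParser([["a", "b"], ["1", "2"]], header=0)
#     >>> df = reader.read()  # DataFrame with columns "a" and "b"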
def TextParser(*args, **kwds):
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, default
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names: bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    squeeze : bool, default False
        returns Series if only one column.
    infer_datetime_format: bool, default False
        If True and `parse_dates` is True for a column, try to infer the
        datetime format based on the first datetime string. If the format
        can be inferred, there often will be a large parsing speed-up.
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.

        .. versionchanged:: 1.2
    """
    kwds["engine"] = "python"
    return TextFileReader(*args, **kwds)


def count_empty_vals(vals) -> int:
    return sum(1 for v in vals if v == "" or v is None)


class PythonParser(ParserBase):
    def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
        """
        Workhorse function for processing nested list into DataFrame
        """
        ParserBase.__init__(self, kwds)

        self.data: Optional[Iterator[str]] = None
        self.buf: List = []
        self.pos = 0
        self.line_pos = 0

        self.skiprows = kwds["skiprows"]

        if callable(self.skiprows):
            self.skipfunc = self.skiprows
        else:
            self.skipfunc = lambda x: x in self.skiprows

        self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
        self.delimiter = kwds["delimiter"]

        self.quotechar = kwds["quotechar"]
        if isinstance(self.quotechar, str):
            self.quotechar = str(self.quotechar)

        self.escapechar = kwds["escapechar"]
        self.doublequote = kwds["doublequote"]
        self.skipinitialspace = kwds["skipinitialspace"]
        self.lineterminator = kwds["lineterminator"]
        self.quoting = kwds["quoting"]
        self.usecols, _ = _validate_usecols_arg(kwds["usecols"])
        self.skip_blank_lines = kwds["skip_blank_lines"]

        self.warn_bad_lines = kwds["warn_bad_lines"]
        self.error_bad_lines = kwds["error_bad_lines"]

        self.names_passed = kwds["names"] or None

        self.has_index_names = False
        if "has_index_names" in kwds:
            self.has_index_names = kwds["has_index_names"]

        self.verbose = kwds["verbose"]
        self.converters = kwds["converters"]

        self.dtype = kwds["dtype"]
        self.thousands = kwds["thousands"]
        self.decimal = kwds["decimal"]

        self.comment = kwds["comment"]

        # Set self.data to something that can read lines.
        if isinstance(f, list):
            # read_excel: f is a list
            self.data = cast(Iterator[str], f)
        else:
            self._open_handles(f, kwds)
            assert self.handles is not None
            assert hasattr(self.handles.handle, "readline")
            try:
                self._make_reader(self.handles.handle)
            except (csv.Error, UnicodeDecodeError):
                self.close()
                raise

        # Get columns in two steps: infer from data, then
        # infer column indices from self.usecols if it is specified.
        self._col_indices = None
        try:
            (
                self.columns,
                self.num_original_columns,
                self.unnamed_cols,
            ) = self._infer_columns()
        except (TypeError, ValueError):
            self.close()
            raise

        # Now self.columns has the set of columns that we will process.
        # The original set is stored in self.original_columns.
        if len(self.columns) > 1:
            # we are processing a multi index column
            (
                self.columns,
                self.index_names,
                self.col_names,
                _,
            ) = self._extract_multi_indexer_columns(
                self.columns, self.index_names, self.col_names
            )
            # Update list of original names to include all indices.
            self.num_original_columns = len(self.columns)
        else:
            self.columns = self.columns[0]

        # get popped off for index
        self.orig_names = list(self.columns)

        # needs to be cleaned/refactored
        # multiple date column thing turning into a real spaghetti factory

        if not self._has_complex_date_col:
            (index_names, self.orig_names, self.columns) = self._get_index_name(
                self.columns
            )
            self._name_processed = True
            if self.index_names is None:
                self.index_names = index_names

        self._validate_parse_dates_presence(self.columns)
        if self.parse_dates:
            self._no_thousands_columns = self._set_no_thousands_columns()
        else:
            self._no_thousands_columns = None

        if len(self.decimal) != 1:
            raise ValueError("Only length-1 decimal markers supported")

        if self.thousands is None:
            self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+")
        else:
            self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+")

    def _set_no_thousands_columns(self):
        # Create a set of column ids that are not to be stripped of thousands
        # separators.
        noconvert_columns = set()

        def _set(x):
            if is_integer(x):
                noconvert_columns.add(x)
            else:
                noconvert_columns.add(self.columns.index(x))

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    _set(k)
            elif self.index_col is not None:
                _set(self.index_col)

        return noconvert_columns
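    # Illustrative sketch (assumed example, not from the original source):
    # with parse_dates=["d"] and thousands=",", the set built above holds
    # the position of column "d", so its values are never stripped of ","
    # by the thousands-handling pass.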
    def _make_reader(self, f):
        sep = self.delimiter

        if sep is None or len(sep) == 1:
            if self.lineterminator:
                raise ValueError(
                    "Custom line terminators not supported in python parser (yet)"
                )

            class MyDialect(csv.Dialect):
                delimiter = self.delimiter
                quotechar = self.quotechar
                escapechar = self.escapechar
                doublequote = self.doublequote
                skipinitialspace = self.skipinitialspace
                quoting = self.quoting
                lineterminator = "\n"

            dia = MyDialect

            if sep is not None:
                dia.delimiter = sep
            else:
                # attempt to sniff the delimiter from the first valid line,
                # i.e. no comment line and not in skiprows
                line = f.readline()
                lines = self._check_comments([[line]])[0]
                while self.skipfunc(self.pos) or not lines:
                    self.pos += 1
                    line = f.readline()
                    lines = self._check_comments([[line]])[0]

                # since `line` was a string, lines will be a list containing
                # only a single string
                line = lines[0]

                self.pos += 1
                self.line_pos += 1
                sniffed = csv.Sniffer().sniff(line)
                dia.delimiter = sniffed.delimiter

                # Note: encoding is irrelevant here
                line_rdr = csv.reader(StringIO(line), dialect=dia)
                self.buf.extend(list(line_rdr))

            # Note: encoding is irrelevant here
            reader = csv.reader(f, dialect=dia, strict=True)

        else:

            def _read():
                line = f.readline()
                pat = re.compile(sep)

                yield pat.split(line.strip())

                for line in f:
                    yield pat.split(line.strip())

            reader = _read()

        # pandas\io\parsers.py:2427: error: Incompatible types in assignment
        # (expression has type "_reader", variable has type "Union[IO[Any],
        # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap, None]")
        # [assignment]
        self.data = reader  # type: ignore[assignment]

    def read(self, rows=None):
        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                self.close()
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        # pandas\io\parsers.py:2480: error: Argument 1 to "list" has
        # incompatible type "Optional[Any]"; expected "Iterable[Any]"
        # [arg-type]
        columns = list(self.orig_names)  # type: ignore[arg-type]
        if not len(content):  # pragma: no cover
            # DataFrame with the right metadata, even though it's length 0
            names = self._maybe_dedup_names(self.orig_names)
            index, columns, col_dict = _get_empty_meta(
                names, self.index_col, self.index_names, self.dtype
            )
            columns = self._maybe_make_multi_index_columns(columns, self.col_names)
            return index, columns, col_dict

        # handle new style for names in index
        count_empty_content_vals = count_empty_vals(content[0])
        indexnamerow = None
        if self.has_index_names and count_empty_content_vals == len(columns):
            indexnamerow = content[0]
            content = content[1:]

        alldata = self._rows_to_cols(content)
        data = self._exclude_implicit_index(alldata)

        columns = self._maybe_dedup_names(self.columns)
        columns, data = self._do_date_conversions(columns, data)

        data = self._convert_data(data)
        index, columns = self._make_index(data, alldata, columns, indexnamerow)

        return index, columns, data

    def _exclude_implicit_index(self, alldata):
        names = self._maybe_dedup_names(self.orig_names)

        if self._implicit_index:
            excl_indices = self.index_col

            data = {}
            offset = 0
            for i, col in enumerate(names):
                while i + offset in excl_indices:
                    offset += 1
                data[col] = alldata[i + offset]
        else:
            data = {k: v for k, v in zip(names, alldata)}

        return data

    # legacy
    def get_chunk(self, size=None):
        if size is None:
            # pandas\io\parsers.py:2528: error: "PythonParser" has no attribute
            # "chunksize" [attr-defined]
            size = self.chunksize  # type: ignore[attr-defined]
        return self.read(rows=size)
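    # Illustrative sketch (example values, not from the original source):
    # _clean_mapping in _convert_data below maps positional keys to column
    # names, e.g. with orig_names ["a", "b"] a converters dict {0: f}
    # becomes {"a": f}.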
    def _convert_data(self, data):
        # apply converters
        def _clean_mapping(mapping):
            """converts col numbers to names"""
            clean = {}
            for col, v in mapping.items():
                # pandas\io\parsers.py:2537: error: Unsupported right operand
                # type for in ("Optional[Any]") [operator]
                if (
                    isinstance(col, int)
                    and col not in self.orig_names  # type: ignore[operator]
                ):
                    # pandas\io\parsers.py:2538: error: Value of type
                    # "Optional[Any]" is not indexable [index]
                    col = self.orig_names[col]  # type: ignore[index]
                clean[col] = v
            return clean

        clean_conv = _clean_mapping(self.converters)
        if not isinstance(self.dtype, dict):
            # handles single dtype applied to all columns
            clean_dtypes = self.dtype
        else:
            clean_dtypes = _clean_mapping(self.dtype)

        # Apply NA values.
        clean_na_values = {}
        clean_na_fvalues = {}

        if isinstance(self.na_values, dict):
            for col in self.na_values:
                na_value = self.na_values[col]
                na_fvalue = self.na_fvalues[col]

                # pandas\io\parsers.py:2558: error: Unsupported right operand
                # type for in ("Optional[Any]") [operator]
                if (
                    isinstance(col, int)
                    and col not in self.orig_names  # type: ignore[operator]
                ):
                    # pandas\io\parsers.py:2559: error: Value of type
                    # "Optional[Any]" is not indexable [index]
                    col = self.orig_names[col]  # type: ignore[index]

                clean_na_values[col] = na_value
                clean_na_fvalues[col] = na_fvalue
        else:
            clean_na_values = self.na_values
            clean_na_fvalues = self.na_fvalues

        return self._convert_to_ndarrays(
            data,
            clean_na_values,
            clean_na_fvalues,
            self.verbose,
            clean_conv,
            clean_dtypes,
        )

    def _infer_columns(self):
        names = self.names
        num_original_columns = 0
        clear_buffer = True
        # pandas\io\parsers.py:2580: error: Need type annotation for
        # 'unnamed_cols' (hint: "unnamed_cols: Set[<type>] = ...")
        # [var-annotated]
        unnamed_cols = set()  # type: ignore[var-annotated]

        if self.header is not None:
            header = self.header

            if isinstance(header, (list, tuple, np.ndarray)):
                have_mi_columns = len(header) > 1
                # we have a mi columns, so read an extra line
                if have_mi_columns:
                    header = list(header) + [header[-1] + 1]
            else:
                have_mi_columns = False
                header = [header]

            # pandas\io\parsers.py:2594: error: Need type annotation for
            # 'columns' (hint: "columns: List[<type>] = ...") [var-annotated]
            columns = []  # type: ignore[var-annotated]
            for level, hr in enumerate(header):
                try:
                    line = self._buffered_line()

                    while self.line_pos <= hr:
                        line = self._next_line()

                except StopIteration as err:
                    if self.line_pos < hr:
                        raise ValueError(
                            f"Passed header={hr} but only {self.line_pos + 1} lines in "
                            "file"
                        ) from err

                    # We have an empty file, so check
                    # if columns are provided. That will
                    # serve as the 'line' for parsing
                    if have_mi_columns and hr > 0:
                        if clear_buffer:
                            self._clear_buffer()
                        columns.append([None] * len(columns[-1]))
                        return columns, num_original_columns, unnamed_cols

                    if not self.names:
                        raise EmptyDataError("No columns to parse from file") from err

                    line = self.names[:]

                this_columns = []
                this_unnamed_cols = []

                for i, c in enumerate(line):
                    if c == "":
                        if have_mi_columns:
                            col_name = f"Unnamed: {i}_level_{level}"
                        else:
                            col_name = f"Unnamed: {i}"

                        this_unnamed_cols.append(i)
                        this_columns.append(col_name)
                    else:
                        this_columns.append(c)

                if not have_mi_columns and self.mangle_dupe_cols:
                    # pandas\io\parsers.py:2639: error: Need type annotation
                    # for 'counts' [var-annotated]
                    counts = defaultdict(int)  # type: ignore[var-annotated]

                    for i, col in enumerate(this_columns):
                        cur_count = counts[col]

                        while cur_count > 0:
                            counts[col] = cur_count + 1
                            col = f"{col}.{cur_count}"
                            cur_count = counts[col]

                        this_columns[i] = col
                        counts[col] = cur_count + 1
                elif have_mi_columns:

                    # if we have grabbed an extra line, but it's not in our
                    # format, save it in the buffer and create a blank extra
                    # line for the rest of the parsing code
                    if hr == header[-1]:
                        lc = len(this_columns)
                        ic = len(self.index_col) if self.index_col is not None else 0
                        unnamed_count = len(this_unnamed_cols)

                        if lc != unnamed_count and lc - ic > unnamed_count:
                            clear_buffer = False
                            # pandas\io\parsers.py:2663: error: List item 0 has
                            # incompatible type "None"; expected "str"
                            # [list-item]
                            this_columns = [None] * lc  # type: ignore[list-item]
                            self.buf = [self.buf[-1]]

                # pandas\io\parsers.py:2666: error: Argument 1 to "append" of
                # "list" has incompatible type "List[str]"; expected
                # "List[None]" [arg-type]
                columns.append(this_columns)  # type: ignore[arg-type]
                unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})

                if len(columns) == 1:
                    num_original_columns = len(this_columns)

            if clear_buffer:
                self._clear_buffer()

            if names is not None:
                if (self.usecols is not None and len(names) != len(self.usecols)) or (
                    self.usecols is None and len(names) != len(columns[0])
                ):
                    raise ValueError(
                        "Number of passed names did not match "
                        "number of header fields in the file"
                    )
                if len(columns) > 1:
                    raise TypeError("Cannot pass names with multi-index columns")

                if self.usecols is not None:
                    # Set _use_cols. We don't store columns because they are
                    # overwritten.
                    self._handle_usecols(columns, names)
                else:
                    self._col_indices = None
                    num_original_columns = len(names)
                columns = [names]
            else:
                columns = self._handle_usecols(columns, columns[0])
        else:
            try:
                line = self._buffered_line()

            except StopIteration as err:
                if not names:
                    raise EmptyDataError("No columns to parse from file") from err

                line = names[:]

            ncols = len(line)
            num_original_columns = ncols

            if not names:
                if self.prefix:
                    # pandas\io\parsers.py:2711: error: List comprehension has
                    # incompatible type List[str]; expected List[None] [misc]
                    columns = [
                        [
                            f"{self.prefix}{i}"  # type: ignore[misc]
                            for i in range(ncols)
                        ]
                    ]
                else:
                    # pandas\io\parsers.py:2713: error: Argument 1 to "list"
                    # has incompatible type "range"; expected "Iterable[None]"
                    # [arg-type]
                    columns = [list(range(ncols))]  # type: ignore[arg-type]
                columns = self._handle_usecols(columns, columns[0])
            else:
                if self.usecols is None or len(names) >= num_original_columns:
                    columns = self._handle_usecols([names], names)
                    num_original_columns = len(names)
                else:
                    if not callable(self.usecols) and len(names) != len(self.usecols):
                        raise ValueError(
                            "Number of passed names did not match number of "
                            "header fields in the file"
                        )
                    # Ignore output but set used columns.
                    self._handle_usecols([names], names)
                    columns = [names]
                    num_original_columns = ncols

        return columns, num_original_columns, unnamed_cols

    def _handle_usecols(self, columns, usecols_key):
        """
        Sets self._col_indices

        usecols_key is used if there are string usecols.
        """
        if self.usecols is not None:
            if callable(self.usecols):
                col_indices = _evaluate_usecols(self.usecols, usecols_key)
            elif any(isinstance(u, str) for u in self.usecols):
                if len(columns) > 1:
                    raise ValueError(
                        "If using multiple headers, usecols must be integers."
                    )
                col_indices = []

                for col in self.usecols:
                    if isinstance(col, str):
                        try:
                            col_indices.append(usecols_key.index(col))
                        except ValueError:
                            _validate_usecols_names(self.usecols, usecols_key)
                    else:
                        col_indices.append(col)
            else:
                col_indices = self.usecols

            columns = [
                [n for i, n in enumerate(column) if i in col_indices]
                for column in columns
            ]
            self._col_indices = col_indices
        return columns

    def _buffered_line(self):
        """
        Return a line from buffer, filling buffer if required.
        """
        if len(self.buf) > 0:
            return self.buf[0]
        else:
            return self._next_line()
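    # Illustrative sketch (example values, not from the original source):
    # for a first parsed row ['\ufeffa', 'b'], the BOM handling below
    # returns ['a', 'b']; a quoted first field like '\ufeff"a"' likewise
    # becomes 'a'.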
    def _check_for_bom(self, first_row):
        """
        Checks whether the file begins with the BOM character.
        If it does, remove it. In addition, if there is quoting
        in the field subsequent to the BOM, remove it as well
        because it technically takes place at the beginning of
        the name, not the middle of it.
        """
        # first_row will be a list, so we need to check
        # that that list is not empty before proceeding.
        if not first_row:
            return first_row

        # The first element of this row is the one that could have the
        # BOM that we want to remove. Check that the first element is a
        # string before proceeding.
        if not isinstance(first_row[0], str):
            return first_row

        # Check that the string is not empty, as that would
        # obviously not have a BOM at the start of it.
        if not first_row[0]:
            return first_row

        # Since the string is non-empty, check that it does
        # in fact begin with a BOM.
        first_elt = first_row[0][0]
        if first_elt != _BOM:
            return first_row

        first_row_bom = first_row[0]

        if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
            start = 2
            quote = first_row_bom[1]
            end = first_row_bom[2:].index(quote) + 2

            # Extract the data between the quotation marks
            new_row = first_row_bom[start:end]

            # Extract any remaining data after the second
            # quotation mark.
            if len(first_row_bom) > end + 1:
                new_row += first_row_bom[end + 1 :]

        else:

            # No quotation so just remove BOM from first element
            new_row = first_row_bom[1:]
        return [new_row] + first_row[1:]

    def _is_line_empty(self, line):
        """
        Check if a line is empty or not.

        Parameters
        ----------
        line : str, array-like
            The line of data to check.

        Returns
        -------
        boolean : Whether or not the line is empty.
        """
        return not line or all(not x for x in line)

    def _next_line(self):
        if isinstance(self.data, list):
            while self.skipfunc(self.pos):
                self.pos += 1

            while True:
                try:
                    line = self._check_comments([self.data[self.pos]])[0]
                    self.pos += 1
                    # either uncommented or blank to begin with
                    if not self.skip_blank_lines and (
                        self._is_line_empty(self.data[self.pos - 1]) or line
                    ):
                        break
                    elif self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])
                        if ret:
                            line = ret[0]
                            break
                except IndexError:
                    raise StopIteration
        else:
            while self.skipfunc(self.pos):
                self.pos += 1
                # assert for mypy, data is Iterator[str] or None, would error in next
                assert self.data is not None
                next(self.data)

            while True:
                orig_line = self._next_iter_line(row_num=self.pos + 1)
                self.pos += 1

                if orig_line is not None:
                    line = self._check_comments([orig_line])[0]

                    if self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])

                        if ret:
                            line = ret[0]
                            break
                    elif self._is_line_empty(orig_line) or line:
                        break

        # This was the first line of the file,
        # which could contain the BOM at the
        # beginning of it.
        if self.pos == 1:
            line = self._check_for_bom(line)

        self.line_pos += 1
        self.buf.append(line)
        return line

    def _alert_malformed(self, msg, row_num):
        """
        Alert a user about a malformed row.

        If `self.error_bad_lines` is True, the alert will be `ParserError`.
        If `self.warn_bad_lines` is True, the alert will be printed out.

        Parameters
        ----------
        msg : The error message to display.
        row_num : The row number where the parsing error occurred.
                  Because this row number is displayed, we 1-index,
                  even though we 0-index internally.
        """
        if self.error_bad_lines:
            raise ParserError(msg)
        elif self.warn_bad_lines:
            base = f"Skipping line {row_num}: "
            sys.stderr.write(base + msg + "\n")
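    # Illustrative sketch (not from the original source): with
    # error_bad_lines=False and warn_bad_lines=True, a malformed row is
    # reported to stderr as "Skipping line 3: ..." instead of raising
    # ParserError.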
    def _next_iter_line(self, row_num):
        """
        Wrapper around iterating through `self.data` (CSV source).

        When a CSV error is raised, we check for specific
        error messages that allow us to customize the
        error message displayed to the user.

        Parameters
        ----------
        row_num : The row number of the line being parsed.
        """
        try:
            # assert for mypy, data is Iterator[str] or None, would error in next
            assert self.data is not None
            return next(self.data)
        except csv.Error as e:
            if self.warn_bad_lines or self.error_bad_lines:
                msg = str(e)

                if "NULL byte" in msg or "line contains NUL" in msg:
                    msg = (
                        "NULL byte detected. This byte "
                        "cannot be processed in Python's "
                        "native csv library at the moment, "
                        "so please pass in engine='c' instead"
                    )

                if self.skipfooter > 0:
                    reason = (
                        "Error could possibly be due to "
                        "parsing errors in the skipped footer rows "
                        "(the skipfooter keyword is only applied "
                        "after Python's csv library has parsed "
                        "all rows)."
                    )
                    msg += ". " + reason

                self._alert_malformed(msg, row_num)
            return None

    def _check_comments(self, lines):
        if self.comment is None:
            return lines
        ret = []
        for line in lines:
            rl = []
            for x in line:
                if not isinstance(x, str) or self.comment not in x:
                    rl.append(x)
                else:
                    x = x[: x.find(self.comment)]
                    if len(x) > 0:
                        rl.append(x)
                    break
            ret.append(rl)
        return ret

    def _remove_empty_lines(self, lines):
        """
        Iterate through the lines and remove any that are
        either empty or contain only one whitespace value

        Parameters
        ----------
        lines : array-like
            The array of lines that we are to filter.

        Returns
        -------
        filtered_lines : array-like
            The same array of lines with the "empty" ones removed.
        """
        ret = []
        for line in lines:
            # Remove empty lines and lines with only one whitespace value
            if (
                len(line) > 1
                or len(line) == 1
                and (not isinstance(line[0], str) or line[0].strip())
            ):
                ret.append(line)
        return ret

    def _check_thousands(self, lines):
        if self.thousands is None:
            return lines

        return self._search_replace_num_columns(
            lines=lines, search=self.thousands, replace=""
        )

    def _search_replace_num_columns(self, lines, search, replace):
        ret = []
        for line in lines:
            rl = []
            for i, x in enumerate(line):
                if (
                    not isinstance(x, str)
                    or search not in x
                    or (self._no_thousands_columns and i in self._no_thousands_columns)
                    or self.nonnum.search(x.strip())
                ):
                    rl.append(x)
                else:
                    rl.append(x.replace(search, replace))
            ret.append(rl)
        return ret

    def _check_decimal(self, lines):
        if self.decimal == _parser_defaults["decimal"]:
            return lines

        return self._search_replace_num_columns(
            lines=lines, search=self.decimal, replace="."
        )
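    # Illustrative sketch (example values, not from the original source):
    # with thousands="." and decimal="," (European formatting), the passes
    # above rewrite "1.234,5" first to "1234,5" and then to "1234.5",
    # skipping columns in _no_thousands_columns and fields containing
    # non-numeric characters.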
    def _clear_buffer(self):
        self.buf = []

    _implicit_index = False

    def _get_index_name(self, columns):
        """
        Try several cases to get lines:

        0) There are headers on row 0 and row 1 and their
        total summed lengths equals the length of the next line.
        Treat row 0 as columns and row 1 as indices
        1) Look for implicit index: there are more columns
        on row 1 than row 0. If this is true, assume that row
        1 lists index columns and row 0 lists normal columns.
        2) Get index from the columns if it was listed.
        """
        orig_names = list(columns)
        columns = list(columns)

        try:
            line = self._next_line()
        except StopIteration:
            line = None

        try:
            next_line = self._next_line()
        except StopIteration:
            next_line = None

        # implicitly index_col=0 b/c 1 fewer column names
        implicit_first_cols = 0
        if line is not None:
            # leave it 0, #2442
            # Case 1
            if self.index_col is not False:
                implicit_first_cols = len(line) - self.num_original_columns

            # Case 0
            if next_line is not None:
                if len(next_line) == len(line) + self.num_original_columns:
                    # column and index names on diff rows
                    self.index_col = list(range(len(line)))
                    self.buf = self.buf[1:]

                    for c in reversed(line):
                        columns.insert(0, c)

                    # Update list of original names to include all indices.
                    orig_names = list(columns)
                    self.num_original_columns = len(columns)
                    return line, orig_names, columns

        if implicit_first_cols > 0:
            # Case 1
            self._implicit_index = True
            if self.index_col is None:
                self.index_col = list(range(implicit_first_cols))

            index_name = None

        else:
            # Case 2
            (index_name, columns_, self.index_col) = _clean_index_names(
                columns, self.index_col, self.unnamed_cols
            )

        return index_name, orig_names, columns

    def _rows_to_cols(self, content):
        col_len = self.num_original_columns

        if self._implicit_index:
            col_len += len(self.index_col)

        max_len = max(len(row) for row in content)

        # Check that there are no rows with too many
        # elements in their row (rows with too few
        # elements are padded with NaN).
        if max_len > col_len and self.index_col is not False and self.usecols is None:

            footers = self.skipfooter if self.skipfooter else 0
            bad_lines = []

            iter_content = enumerate(content)
            content_len = len(content)
            content = []

            for (i, l) in iter_content:
                actual_len = len(l)

                if actual_len > col_len:
                    if self.error_bad_lines or self.warn_bad_lines:
                        row_num = self.pos - (content_len - i + footers)
                        bad_lines.append((row_num, actual_len))

                        if self.error_bad_lines:
                            break
                else:
                    content.append(l)

            for row_num, actual_len in bad_lines:
                msg = (
                    f"Expected {col_len} fields in line {row_num + 1}, saw "
                    f"{actual_len}"
                )
                if (
                    self.delimiter
                    and len(self.delimiter) > 1
                    and self.quoting != csv.QUOTE_NONE
                ):
                    # see gh-13374
                    reason = (
                        "Error could possibly be due to quotes being "
                        "ignored when a multi-char delimiter is used."
                    )
" + reason 3163 3164 self._alert_malformed(msg, row_num + 1) 3165 3166 # see gh-13320 3167 zipped_content = list(lib.to_object_array(content, min_width=col_len).T) 3168 3169 if self.usecols: 3170 if self._implicit_index: 3171 zipped_content = [ 3172 a 3173 for i, a in enumerate(zipped_content) 3174 if ( 3175 i < len(self.index_col) 3176 # pandas\io\parsers.py:3159: error: Unsupported right 3177 # operand type for in ("Optional[Any]") [operator] 3178 or i - len(self.index_col) # type: ignore[operator] 3179 in self._col_indices 3180 ) 3181 ] 3182 else: 3183 zipped_content = [ 3184 # pandas\io\parsers.py:3164: error: Unsupported right 3185 # operand type for in ("Optional[Any]") [operator] 3186 a 3187 for i, a in enumerate(zipped_content) 3188 if i in self._col_indices # type: ignore[operator] 3189 ] 3190 return zipped_content 3191 3192 def _get_lines(self, rows=None): 3193 lines = self.buf 3194 new_rows = None 3195 3196 # already fetched some number 3197 if rows is not None: 3198 # we already have the lines in the buffer 3199 if len(self.buf) >= rows: 3200 new_rows, self.buf = self.buf[:rows], self.buf[rows:] 3201 3202 # need some lines 3203 else: 3204 rows -= len(self.buf) 3205 3206 if new_rows is None: 3207 if isinstance(self.data, list): 3208 if self.pos > len(self.data): 3209 raise StopIteration 3210 if rows is None: 3211 new_rows = self.data[self.pos :] 3212 new_pos = len(self.data) 3213 else: 3214 new_rows = self.data[self.pos : self.pos + rows] 3215 new_pos = self.pos + rows 3216 3217 # Check for stop rows. n.b.: self.skiprows is a set. 3218 if self.skiprows: 3219 new_rows = [ 3220 row 3221 for i, row in enumerate(new_rows) 3222 if not self.skipfunc(i + self.pos) 3223 ] 3224 3225 lines.extend(new_rows) 3226 self.pos = new_pos 3227 3228 else: 3229 new_rows = [] 3230 try: 3231 if rows is not None: 3232 for _ in range(rows): 3233 # assert for mypy, data is Iterator[str] or None, would 3234 # error in next 3235 assert self.data is not None 3236 new_rows.append(next(self.data)) 3237 lines.extend(new_rows) 3238 else: 3239 rows = 0 3240 3241 while True: 3242 new_row = self._next_iter_line(row_num=self.pos + rows + 1) 3243 rows += 1 3244 3245 if new_row is not None: 3246 new_rows.append(new_row) 3247 3248 except StopIteration: 3249 if self.skiprows: 3250 new_rows = [ 3251 row 3252 for i, row in enumerate(new_rows) 3253 if not self.skipfunc(i + self.pos) 3254 ] 3255 lines.extend(new_rows) 3256 if len(lines) == 0: 3257 raise 3258 self.pos += len(new_rows) 3259 3260 self.buf = [] 3261 else: 3262 lines = new_rows 3263 3264 if self.skipfooter: 3265 lines = lines[: -self.skipfooter] 3266 3267 lines = self._check_comments(lines) 3268 if self.skip_blank_lines: 3269 lines = self._remove_empty_lines(lines) 3270 lines = self._check_thousands(lines) 3271 return self._check_decimal(lines) 3272 3273 3274def _make_date_converter( 3275 date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True 3276): 3277 def converter(*date_cols): 3278 if date_parser is None: 3279 strs = parsing.concat_date_cols(date_cols) 3280 3281 try: 3282 return tools.to_datetime( 3283 ensure_object(strs), 3284 utc=None, 3285 dayfirst=dayfirst, 3286 errors="ignore", 3287 infer_datetime_format=infer_datetime_format, 3288 cache=cache_dates, 3289 ).to_numpy() 3290 3291 except ValueError: 3292 return tools.to_datetime( 3293 parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates 3294 ) 3295 else: 3296 try: 3297 result = tools.to_datetime( 3298 date_parser(*date_cols), errors="ignore", cache=cache_dates 3299 
def _make_date_converter(
    date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True
):
    def converter(*date_cols):
        if date_parser is None:
            strs = parsing.concat_date_cols(date_cols)

            try:
                return tools.to_datetime(
                    ensure_object(strs),
                    utc=None,
                    dayfirst=dayfirst,
                    errors="ignore",
                    infer_datetime_format=infer_datetime_format,
                    cache=cache_dates,
                ).to_numpy()

            except ValueError:
                return tools.to_datetime(
                    parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates
                )
        else:
            try:
                result = tools.to_datetime(
                    date_parser(*date_cols), errors="ignore", cache=cache_dates
                )
                if isinstance(result, datetime.datetime):
                    raise Exception("scalar parser")
                return result
            except Exception:
                try:
                    return tools.to_datetime(
                        parsing.try_parse_dates(
                            parsing.concat_date_cols(date_cols),
                            parser=date_parser,
                            dayfirst=dayfirst,
                        ),
                        errors="ignore",
                    )
                except Exception:
                    return generic_parser(date_parser, *date_cols)

    return converter


def _process_date_conversion(
    data_dict,
    converter,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col=False,
):
    def _isindex(colspec):
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                data_dict[colspec] = converter(data_dict[colspec])
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter, colspec, data_dict, orig_names
            )

            new_data[new_name] = col
            new_cols.append(new_name)
            date_cols.update(old_names)

    data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols


def _try_convert_dates(parser, colspec, data_dict, columns):
    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name = "_".join(str(x) for x in colnames)
    to_parse = [data_dict[c] for c in colnames if c in data_dict]

    new_col = parser(*to_parse)
    return new_name, new_col, colnames
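# Illustrative sketch (example values, not from the original source): with
# keep_default_na=True, _clean_na_values below expands a per-column spec
# like {"a": "missing"} into {"a": {"missing"} | STR_NA_VALUES}, with the
# float-coerced variants collected separately in na_fvalues.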
def _clean_na_values(na_values, keep_default_na=True):

    if na_values is None:
        if keep_default_na:
            na_values = STR_NA_VALUES
        else:
            na_values = set()
        # pandas\io\parsers.py:3387: error: Need type annotation for
        # 'na_fvalues' (hint: "na_fvalues: Set[<type>] = ...") [var-annotated]
        na_fvalues = set()  # type: ignore[var-annotated]
    elif isinstance(na_values, dict):
        old_na_values = na_values.copy()
        na_values = {}  # Prevent aliasing.

        # Convert the values in the na_values dictionary
        # into array-likes for further use. This is also
        # where we append the default NaN values, provided
        # that `keep_default_na=True`.
        for k, v in old_na_values.items():
            if not is_list_like(v):
                v = [v]

            if keep_default_na:
                v = set(v) | STR_NA_VALUES

            na_values[k] = v
        # pandas\io\parsers.py:3404: error: Incompatible types in assignment
        # (expression has type "Dict[Any, Any]", variable has type "Set[Any]")
        # [assignment]
        na_fvalues = {  # type: ignore[assignment]
            k: _floatify_na_values(v) for k, v in na_values.items()
        }
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues


def _clean_index_names(columns, index_col, unnamed_cols):
    if not _is_index_col(index_col):
        return None, columns, index_col

    columns = list(columns)

    cp_cols = list(columns)
    index_names = []

    # don't mutate
    index_col = list(index_col)

    for i, c in enumerate(index_col):
        if isinstance(c, str):
            index_names.append(c)
            for j, name in enumerate(cp_cols):
                if name == c:
                    index_col[i] = j
                    columns.remove(name)
                    break
        else:
            name = cp_cols[c]
            columns.remove(name)
            index_names.append(name)

    # Only clean index names that were placeholders.
    for i, name in enumerate(index_names):
        if isinstance(name, str) and name in unnamed_cols:
            # pandas\io\parsers.py:3445: error: No overload variant of
            # "__setitem__" of "list" matches argument types "int", "None"
            # [call-overload]
            index_names[i] = None  # type: ignore[call-overload]

    return index_names, columns, index_col
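# Illustrative sketch (example values, not from the original source): for
# columns=["a", "b", "c"] and index_col=["b"], _clean_index_names above
# returns index_names=["b"], columns=["a", "c"] and index_col=[1].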
def _get_empty_meta(columns, index_col, index_names, dtype=None):
    columns = list(columns)

    # Convert `dtype` to a defaultdict of some kind.
    # This will enable us to write `dtype[col_name]`
    # without worrying about KeyError issues later on.
    if not isinstance(dtype, dict):
        # if dtype == None, default will be object.
        default_dtype = dtype or object
        dtype = defaultdict(lambda: default_dtype)
    else:
        # Save a copy of the dictionary.
        _dtype = dtype.copy()
        dtype = defaultdict(lambda: object)

        # Convert column indexes to column names.
        for k, v in _dtype.items():
            col = columns[k] if is_integer(k) else k
            dtype[col] = v

    # Even though we have no data, the "index" of the empty DataFrame
    # could for example still be an empty MultiIndex. Thus, we need to
    # check whether we have any index columns specified, via either:
    #
    # 1) index_col (column indices)
    # 2) index_names (column names)
    #
    # Both must be non-null to ensure a successful construction. Otherwise,
    # we have to create a generic empty Index.
    if (index_col is None or index_col is False) or index_names is None:
        index = Index([])
    else:
        data = [Series([], dtype=dtype[name]) for name in index_names]
        index = ensure_index_from_sequences(data, names=index_names)
        index_col.sort()

        for i, n in enumerate(index_col):
            columns.pop(n - i)

    col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns}

    return index, columns, col_dict


def _floatify_na_values(na_values):
    # create float versions of the na_values
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result


def _stringify_na_values(na_values):
    """ return stringified and numeric versions of these values """
    result = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # for an integral value like 999, also register
            # "999.0", "999" and the numeric value itself
            if v == int(v):
                v = int(v)
                result.append(f"{v}.0")
                result.append(str(v))

            # pandas\io\parsers.py:3522: error: Argument 1 to "append" of
            # "list" has incompatible type "float"; expected "str" [arg-type]
            result.append(v)  # type: ignore[arg-type]
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            # pandas\io\parsers.py:3526: error: Argument 1 to "append" of
            # "list" has incompatible type "int"; expected "str" [arg-type]
            result.append(int(x))  # type: ignore[arg-type]
        except (TypeError, ValueError, OverflowError):
            pass
    return set(result)


def _get_na_values(col, na_values, na_fvalues, keep_default_na):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues


def _get_col_names(colspec, columns):
    colset = set(columns)
    colnames = []
    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int):
            colnames.append(columns[c])
    return colnames
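# Illustrative sketch (example values, not from the original source):
# colspecs are half-open [from, to) intervals, so colspecs=[(0, 4), (4, 7)]
# splits the line "foo bar" into ["foo", "bar"] after stripping delimiter
# characters.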
3622 """ 3623 3624 def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100): 3625 self.f = f 3626 self.buffer = None 3627 self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " 3628 self.comment = comment 3629 if colspecs == "infer": 3630 self.colspecs = self.detect_colspecs( 3631 infer_nrows=infer_nrows, skiprows=skiprows 3632 ) 3633 else: 3634 self.colspecs = colspecs 3635 3636 if not isinstance(self.colspecs, (tuple, list)): 3637 raise TypeError( 3638 "column specifications must be a list or tuple, " 3639 f"input was a {type(colspecs).__name__}" 3640 ) 3641 3642 for colspec in self.colspecs: 3643 if not ( 3644 isinstance(colspec, (tuple, list)) 3645 and len(colspec) == 2 3646 and isinstance(colspec[0], (int, np.integer, type(None))) 3647 and isinstance(colspec[1], (int, np.integer, type(None))) 3648 ): 3649 raise TypeError( 3650 "Each column specification must be " 3651 "2 element tuple or list of integers" 3652 ) 3653 3654 def get_rows(self, infer_nrows, skiprows=None): 3655 """ 3656 Read rows from self.f, skipping as specified. 3657 3658 We distinguish buffer_rows (the first <= infer_nrows 3659 lines) from the rows returned to detect_colspecs 3660 because it's simpler to leave the other locations 3661 with skiprows logic alone than to modify them to 3662 deal with the fact we skipped some rows here as 3663 well. 3664 3665 Parameters 3666 ---------- 3667 infer_nrows : int 3668 Number of rows to read from self.f, not counting 3669 rows that are skipped. 3670 skiprows: set, optional 3671 Indices of rows to skip. 3672 3673 Returns 3674 ------- 3675 detect_rows : list of str 3676 A list containing the rows to read. 3677 3678 """ 3679 if skiprows is None: 3680 skiprows = set() 3681 buffer_rows = [] 3682 detect_rows = [] 3683 for i, row in enumerate(self.f): 3684 if i not in skiprows: 3685 detect_rows.append(row) 3686 buffer_rows.append(row) 3687 if len(detect_rows) >= infer_nrows: 3688 break 3689 self.buffer = iter(buffer_rows) 3690 return detect_rows 3691 3692 def detect_colspecs(self, infer_nrows=100, skiprows=None): 3693 # Regex escape the delimiters 3694 delimiters = "".join(fr"\{x}" for x in self.delimiter) 3695 pattern = re.compile(f"([^{delimiters}]+)") 3696 rows = self.get_rows(infer_nrows, skiprows) 3697 if not rows: 3698 raise EmptyDataError("No rows from which to infer column width") 3699 max_len = max(map(len, rows)) 3700 mask = np.zeros(max_len + 1, dtype=int) 3701 if self.comment is not None: 3702 rows = [row.partition(self.comment)[0] for row in rows] 3703 for row in rows: 3704 for m in pattern.finditer(row): 3705 mask[m.start() : m.end()] = 1 3706 shifted = np.roll(mask, 1) 3707 shifted[0] = 0 3708 edges = np.where((mask ^ shifted) == 1)[0] 3709 edge_pairs = list(zip(edges[::2], edges[1::2])) 3710 return edge_pairs 3711 3712 def __next__(self): 3713 if self.buffer is not None: 3714 try: 3715 line = next(self.buffer) 3716 except StopIteration: 3717 self.buffer = None 3718 line = next(self.f) 3719 else: 3720 line = next(self.f) 3721 # Note: 'colspecs' is a sequence of half-open intervals. 3722 return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] 3723 3724 3725class FixedWidthFieldParser(PythonParser): 3726 """ 3727 Specialization that Converts fixed-width fields into DataFrames. 3728 See PythonParser for details. 3729 """ 3730 3731 def __init__(self, f, **kwds): 3732 # Support iterators, convert to a list. 
class FixedWidthFieldParser(PythonParser):
    """
    Specialization that converts fixed-width fields into DataFrames.
    See PythonParser for details.
    """

    def __init__(self, f, **kwds):
        # Support iterators, convert to a list.
        self.colspecs = kwds.pop("colspecs")
        self.infer_nrows = kwds.pop("infer_nrows")
        PythonParser.__init__(self, f, **kwds)

    def _make_reader(self, f):
        self.data = FixedWidthReader(
            f,
            self.colspecs,
            self.delimiter,
            self.comment,
            self.skiprows,
            self.infer_nrows,
        )

    def _remove_empty_lines(self, lines) -> List:
        """
        Return the list of lines without the empty ones. With fixed-width
        fields, empty lines become arrays of empty strings.

        See PythonParser._remove_empty_lines.
        """
        return [
            line
            for line in lines
            if any(not isinstance(e, str) or e.strip() for e in line)
        ]
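
# Usage sketch at the public API level (comment only): read_fwf routes here
# with engine="python-fwf", so explicit colspecs bypass inference while the
# default colspecs="infer" goes through FixedWidthReader.detect_colspecs.
#
#     import pandas as pd
#     from io import StringIO
#
#     pd.read_fwf(StringIO("col1  col2\nfoo      1\n"), colspecs=[(0, 4), (6, 10)])
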
def _refine_defaults_read(
    dialect: Union[str, csv.Dialect],
    delimiter: Union[str, object],
    delim_whitespace: bool,
    engine: str,
    sep: Union[str, object],
    defaults: Dict[str, Any],
):
    """Validate/refine default values of input parameters of read_csv, read_table.

    Parameters
    ----------
    dialect : str or csv.Dialect
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    delimiter : str or object
        Alias for sep.
    delim_whitespace : bool
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.
    engine : {'c', 'python'}
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    sep : str or object
        A delimiter provided by the user (str) or a sentinel value, i.e.
        pandas._libs.lib.no_default.
    defaults : dict
        Default values of input parameters.

    Returns
    -------
    kwds : dict
        Input parameters with correct values.

    Raises
    ------
    ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and
        ``delim_whitespace=True``.
    """
    # fix types for sep, delimiter to Union[str, Any]
    delim_default = defaults["delimiter"]
    kwds: Dict[str, Any] = {}
    # gh-23761
    #
    # When a dialect is passed, it overrides any of the overlapping
    # parameters passed in directly. We don't want to warn if the
    # default parameters were passed in (since it probably means
    # that the user didn't pass them in explicitly in the first place).
    #
    # "delimiter" is the annoying corner case because we alias it to
    # "sep" before doing comparison to the dialect values later on.
    # Thus, we need a flag to indicate that we need to "override"
    # the comparison to dialect values by checking if default values
    # for BOTH "delimiter" and "sep" were provided.
    if dialect is not None:
        kwds["sep_override"] = delimiter is None and (
            sep is lib.no_default or sep == delim_default
        )

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if delim_whitespace and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            "delim_whitespace=True; you can only specify one."
        )

    if delimiter is lib.no_default:
        # assign default separator value
        kwds["delimiter"] = delim_default
    else:
        kwds["delimiter"] = delimiter

    if engine is not None:
        kwds["engine_specified"] = True
    else:
        kwds["engine"] = "c"
        kwds["engine_specified"] = False

    return kwds


def _extract_dialect(kwds: Dict[str, Any]) -> Optional[csv.Dialect]:
    """
    Extract concrete csv dialect instance.

    Returns
    -------
    csv.Dialect or None
    """
    if kwds.get("dialect") is None:
        return None

    dialect = kwds["dialect"]
    if dialect in csv.list_dialects():
        dialect = csv.get_dialect(dialect)

    _validate_dialect(dialect)

    return dialect


MANDATORY_DIALECT_ATTRS = (
    "delimiter",
    "doublequote",
    "escapechar",
    "skipinitialspace",
    "quotechar",
    "quoting",
)


def _validate_dialect(dialect: csv.Dialect) -> None:
    """
    Validate csv dialect instance.

    Raises
    ------
    ValueError
        If incorrect dialect is provided.
    """
    for param in MANDATORY_DIALECT_ATTRS:
        if not hasattr(dialect, param):
            raise ValueError(f"Invalid dialect {dialect} provided")


def _merge_with_dialect_properties(
    dialect: csv.Dialect,
    defaults: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Merge default kwargs in TextFileReader with dialect parameters.

    Parameters
    ----------
    dialect : csv.Dialect
        Concrete csv dialect. See csv.Dialect documentation for more details.
    defaults : dict
        Keyword arguments passed to TextFileReader.

    Returns
    -------
    kwds : dict
        Updated keyword arguments, merged with dialect parameters.
    """
    kwds = defaults.copy()

    for param in MANDATORY_DIALECT_ATTRS:
        dialect_val = getattr(dialect, param)

        parser_default = _parser_defaults[param]
        provided = kwds.get(param, parser_default)

        # Messages for conflicting values between the dialect
        # instance and the actual parameters provided.
        conflict_msgs = []

        # Don't warn if the default parameter was passed in,
        # even if it conflicts with the dialect (gh-23761).
        if provided != parser_default and provided != dialect_val:
            msg = (
                f"Conflicting values for '{param}': '{provided}' was "
                f"provided, but the dialect specifies '{dialect_val}'. "
                "Using the dialect-specified value."
            )

            # Annoying corner case for not warning about
            # conflicts between dialect and delimiter parameter.
            # Refer to the outer "_read_" function for more info.
            if not (param == "delimiter" and kwds.pop("sep_override", False)):
                conflict_msgs.append(msg)

        if conflict_msgs:
            warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2)
        kwds[param] = dialect_val
    return kwds
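
# Illustrative sketch of the conflict handling above (hypothetical dialect,
# comment only): a non-default delimiter that disagrees with the dialect
# loses, and a ParserWarning reports the override.
#
#     import csv
#
#     class SemicolonDialect(csv.excel):
#         delimiter = ";"
#
#     merged = _merge_with_dialect_properties(SemicolonDialect(), {"delimiter": "|"})
#     # ParserWarning: Conflicting values for 'delimiter': '|' was provided,
#     # but the dialect specifies ';'. Using the dialect-specified value.
#     merged["delimiter"]  # ";"
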
3959 """ 3960 if kwds.get("skipfooter"): 3961 if kwds.get("iterator") or kwds.get("chunksize"): 3962 raise ValueError("'skipfooter' not supported for iteration") 3963 if kwds.get("nrows"): 3964 raise ValueError("'skipfooter' not supported with 'nrows'") 3965