# Copyright (c) 2012, Lambda Foundry, Inc.
# See LICENSE for the license
from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC
from errno import ENOENT
import sys
import time
import warnings

from libc.stdlib cimport free
from libc.string cimport strcasecmp, strlen, strncpy

import cython
from cython import Py_ssize_t

from cpython.bytes cimport PyBytes_AsString
from cpython.exc cimport PyErr_Fetch, PyErr_Occurred
from cpython.object cimport PyObject
from cpython.ref cimport Py_XDECREF
from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_Decode


cdef extern from "Python.h":
    object PyUnicode_FromString(char *v)


import numpy as np

cimport numpy as cnp
from numpy cimport float64_t, int64_t, ndarray, uint8_t, uint64_t

cnp.import_array()

from pandas._libs cimport util
from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX

import pandas._libs.lib as lib

from pandas._libs.khash cimport (
    kh_destroy_float64,
    kh_destroy_str,
    kh_destroy_str_starts,
    kh_destroy_strbox,
    kh_exist_str,
    kh_float64_t,
    kh_get_float64,
    kh_get_str,
    kh_get_str_starts_item,
    kh_get_strbox,
    kh_init_float64,
    kh_init_str,
    kh_init_str_starts,
    kh_init_strbox,
    kh_put_float64,
    kh_put_str,
    kh_put_str_starts_item,
    kh_put_strbox,
    kh_resize_float64,
    kh_resize_str_starts,
    kh_str_starts_t,
    kh_str_t,
    kh_strbox_t,
    khiter_t,
)

from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_categorical_dtype,
    is_datetime64_dtype,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer_dtype,
    is_object_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.concat import union_categoricals

cdef:
    float64_t INF = <float64_t>np.inf
    float64_t NEGINF = -INF
    int64_t DEFAULT_CHUNKSIZE = 256 * 1024


cdef extern from "headers/portable.h":
    # I *think* this is here so that strcasecmp is defined on Windows
    # so we don't get
    # `parsers.obj : error LNK2001: unresolved external symbol strcasecmp`
    # in Appveyor.
    # In a sane world, the `from libc.string cimport` above would fail
    # loudly.
    pass


cdef extern from "parser/tokenizer.h":

    ctypedef enum ParserState:
        START_RECORD
        START_FIELD
        ESCAPED_CHAR
        IN_FIELD
        IN_QUOTED_FIELD
        ESCAPE_IN_QUOTED_FIELD
        QUOTE_IN_QUOTED_FIELD
        EAT_CRNL
        EAT_CRNL_NOP
        EAT_WHITESPACE
        EAT_COMMENT
        EAT_LINE_COMMENT
        WHITESPACE_LINE
        SKIP_LINE
        FINISHED

    enum: ERROR_OVERFLOW

    ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                                  int *status)
    ctypedef int (*io_cleanup)(void *src)

    ctypedef struct parser_t:
        void *source
        io_callback cb_io
        io_cleanup cb_cleanup

        int64_t chunksize       # Number of bytes to prepare for each chunk
        char *data              # pointer to data to be processed
        int64_t datalen         # amount of data available
        int64_t datapos

        # where to write out tokenized data
        char *stream
        uint64_t stream_len
        uint64_t stream_cap

        # Store words in (potentially ragged) matrix for now, hmm
        char **words
        int64_t *word_starts    # where we are in the stream
        uint64_t words_len
        uint64_t words_cap
        uint64_t max_words_cap  # maximum word cap encountered

        char *pword_start       # pointer to stream start of current field
        int64_t word_start      # position start of current field

        int64_t *line_start     # position in words for start of line
        int64_t *line_fields    # Number of fields in each line
        uint64_t lines          # Number of lines observed
        uint64_t file_lines     # Number of lines observed (with bad/skipped)
        uint64_t lines_cap      # Vector capacity

        # Tokenizing stuff
        ParserState state
        int doublequote         # is " represented by ""?
        char delimiter          # field separator
        int delim_whitespace    # consume tabs / spaces instead
        char quotechar          # quote character
        char escapechar         # escape character
        char lineterminator
        int skipinitialspace    # ignore spaces following delimiter?
        int quoting             # style of quoting to write

        char commentchar
        int allow_embedded_newline
        int strict              # raise exception on bad CSV

        int usecols

        int expected_fields
        int error_bad_lines
        int warn_bad_lines

        # floating point options
        char decimal
        char sci

        # thousands separator (comma, period)
        char thousands

        int header              # Boolean: 1: has header, 0: no header
        int64_t header_start    # header row start
        uint64_t header_end     # header row end

        void *skipset
        PyObject *skipfunc
        int64_t skip_first_N_rows
        int64_t skipfooter
        # pick one, depending on whether the converter requires GIL
        float64_t (*double_converter)(const char *, char **,
                                      char, char, char,
                                      int, int *, int *) nogil

        # error handling
        char *warn_msg
        char *error_msg

        int64_t skip_empty_lines

    ctypedef struct coliter_t:
        char **words
        int64_t *line_start
        int64_t col

    ctypedef struct uint_state:
        int seen_sint
        int seen_uint
        int seen_null

    void uint_state_init(uint_state *self)
    int uint64_conflict(uint_state *self)

    void coliter_setup(coliter_t *it, parser_t *parser,
                       int64_t i, int64_t start) nogil
    void COLITER_NEXT(coliter_t, const char *) nogil

    parser_t* parser_new()

    int parser_init(parser_t *self) nogil
    void parser_free(parser_t *self) nogil
    void parser_del(parser_t *self) nogil
    int parser_add_skiprow(parser_t *self, int64_t row)

    int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)

    void parser_set_default_options(parser_t *self)

    int parser_consume_rows(parser_t *self, size_t nrows)

    int parser_trim_buffers(parser_t *self)

    int tokenize_all_rows(parser_t *self) nogil
    int tokenize_nrows(parser_t *self, size_t nrows) nogil

    int64_t str_to_int64(char *p_item, int64_t int_min,
                         int64_t int_max, int *error, char tsep) nogil
    uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
                           uint64_t uint_max, int *error, char tsep) nogil

    float64_t xstrtod(const char *p, char **q, char decimal,
                      char sci, char tsep, int skip_trailing,
                      int *error, int *maybe_int) nogil
    float64_t precise_xstrtod(const char *p, char **q, char decimal,
                              char sci, char tsep, int skip_trailing,
                              int *error, int *maybe_int) nogil
    float64_t round_trip(const char *p, char **q, char decimal,
                         char sci, char tsep, int skip_trailing,
                         int *error, int *maybe_int) nogil

    int to_boolean(const char *item, uint8_t *val) nogil
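
# Note on the I/O contract above (summary, not authoritative): an io_callback
# is expected to return a buffer of at most `nbytes` bytes read from `src`,
# storing the number of bytes actually read in `*bytes_read` and a status
# code in `*status`; the matching io_cleanup releases the source. TextReader
# wires these to buffer_rd_bytes/del_rd_source for Python file-like objects
# in _setup_parser_source below.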


cdef extern from "parser/io.h":
    void *new_mmap(char *fname)
    int del_mmap(void *src)
    void* buffer_mmap_bytes(void *source, size_t nbytes,
                            size_t *bytes_read, int *status)

    void *new_file_source(char *fname, size_t buffer_size) except NULL

    void *new_rd_source(object obj) except NULL

    int del_file_source(void *src)
    int del_rd_source(void *src)

    void* buffer_file_bytes(void *source, size_t nbytes,
                            size_t *bytes_read, int *status)

    void* buffer_rd_bytes(void *source, size_t nbytes,
                          size_t *bytes_read, int *status)


cdef class TextReader:
    """
    source : StringIO or file object

    .. versionchanged:: 1.2.0
        The 'compression', 'memory_map', and 'encoding' arguments were
        removed; they are now handled by CParserWrapper, and 'source'
        has to be a file handle.
    """

    cdef:
        parser_t *parser
        object na_fvalues
        object true_values, false_values
        object handle
        bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
        uint64_t parser_start
        list clocks
        char *c_encoding
        kh_str_starts_t *false_set
        kh_str_starts_t *true_set

    cdef public:
        int64_t leading_cols, table_width, skipfooter, buffer_lines
        bint allow_leading_cols, mangle_dupe_cols, low_memory
        bint delim_whitespace
        object delimiter, converters
        object na_values
        object header, orig_header, names, header_start, header_end
        object index_col
        object skiprows
        object dtype
        object usecols
        list dtype_cast_order
        set unnamed_cols
        set noconvert

    def __cinit__(self, source,
                  delimiter=b',',
                  header=0,
                  header_start=0,
                  header_end=0,
                  index_col=None,
                  names=None,
                  tokenize_chunksize=DEFAULT_CHUNKSIZE,
                  bint delim_whitespace=False,
                  converters=None,
                  bint skipinitialspace=False,
                  escapechar=None,
                  bint doublequote=True,
                  quotechar=b'"',
                  quoting=0,
                  lineterminator=None,
                  comment=None,
                  decimal=b'.',
                  thousands=None,
                  dtype=None,
                  usecols=None,
                  bint error_bad_lines=True,
                  bint warn_bad_lines=True,
                  bint na_filter=True,
                  na_values=None,
                  na_fvalues=None,
                  bint keep_default_na=True,
                  true_values=None,
                  false_values=None,
                  bint allow_leading_cols=True,
                  bint low_memory=False,
                  skiprows=None,
                  skipfooter=0,
                  bint verbose=False,
                  bint mangle_dupe_cols=True,
                  float_precision=None,
                  bint skip_blank_lines=True):

        # set encoding for native Python and C library
        self.c_encoding = NULL

        self.parser = parser_new()
        self.parser.chunksize = tokenize_chunksize

        self.mangle_dupe_cols = mangle_dupe_cols

        # For timekeeping
        self.clocks = []

        self.parser.usecols = (usecols is not None)

        self._setup_parser_source(source)
        parser_set_default_options(self.parser)

        parser_init(self.parser)

        if delim_whitespace:
            self.parser.delim_whitespace = delim_whitespace
        else:
            if len(delimiter) > 1:
                raise ValueError('only length-1 separators supported')
            self.parser.delimiter = ord(delimiter)

        # ----------------------------------------
        # parser options

        self.parser.doublequote = doublequote
        self.parser.skipinitialspace = skipinitialspace
        self.parser.skip_empty_lines = skip_blank_lines

        if lineterminator is not None:
            if len(lineterminator) != 1:
                raise ValueError('Only length-1 line terminators supported')
            self.parser.lineterminator = ord(lineterminator)

        if len(decimal) != 1:
            raise ValueError('Only length-1 decimal markers supported')
        self.parser.decimal = ord(decimal)

        if thousands is not None:
            if len(thousands) != 1:
                raise ValueError('Only length-1 thousands markers supported')
            self.parser.thousands = ord(thousands)

        if escapechar is not None:
            if len(escapechar) != 1:
                raise ValueError('Only length-1 escapes supported')
            self.parser.escapechar = ord(escapechar)

        self._set_quoting(quotechar, quoting)

        dtype_order = ['int64', 'float64', 'bool', 'object']
        if quoting == QUOTE_NONNUMERIC:
            # consistent with csv module semantics, cast all to float
            dtype_order = dtype_order[1:]
        self.dtype_cast_order = [np.dtype(x) for x in dtype_order]

        if comment is not None:
            if len(comment) > 1:
                raise ValueError('Only length-1 comment characters supported')
            self.parser.commentchar = ord(comment)

        # error handling of bad lines
        self.parser.error_bad_lines = int(error_bad_lines)
        self.parser.warn_bad_lines = int(warn_bad_lines)

        self.skiprows = skiprows
        if skiprows is not None:
            self._make_skiprow_set()

        self.skipfooter = skipfooter

        # suboptimal
        if usecols is not None:
            self.has_usecols = 1
            # GH-20558, validate usecols at higher level and only pass clean
            # usecols into TextReader.
            self.usecols = usecols

        # XXX
        if skipfooter > 0:
            self.parser.error_bad_lines = 0
            self.parser.warn_bad_lines = 0

        self.delimiter = delimiter
        self.delim_whitespace = delim_whitespace

        self.na_values = na_values
        if na_fvalues is None:
            na_fvalues = set()
        self.na_fvalues = na_fvalues

        self.true_values = _maybe_encode(true_values) + _true_values
        self.false_values = _maybe_encode(false_values) + _false_values

        self.true_set = kset_from_list(self.true_values)
        self.false_set = kset_from_list(self.false_values)

        self.keep_default_na = keep_default_na
        self.converters = converters
        self.na_filter = na_filter

        self.verbose = verbose
        self.low_memory = low_memory

        if float_precision == "round_trip":
            # see gh-15140
            self.parser.double_converter = round_trip
        elif float_precision == "legacy":
            self.parser.double_converter = xstrtod
        elif float_precision == "high" or float_precision is None:
            self.parser.double_converter = precise_xstrtod
        else:
            raise ValueError(f'Unrecognized float_precision option: '
                             f'{float_precision}')

        if isinstance(dtype, dict):
            dtype = {k: pandas_dtype(dtype[k])
                     for k in dtype}
        elif dtype is not None:
            dtype = pandas_dtype(dtype)

        self.dtype = dtype

        # XXX
        self.noconvert = set()

        self.index_col = index_col

        # ----------------------------------------
        # header stuff

        self.allow_leading_cols = allow_leading_cols
        self.leading_cols = 0

        # TODO: no header vs. header is not the first row
        self.has_mi_columns = 0
        self.orig_header = header
        if header is None:
            # sentinel value
            self.parser.header_start = -1
            self.parser.header_end = -1
            self.parser.header = -1
            self.parser_start = 0
            self.header = []
        else:
            if isinstance(header, list):
                if len(header) > 1:
                    # need to artificially skip the final line
                    # which is still a header line
                    header = list(header)
                    header.append(header[-1] + 1)
                    self.parser.header_end = header[-1]
                    self.has_mi_columns = 1
                else:
                    self.parser.header_end = header[0]

                self.parser_start = header[-1] + 1
                self.parser.header_start = header[0]
                self.parser.header = header[0]
                self.header = header
            else:
                self.parser.header_start = header
                self.parser.header_end = header
                self.parser_start = header + 1
                self.parser.header = header
                self.header = [ header ]

        self.names = names
        self.header, self.table_width, self.unnamed_cols = self._get_header()

        if not self.table_width:
            raise EmptyDataError("No columns to parse from file")

        # Compute buffer_lines as function of table width.
        heuristic = 2**20 // self.table_width
        self.buffer_lines = 1
        while self.buffer_lines * 2 < heuristic:
            self.buffer_lines *= 2
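
    # Worked example of the heuristic above (illustrative only): with
    # table_width=8, heuristic = 2**20 // 8 = 131072 and buffer_lines
    # doubles 1 -> 2 -> ... -> 65536 (doubling stops once buffer_lines * 2
    # reaches 131072), so each low-memory chunk reads 65536 rows at a time.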

    def __init__(self, *args, **kwargs):
        pass

    def __dealloc__(self):
        parser_free(self.parser)
        if self.true_set:
            kh_destroy_str_starts(self.true_set)
            self.true_set = NULL
        if self.false_set:
            kh_destroy_str_starts(self.false_set)
            self.false_set = NULL
        parser_del(self.parser)

    def close(self):
        # also preemptively free all allocated memory
        parser_free(self.parser)
        if self.true_set:
            kh_destroy_str_starts(self.true_set)
            self.true_set = NULL
        if self.false_set:
            kh_destroy_str_starts(self.false_set)
            self.false_set = NULL

    def set_error_bad_lines(self, int status):
        self.parser.error_bad_lines = status

    def _set_quoting(self, quote_char, quoting):
        if not isinstance(quoting, int):
            raise TypeError('"quoting" must be an integer')

        if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
            raise TypeError('bad "quoting" value')

        if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
            dtype = type(quote_char).__name__
            raise TypeError(f'"quotechar" must be string, not {dtype}')

        if quote_char is None or quote_char == '':
            if quoting != QUOTE_NONE:
                raise TypeError("quotechar must be set if quoting enabled")
            self.parser.quoting = quoting
            self.parser.quotechar = -1
        elif len(quote_char) > 1:  # 0-len case handled earlier
            raise TypeError('"quotechar" must be a 1-character string')
        else:
            self.parser.quoting = quoting
            self.parser.quotechar = ord(quote_char)

    cdef _make_skiprow_set(self):
        if util.is_integer_object(self.skiprows):
            parser_set_skipfirstnrows(self.parser, self.skiprows)
        elif not callable(self.skiprows):
            for i in self.skiprows:
                parser_add_skiprow(self.parser, i)
        else:
            self.parser.skipfunc = <PyObject *>self.skiprows

    cdef _setup_parser_source(self, source):
        cdef:
            void *ptr

        if not hasattr(source, "read"):
            raise IOError(f'Expected file path name or file-like object, '
                          f'got {type(source)} type')

        ptr = new_rd_source(source)
        self.parser.source = ptr
        self.parser.cb_io = &buffer_rd_bytes
        self.parser.cb_cleanup = &del_rd_source
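
    # Illustration of the header handling implemented below (not part of the
    # parser itself): with mangle_dupe_cols=True, a header row "a,a,b" is
    # read as ['a', 'a.1', 'b'], and empty header cells become
    # 'Unnamed: <col>' (or 'Unnamed: <col>_level_<level>' for a MultiIndex
    # header) and are recorded in unnamed_cols.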

    cdef _get_header(self):
        # header is now a list of lists, so field_count should use header[0]

        cdef:
            Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
            char *word
            object name, old_name
            uint64_t hr, data_line = 0
            char *errors = "strict"
            StringPath path = _string_path(self.c_encoding)
            list header = []
            set unnamed_cols = set()

        if self.parser.header_start >= 0:

            # Header is in the file
            for level, hr in enumerate(self.header):

                this_header = []

                if self.parser.lines < hr + 1:
                    self._tokenize_rows(hr + 2)

                if self.parser.lines == 0:
                    field_count = 0
                    start = self.parser.line_start[0]

                # e.g., if header=3 and file only has 2 lines
                elif (self.parser.lines < hr + 1
                        and not isinstance(self.orig_header, list)) or (
                        self.parser.lines < hr):
                    msg = self.orig_header
                    if isinstance(msg, list):
                        joined = ','.join(str(m) for m in msg)
                        msg = f"[{joined}], len of {len(msg)},"
                    raise ParserError(
                        f'Passed header={msg} but only '
                        f'{self.parser.lines} lines in file')

                else:
                    field_count = self.parser.line_fields[hr]
                    start = self.parser.line_start[hr]

                counts = {}
                unnamed_count = 0

                for i in range(field_count):
                    word = self.parser.words[start + i]

                    if path == UTF8:
                        name = PyUnicode_FromString(word)
                    elif path == ENCODED:
                        name = PyUnicode_Decode(word, strlen(word),
                                                self.c_encoding, errors)

                    # We use this later when collecting placeholder names.
                    old_name = name

                    if name == '':
                        if self.has_mi_columns:
                            name = f'Unnamed: {i}_level_{level}'
                        else:
                            name = f'Unnamed: {i}'
                        unnamed_count += 1

                    count = counts.get(name, 0)

                    if not self.has_mi_columns and self.mangle_dupe_cols:
                        while count > 0:
                            counts[name] = count + 1
                            name = f'{name}.{count}'
                            count = counts.get(name, 0)

                    if old_name == '':
                        unnamed_cols.add(name)

                    this_header.append(name)
                    counts[name] = count + 1

                if self.has_mi_columns:

                    # If we have grabbed an extra line, but it's not in our
                    # format, save in the buffer, and create a blank extra
                    # line for the rest of the parsing code.
                    if hr == self.header[-1]:
                        lc = len(this_header)
                        ic = (len(self.index_col) if self.index_col
                              is not None else 0)

                        if lc != unnamed_count and lc - ic > unnamed_count:
                            hr -= 1
                            self.parser_start -= 1
                            this_header = [None] * lc

                data_line = hr + 1
                header.append(this_header)

            if self.names is not None:
                header = [ self.names ]

        elif self.names is not None:
            # Enforce this unless usecols
            if not self.has_usecols:
                self.parser.expected_fields = len(self.names)

            # Names passed
            if self.parser.lines < 1:
                self._tokenize_rows(1)

            header = [ self.names ]

            if self.parser.lines < 1:
                field_count = len(header[0])
            else:
                field_count = self.parser.line_fields[data_line]
        else:
            # No header passed nor to be found in the file
            if self.parser.lines < 1:
                self._tokenize_rows(1)

            return None, self.parser.line_fields[0], unnamed_cols

        # Corner case, not enough lines in the file
        if self.parser.lines < data_line + 1:
            field_count = len(header[0])
        else:  # not self.has_usecols:
            field_count = self.parser.line_fields[data_line]

            # #2981
            if self.names is not None:
                field_count = max(field_count, len(self.names))

        passed_count = len(header[0])

        if (self.has_usecols and self.allow_leading_cols and
                not callable(self.usecols)):
            nuse = len(self.usecols)
            if nuse == passed_count:
                self.leading_cols = 0
            elif self.names is None and nuse < passed_count:
                self.leading_cols = field_count - passed_count
            elif passed_count != field_count:
                raise ValueError('Passed header names '
                                 'mismatches usecols')
        # oh boy, #2442, #2981
        elif self.allow_leading_cols and passed_count < field_count:
            self.leading_cols = field_count - passed_count

        return header, field_count, unnamed_cols

    def read(self, rows=None):
        """
        rows=None --> read all rows
        """
        if self.low_memory:
            # Conserve intermediate space
            columns = self._read_low_memory(rows)
        else:
            # Don't care about memory usage
            columns = self._read_rows(rows, 1)

        return columns
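
    # Minimal usage sketch (illustrative; `buf` is any object with a ``read``
    # method, which is what _setup_parser_source requires):
    #
    #   reader = TextReader(buf, delimiter=b',')
    #   columns = reader.read()       # dict: column index -> ndarray
    #   chunk = reader.read(10000)    # or pull rows incrementally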

    cdef _read_low_memory(self, rows):
        cdef:
            size_t rows_read = 0
            list chunks = []

        if rows is None:
            while True:
                try:
                    chunk = self._read_rows(self.buffer_lines, 0)
                    if len(chunk) == 0:
                        break
                except StopIteration:
                    break
                else:
                    chunks.append(chunk)
        else:
            while rows_read < rows:
                try:
                    crows = min(self.buffer_lines, rows - rows_read)

                    chunk = self._read_rows(crows, 0)
                    if len(chunk) == 0:
                        break

                    rows_read += len(list(chunk.values())[0])
                except StopIteration:
                    break
                else:
                    chunks.append(chunk)

        parser_trim_buffers(self.parser)

        if len(chunks) == 0:
            raise StopIteration

        # destructive to chunks
        return _concatenate_chunks(chunks)

    cdef _tokenize_rows(self, size_t nrows):
        cdef:
            int status

        with nogil:
            status = tokenize_nrows(self.parser, nrows)

        if self.parser.warn_msg != NULL:
            print(self.parser.warn_msg, file=sys.stderr)
            free(self.parser.warn_msg)
            self.parser.warn_msg = NULL

        if status < 0:
            raise_parser_error('Error tokenizing data', self.parser)

    cdef _read_rows(self, rows, bint trim):
        cdef:
            int64_t buffered_lines
            int64_t irows, footer = 0

        self._start_clock()

        if rows is not None:
            irows = rows
            buffered_lines = self.parser.lines - self.parser_start
            if buffered_lines < irows:
                self._tokenize_rows(irows - buffered_lines)

            if self.skipfooter > 0:
                raise ValueError('skipfooter can only be used to read '
                                 'the whole file')
        else:
            with nogil:
                status = tokenize_all_rows(self.parser)

            if self.parser.warn_msg != NULL:
                print(self.parser.warn_msg, file=sys.stderr)
                free(self.parser.warn_msg)
                self.parser.warn_msg = NULL

            if status < 0:
                raise_parser_error('Error tokenizing data', self.parser)
            footer = self.skipfooter

        if self.parser_start >= self.parser.lines:
            raise StopIteration
        self._end_clock('Tokenization')

        self._start_clock()
        columns = self._convert_column_data(rows=rows,
                                            footer=footer,
                                            upcast_na=True)
        self._end_clock('Type conversion')
        self._start_clock()
        if len(columns) > 0:
            rows_read = len(list(columns.values())[0])
            # trim
            parser_consume_rows(self.parser, rows_read)
            if trim:
                parser_trim_buffers(self.parser)
            self.parser_start -= rows_read

        self._end_clock('Parser memory cleanup')

        return columns

    cdef _start_clock(self):
        self.clocks.append(time.time())

    cdef _end_clock(self, what):
        if self.verbose:
            elapsed = time.time() - self.clocks.pop(-1)
            print(f'{what} took: {elapsed * 1000:.2f} ms')

    def set_noconvert(self, i):
        self.noconvert.add(i)

    def remove_noconvert(self, i):
        self.noconvert.remove(i)
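
    # Conversion pipeline overview (summary of the methods that follow, not
    # authoritative): _convert_column_data walks each tokenized column and
    # calls _convert_tokens, which either honors a user-specified dtype via
    # _convert_with_dtype or tries each entry of dtype_cast_order in turn
    # until one succeeds.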

    def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
        cdef:
            int64_t i
            int nused
            kh_str_starts_t *na_hashset = NULL
            int64_t start, end
            object name, na_flist, col_dtype = None
            bint na_filter = 0
            int64_t num_cols

        start = self.parser_start

        if rows is None:
            end = self.parser.lines
        else:
            end = min(start + rows, self.parser.lines)

        # FIXME: dont leave commented-out
        # # skip footer
        # if footer > 0:
        #     end -= footer

        num_cols = -1
        # Py_ssize_t cast prevents build warning
        for i in range(<Py_ssize_t>self.parser.lines):
            num_cols = (num_cols < self.parser.line_fields[i]) * \
                self.parser.line_fields[i] + \
                (num_cols >= self.parser.line_fields[i]) * num_cols

        if self.table_width - self.leading_cols > num_cols:
            raise ParserError(f"Too many columns specified: expected "
                              f"{self.table_width - self.leading_cols} "
                              f"and found {num_cols}")

        results = {}
        nused = 0
        for i in range(self.table_width):
            if i < self.leading_cols:
                # Pass through leading columns always
                name = i
            elif (self.usecols and not callable(self.usecols) and
                    nused == len(self.usecols)):
                # Once we've gathered all requested columns, stop. GH5766
                break
            else:
                name = self._get_column_name(i, nused)
                usecols = set()
                if callable(self.usecols):
                    if self.usecols(name):
                        usecols = {i}
                else:
                    usecols = self.usecols
                if self.has_usecols and not (i in usecols or
                                             name in usecols):
                    continue
                nused += 1

            conv = self._get_converter(i, name)

            col_dtype = None
            if self.dtype is not None:
                if isinstance(self.dtype, dict):
                    if name in self.dtype:
                        col_dtype = self.dtype[name]
                    elif i in self.dtype:
                        col_dtype = self.dtype[i]
                else:
                    if self.dtype.names:
                        # structured array
                        col_dtype = np.dtype(self.dtype.descr[i][1])
                    else:
                        col_dtype = self.dtype

            if conv:
                if col_dtype is not None:
                    warnings.warn((f"Both a converter and dtype were specified "
                                   f"for column {name} - only the converter will "
                                   f"be used"), ParserWarning,
                                  stacklevel=5)
                results[i] = _apply_converter(conv, self.parser, i, start, end,
                                              self.c_encoding)
                continue

            # Collect the list of NaN values associated with the column.
            # If we aren't supposed to do that, or none are collected,
            # we set `na_filter` to `0` (`1` otherwise).
            na_flist = set()

            if self.na_filter:
                na_list, na_flist = self._get_na_list(i, name)
                if na_list is None:
                    na_filter = 0
                else:
                    na_filter = 1
                    na_hashset = kset_from_list(na_list)
            else:
                na_filter = 0

            # Attempt to parse tokens and infer dtype of the column.
            # Should return as the desired dtype (inferred or specified).
            try:
                col_res, na_count = self._convert_tokens(
                    i, start, end, name, na_filter, na_hashset,
                    na_flist, col_dtype)
            finally:
                # gh-21353
                #
                # Cleanup the NaN hash that we generated
                # to avoid memory leaks.
                if na_filter:
                    self._free_na_set(na_hashset)

            # don't try to upcast EAs
            try_upcast = upcast_na and na_count > 0
            if try_upcast and not is_extension_array_dtype(col_dtype):
                col_res = _maybe_upcast(col_res)

            if col_res is None:
                raise ParserError(f'Unable to parse column {i}')

            results[i] = col_res

        self.parser_start += end - start

        return results
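
    # Inference example (illustrative): with the default dtype_cast_order,
    # a column of tokens b"1", b"2", b"x" is tried as int64 (fails on b"x"),
    # then float64 (fails), then bool (fails), and finally lands on
    # object dtype.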

    cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end,
                                object name, bint na_filter,
                                kh_str_starts_t *na_hashset,
                                object na_flist, object col_dtype):

        if col_dtype is not None:
            col_res, na_count = self._convert_with_dtype(
                col_dtype, i, start, end, na_filter,
                1, na_hashset, na_flist)

            # Fallback on the parse (e.g. we requested int dtype,
            # but it's actually a float).
            if col_res is not None:
                return col_res, na_count

        if i in self.noconvert:
            return self._string_convert(i, start, end, na_filter, na_hashset)
        else:
            col_res = None
            for dt in self.dtype_cast_order:
                try:
                    col_res, na_count = self._convert_with_dtype(
                        dt, i, start, end, na_filter, 0, na_hashset, na_flist)
                except ValueError:
                    # This error is raised from trying to convert to uint64,
                    # and we discover that we cannot convert to any numerical
                    # dtype successfully. As a result, we leave the data
                    # column AS IS with object dtype.
                    col_res, na_count = self._convert_with_dtype(
                        np.dtype('object'), i, start, end, 0,
                        0, na_hashset, na_flist)
                except OverflowError:
                    col_res, na_count = self._convert_with_dtype(
                        np.dtype('object'), i, start, end, na_filter,
                        0, na_hashset, na_flist)

                if col_res is not None:
                    break

        # we had a fallback parse on the dtype, so now try to cast
        # only allow safe casts, e.g. with a nan you cannot safely cast to int
        if col_res is not None and col_dtype is not None:
            try:
                col_res = col_res.astype(col_dtype, casting='safe')
            except TypeError:

                # float -> int conversions can fail the above
                # even with no nans
                col_res_orig = col_res
                col_res = col_res.astype(col_dtype)
                if (col_res != col_res_orig).any():
                    raise ValueError(
                        f"cannot safely convert passed user dtype of "
                        f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in "
                        f"column {i}")

        return col_res, na_count
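
    # Safe-cast example (illustrative): parsing "1.0,2.0" with dtype=int64
    # first falls back to a float64 parse; astype(..., casting='safe') then
    # raises TypeError, and the element-wise comparison above accepts the
    # cast only because 1.0 and 2.0 round-trip to 1 and 2 exactly.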

    cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
                             int64_t start, int64_t end,
                             bint na_filter,
                             bint user_dtype,
                             kh_str_starts_t *na_hashset,
                             object na_flist):
        if is_categorical_dtype(dtype):
            # TODO: I suspect that _categorical_convert could be
            #  optimized when dtype is an instance of CategoricalDtype
            codes, cats, na_count = _categorical_convert(
                self.parser, i, start, end, na_filter,
                na_hashset, self.c_encoding)

            # Method accepts list of strings, not encoded ones.
            true_values = [x.decode() for x in self.true_values]
            array_type = dtype.construct_array_type()
            cat = array_type._from_inferred_categories(
                cats, codes, dtype, true_values=true_values)
            return cat, na_count

        elif is_extension_array_dtype(dtype):
            result, na_count = self._string_convert(i, start, end, na_filter,
                                                    na_hashset)
            array_type = dtype.construct_array_type()
            try:
                # use _from_sequence_of_strings if the class defines it
                result = array_type._from_sequence_of_strings(result,
                                                              dtype=dtype)
            except NotImplementedError:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    f"_from_sequence_of_strings in order "
                    f"to be used in parser methods")

            return result, na_count

        elif is_integer_dtype(dtype):
            try:
                result, na_count = _try_int64(self.parser, i, start,
                                              end, na_filter, na_hashset)
                if user_dtype and na_count is not None:
                    if na_count > 0:
                        raise ValueError(f"Integer column has NA values in column {i}")
            except OverflowError:
                result = _try_uint64(self.parser, i, start, end,
                                     na_filter, na_hashset)
                na_count = 0

            if result is not None and dtype != 'int64':
                result = result.astype(dtype)

            return result, na_count

        elif is_float_dtype(dtype):
            result, na_count = _try_double(self.parser, i, start, end,
                                           na_filter, na_hashset, na_flist)

            if result is not None and dtype != 'float64':
                result = result.astype(dtype)
            return result, na_count
        elif is_bool_dtype(dtype):
            result, na_count = _try_bool_flex(self.parser, i, start, end,
                                              na_filter, na_hashset,
                                              self.true_set, self.false_set)
            if user_dtype and na_count is not None:
                if na_count > 0:
                    raise ValueError(f"Bool column has NA values in column {i}")
            return result, na_count

        elif dtype.kind == 'S':
            # TODO: na handling
            width = dtype.itemsize
            if width > 0:
                result = _to_fw_string(self.parser, i, start, end, width)
                return result, 0

            # treat as a regular string parsing
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        elif dtype.kind == 'U':
            width = dtype.itemsize
            if width > 0:
                raise TypeError(f"the dtype {dtype} is not supported for parsing")

            # unicode variable width
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        elif is_object_dtype(dtype):
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        elif is_datetime64_dtype(dtype):
            raise TypeError(f"the dtype {dtype} is not supported "
                            f"for parsing, pass this column "
                            f"using parse_dates instead")
        else:
            raise TypeError(f"the dtype {dtype} is not supported for parsing")
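
    # Extension-dtype example (illustrative): parsing a column with
    # dtype="Int64" takes the is_extension_array_dtype branch above -- the
    # tokens are boxed as strings first, then handed to
    # IntegerArray._from_sequence_of_strings to build the masked array.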

    cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
                         bint na_filter, kh_str_starts_t *na_hashset):

        cdef StringPath path = _string_path(self.c_encoding)

        if path == UTF8:
            return _string_box_utf8(self.parser, i, start, end, na_filter,
                                    na_hashset)
        elif path == ENCODED:
            return _string_box_decode(self.parser, i, start, end,
                                      na_filter, na_hashset, self.c_encoding)

    def _get_converter(self, i, name):
        if self.converters is None:
            return None

        if name is not None and name in self.converters:
            return self.converters[name]

        # Converter for position, if any
        return self.converters.get(i)

    cdef _get_na_list(self, i, name):
        if self.na_values is None:
            return None, set()

        if isinstance(self.na_values, dict):
            key = None
            values = None

            if name is not None and name in self.na_values:
                key = name
            elif i in self.na_values:
                key = i
            else:  # No na_values provided for this column.
                if self.keep_default_na:
                    return _NA_VALUES, set()

                return list(), set()

            values = self.na_values[key]
            if values is not None and not isinstance(values, list):
                values = list(values)

            fvalues = self.na_fvalues[key]
            if fvalues is not None and not isinstance(fvalues, set):
                fvalues = set(fvalues)

            return _ensure_encoded(values), fvalues
        else:
            if not isinstance(self.na_values, list):
                self.na_values = list(self.na_values)
            if not isinstance(self.na_fvalues, set):
                self.na_fvalues = set(self.na_fvalues)

            return _ensure_encoded(self.na_values), self.na_fvalues

    cdef _free_na_set(self, kh_str_starts_t *table):
        kh_destroy_str_starts(table)

    cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
        cdef int64_t j
        if self.has_usecols and self.names is not None:
            if (not callable(self.usecols) and
                    len(self.names) == len(self.usecols)):
                return self.names[nused]
            else:
                return self.names[i - self.leading_cols]
        else:
            if self.header is not None:
                j = i - self.leading_cols
                # generate extra (bogus) headers if there are more columns than headers
                if j >= len(self.header[0]):
                    return j
                else:
                    return self.header[0][j]
            else:
                return None


cdef:
    object _true_values = [b'True', b'TRUE', b'true']
    object _false_values = [b'False', b'FALSE', b'false']


def _ensure_encoded(list lst):
    cdef:
        list result = []
    for x in lst:
        if isinstance(x, str):
            x = PyUnicode_AsUTF8String(x)
        elif not isinstance(x, bytes):
            x = str(x).encode('utf-8')

        result.append(x)
    return result


# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
STR_NA_VALUES = {
    "-1.#IND",
    "1.#QNAN",
    "1.#IND",
    "-1.#QNAN",
    "#N/A N/A",
    "#N/A",
    "N/A",
    "n/a",
    "NA",
    "<NA>",
    "#NA",
    "NULL",
    "null",
    "NaN",
    "-NaN",
    "nan",
    "-nan",
    "",
}
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))


def _maybe_upcast(arr):
    """
    Upcast integer and boolean arrays holding NA sentinel values so the
    sentinels can be replaced by np.nan.
    """
    if issubclass(arr.dtype.type, np.integer):
        na_value = na_values[arr.dtype]
        arr = arr.astype(float)
        np.putmask(arr, arr == na_value, np.nan)
    elif arr.dtype == np.bool_:
        mask = arr.view(np.uint8) == na_values[np.uint8]
        arr = arr.astype(object)
        np.putmask(arr, mask, np.nan)

    return arr
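
# Behavior sketch for _maybe_upcast (illustrative): an int64 column whose NA
# cells were filled with the int64 sentinel (iinfo(int64).min, see
# _compute_na_values below) comes back as float64 with those cells set to
# np.nan; everything else is returned unchanged.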


cdef enum StringPath:
    UTF8
    ENCODED


# factored out logic to pick string converter
cdef inline StringPath _string_path(char *encoding):
    if encoding != NULL and encoding != b"utf-8":
        return ENCODED
    return UTF8


# ----------------------------------------------------------------------
# Type conversions / inference support code


cdef _string_box_utf8(parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end,
                      bint na_filter, kh_str_starts_t *na_hashset):
    cdef:
        int error, na_count = 0
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result

        int ret = 0
        kh_strbox_t *table

        object pyval

        object NA = na_values[np.object_]
        khiter_t k

    table = kh_init_strbox()
    lines = line_end - line_start
    result = np.empty(lines, dtype=np.object_)
    coliter_setup(&it, parser, col, line_start)

    for i in range(lines):
        COLITER_NEXT(it, word)

        if na_filter:
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count += 1
                result[i] = NA
                continue

        k = kh_get_strbox(table, word)

        # in the hash table
        if k != table.n_buckets:
            # this increments the refcount, but need to test
            pyval = <object>table.vals[k]
        else:
            # box it. new ref?
            pyval = PyUnicode_FromString(word)

            k = kh_put_strbox(table, word, &ret)
            table.vals[k] = <PyObject *>pyval

        result[i] = pyval

    kh_destroy_strbox(table)

    return result, na_count


cdef _string_box_decode(parser_t *parser, int64_t col,
                        int64_t line_start, int64_t line_end,
                        bint na_filter, kh_str_starts_t *na_hashset,
                        char *encoding):
    cdef:
        int na_count = 0
        Py_ssize_t i, size, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result

        int ret = 0
        kh_strbox_t *table

        char *errors = "strict"

        object pyval

        object NA = na_values[np.object_]
        khiter_t k

    table = kh_init_strbox()
    lines = line_end - line_start
    result = np.empty(lines, dtype=np.object_)
    coliter_setup(&it, parser, col, line_start)

    for i in range(lines):
        COLITER_NEXT(it, word)

        if na_filter:
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count += 1
                result[i] = NA
                continue

        k = kh_get_strbox(table, word)

        # in the hash table
        if k != table.n_buckets:
            # this increments the refcount, but need to test
            pyval = <object>table.vals[k]
        else:
            # box it. new ref?
            size = strlen(word)
            pyval = PyUnicode_Decode(word, size, encoding, errors)

            k = kh_put_strbox(table, word, &ret)
            table.vals[k] = <PyObject *>pyval

        result[i] = pyval

    kh_destroy_strbox(table)

    return result, na_count
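
# Interning note for the boxing helpers above (illustrative): the kh_strbox
# table caches one Python string per distinct token, so a million-row column
# containing only "yes"/"no" allocates two Python strings, with every result
# cell pointing at one of them.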


@cython.boundscheck(False)
cdef _categorical_convert(parser_t *parser, int64_t col,
                          int64_t line_start, int64_t line_end,
                          bint na_filter, kh_str_starts_t *na_hashset,
                          char *encoding):
    "Convert column data into codes, categories"
    cdef:
        int na_count = 0
        Py_ssize_t i, size, lines
        coliter_t it
        const char *word = NULL

        int64_t NA = -1
        int64_t[:] codes
        int64_t current_category = 0

        char *errors = "strict"
        StringPath path = _string_path(encoding)

        int ret = 0
        kh_str_t *table
        khiter_t k

    lines = line_end - line_start
    codes = np.empty(lines, dtype=np.int64)

    # factorize parsed values, creating a hash table
    # bytes -> category code
    with nogil:
        table = kh_init_str()
        coliter_setup(&it, parser, col, line_start)

        for i in range(lines):
            COLITER_NEXT(it, word)

            if na_filter:
                if kh_get_str_starts_item(na_hashset, word):
                    # is in NA values
                    na_count += 1
                    codes[i] = NA
                    continue

            k = kh_get_str(table, word)
            # not in the hash table
            if k == table.n_buckets:
                k = kh_put_str(table, word, &ret)
                table.vals[k] = current_category
                current_category += 1

            codes[i] = table.vals[k]

    # parse and box categories to python strings
    result = np.empty(table.n_occupied, dtype=np.object_)
    if path == ENCODED:
        for k in range(table.n_buckets):
            if kh_exist_str(table, k):
                size = strlen(table.keys[k])
                result[table.vals[k]] = PyUnicode_Decode(
                    table.keys[k], size, encoding, errors)
    elif path == UTF8:
        for k in range(table.n_buckets):
            if kh_exist_str(table, k):
                result[table.vals[k]] = PyUnicode_FromString(table.keys[k])

    kh_destroy_str(table)
    return np.asarray(codes), result, na_count


cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
                   int64_t line_end, int64_t width):
    cdef:
        char *data
        ndarray result

    result = np.empty(line_end - line_start, dtype=f'|S{width}')
    data = <char*>result.data

    with nogil:
        _to_fw_string_nogil(parser, col, line_start, line_end, width, data)

    return result


cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
                                     int64_t line_start, int64_t line_end,
                                     size_t width, char *data) nogil:
    cdef:
        int64_t i
        coliter_t it
        const char *word = NULL

    coliter_setup(&it, parser, col, line_start)

    for i in range(line_end - line_start):
        COLITER_NEXT(it, word)
        strncpy(data, word, width)
        data += width


cdef:
    char* cinf = b'inf'
    char* cposinf = b'+inf'
    char* cneginf = b'-inf'

    char* cinfty = b'Infinity'
    char* cposinfty = b'+Infinity'
    char* cneginfty = b'-Infinity'
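
# Infinity handling note (illustrative): when the configured double converter
# rejects a token, _try_double_nogil below retries it against the literals
# above with strcasecmp, so "INF", "+inf" and "-Infinity" still parse as
# +/-inf before the column is declared non-numeric.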


cdef _try_double(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        float64_t *data
        float64_t NA = na_values[np.float64]
        kh_float64_t *na_fset
        ndarray result
        bint use_na_flist = len(na_flist) > 0

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.float64)
    data = <float64_t *>result.data
    na_fset = kset_float64_from_list(na_flist)
    with nogil:
        error = _try_double_nogil(parser, parser.double_converter,
                                  col, line_start, line_end,
                                  na_filter, na_hashset, use_na_flist,
                                  na_fset, NA, data, &na_count)

    kh_destroy_float64(na_fset)
    if error != 0:
        return None, None
    return result, na_count


cdef inline int _try_double_nogil(parser_t *parser,
                                  float64_t (*double_converter)(
                                      const char *, char **, char,
                                      char, char, int, int *, int *) nogil,
                                  int col, int line_start, int line_end,
                                  bint na_filter, kh_str_starts_t *na_hashset,
                                  bint use_na_flist,
                                  const kh_float64_t *na_flist,
                                  float64_t NA, float64_t *data,
                                  int *na_count) nogil:
    cdef:
        int error = 0
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL
        char *p_end
        khiter_t k64

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[0] = NA
            else:
                data[0] = double_converter(word, &p_end, parser.decimal,
                                           parser.sci, parser.thousands,
                                           1, &error, NULL)
                if error != 0 or p_end == word or p_end[0]:
                    error = 0
                    if (strcasecmp(word, cinf) == 0 or
                            strcasecmp(word, cposinf) == 0 or
                            strcasecmp(word, cinfty) == 0 or
                            strcasecmp(word, cposinfty) == 0):
                        data[0] = INF
                    elif (strcasecmp(word, cneginf) == 0 or
                            strcasecmp(word, cneginfty) == 0):
                        data[0] = NEGINF
                    else:
                        return 1
                if use_na_flist:
                    k64 = kh_get_float64(na_flist, data[0])
                    if k64 != na_flist.n_buckets:
                        na_count[0] += 1
                        data[0] = NA
            data += 1
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[0] = double_converter(word, &p_end, parser.decimal,
                                       parser.sci, parser.thousands,
                                       1, &error, NULL)
            if error != 0 or p_end == word or p_end[0]:
                error = 0
                if (strcasecmp(word, cinf) == 0 or
                        strcasecmp(word, cposinf) == 0 or
                        strcasecmp(word, cinfty) == 0 or
                        strcasecmp(word, cposinfty) == 0):
                    data[0] = INF
                elif (strcasecmp(word, cneginf) == 0 or
                        strcasecmp(word, cneginfty) == 0):
                    data[0] = NEGINF
                else:
                    return 1
            data += 1

    return 0


cdef _try_uint64(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset):
    cdef:
        int error
        Py_ssize_t lines
        coliter_t it
        uint64_t *data
        ndarray result
        uint_state state

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.uint64)
    data = <uint64_t *>result.data

    uint_state_init(&state)
    coliter_setup(&it, parser, col, line_start)
    with nogil:
        error = _try_uint64_nogil(parser, col, line_start, line_end,
                                  na_filter, na_hashset, data, &state)
    if error != 0:
        if error == ERROR_OVERFLOW:
            # Can't get the word variable
            raise OverflowError('Overflow')
        return None

    if uint64_conflict(&state):
        raise ValueError('Cannot convert to numerical dtype')

    if state.seen_sint:
        raise OverflowError('Overflow')

    return result
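
# uint64 fallback note (illustrative): a column holding 2**63 overflows
# str_to_int64, so _try_int64 raises OverflowError and _convert_with_dtype
# retries with _try_uint64; uint_state then flags conflicts, e.g. a column
# containing both -1 and 2**63 ends in the ValueError above.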


cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,
                                  int64_t line_start,
                                  int64_t line_end, bint na_filter,
                                  const kh_str_starts_t *na_hashset,
                                  uint64_t *data, uint_state *state) nogil:
    cdef:
        int error
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL

    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                state.seen_null = 1
                data[i] = 0
                continue

            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                    &error, parser.thousands)
            if error != 0:
                return error
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                    &error, parser.thousands)
            if error != 0:
                return error

    return 0


cdef _try_int64(parser_t *parser, int64_t col,
                int64_t line_start, int64_t line_end,
                bint na_filter, kh_str_starts_t *na_hashset):
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        coliter_t it
        int64_t *data
        ndarray result
        int64_t NA = na_values[np.int64]

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.int64)
    data = <int64_t *>result.data
    coliter_setup(&it, parser, col, line_start)
    with nogil:
        error = _try_int64_nogil(parser, col, line_start, line_end,
                                 na_filter, na_hashset, NA, data, &na_count)
    if error != 0:
        if error == ERROR_OVERFLOW:
            # Can't get the word variable
            raise OverflowError('Overflow')
        return None, None

    return result, na_count


cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
                                 int64_t line_start,
                                 int64_t line_end, bint na_filter,
                                 const kh_str_starts_t *na_hashset, int64_t NA,
                                 int64_t *data, int *na_count) nogil:
    cdef:
        int error
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[i] = NA
                continue

            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                   &error, parser.thousands)
            if error != 0:
                return error
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                   &error, parser.thousands)
            if error != 0:
                return error

    return 0


cdef _try_bool_flex(parser_t *parser, int64_t col,
                    int64_t line_start, int64_t line_end,
                    bint na_filter, const kh_str_starts_t *na_hashset,
                    const kh_str_starts_t *true_hashset,
                    const kh_str_starts_t *false_hashset):
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        uint8_t *data
        ndarray result
        uint8_t NA = na_values[np.bool_]

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.uint8)
    data = <uint8_t *>result.data
    with nogil:
        error = _try_bool_flex_nogil(parser, col, line_start, line_end,
                                     na_filter, na_hashset, true_hashset,
                                     false_hashset, NA, data, &na_count)
    if error != 0:
        return None, None
    return result.view(np.bool_), na_count


cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,
                                     int64_t line_start,
                                     int64_t line_end, bint na_filter,
                                     const kh_str_starts_t *na_hashset,
                                     const kh_str_starts_t *true_hashset,
                                     const kh_str_starts_t *false_hashset,
                                     uint8_t NA, uint8_t *data,
                                     int *na_count) nogil:
    cdef:
        int error = 0
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[0] = NA
                data += 1
                continue

            if kh_get_str_starts_item(true_hashset, word):
                data[0] = 1
                data += 1
                continue
            if kh_get_str_starts_item(false_hashset, word):
                data[0] = 0
                data += 1
                continue

            error = to_boolean(word, data)
            if error != 0:
                return error
            data += 1
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(true_hashset, word):
                data[0] = 1
                data += 1
                continue

            if kh_get_str_starts_item(false_hashset, word):
                data[0] = 0
                data += 1
                continue

            error = to_boolean(word, data)
            if error != 0:
                return error
            data += 1

    return 0
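
# Boolean vocabulary note (illustrative): user-supplied
# true_values/false_values are utf-8 encoded by _maybe_encode and merged with
# the built-in _true_values/_false_values ([b'True', b'TRUE', b'true'] etc.)
# in __cinit__, so e.g. true_values=["yes"] recognizes both "yes" and "True".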


cdef kh_str_starts_t* kset_from_list(list values) except NULL:
    # caller takes responsibility for freeing the hash table
    cdef:
        Py_ssize_t i
        kh_str_starts_t *table
        int ret = 0
        object val

    table = kh_init_str_starts()

    for i in range(len(values)):
        val = values[i]

        # None creeps in sometimes, which isn't possible here
        if not isinstance(val, bytes):
            kh_destroy_str_starts(table)
            raise ValueError('Must be all encoded bytes')

        kh_put_str_starts_item(table, PyBytes_AsString(val), &ret)

    if table.table.n_buckets <= 128:
        # Resize the hash table to make it almost empty, this
        # reduces amount of hash collisions on lookup thus
        # "key not in table" case is faster.
        # Note that this trades table memory footprint for lookup speed.
        kh_resize_str_starts(table, table.table.n_buckets * 8)

    return table


cdef kh_float64_t* kset_float64_from_list(values) except NULL:
    # caller takes responsibility for freeing the hash table
    cdef:
        khiter_t k
        kh_float64_t *table
        int ret = 0
        float64_t val
        object value

    table = kh_init_float64()

    for value in values:
        val = float(value)

        k = kh_put_float64(table, val, &ret)

    if table.n_buckets <= 128:
        # See reasoning in kset_from_list
        kh_resize_float64(table, table.n_buckets * 8)
    return table


cdef raise_parser_error(object base, parser_t *parser):
    cdef:
        object old_exc
        object exc_type
        PyObject *type
        PyObject *value
        PyObject *traceback

    if PyErr_Occurred():
        PyErr_Fetch(&type, &value, &traceback)
        Py_XDECREF(traceback)

        if value != NULL:
            old_exc = <object>value
            Py_XDECREF(value)

            # PyErr_Fetch only returned the error message in *value,
            # so the Exception class must be extracted from *type.
            if isinstance(old_exc, str):
                if type != NULL:
                    exc_type = <object>type
                else:
                    exc_type = ParserError

                Py_XDECREF(type)
                raise exc_type(old_exc)
            else:
                Py_XDECREF(type)
                raise old_exc

    message = f'{base}. C error: '
    if parser.error_msg != NULL:
        message += parser.error_msg.decode('utf-8')
    else:
        message += 'no error message set'

    raise ParserError(message)


def _concatenate_chunks(list chunks):
    cdef:
        list names = list(chunks[0].keys())
        object name
        list warning_columns = []
        object warning_names
        object common_type

    result = {}
    for name in names:
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
        if len(numpy_dtypes) > 1:
            common_type = np.find_common_type(numpy_dtypes, [])
            if common_type == object:
                warning_columns.append(str(name))

        dtype = dtypes.pop()
        if is_categorical_dtype(dtype):
            sort_categories = isinstance(dtype, str)
            result[name] = union_categoricals(arrs,
                                              sort_categories=sort_categories)
        else:
            if is_extension_array_dtype(dtype):
                array_type = dtype.construct_array_type()
                result[name] = array_type._concat_same_type(arrs)
            else:
                result[name] = np.concatenate(arrs)

    if warning_columns:
        warning_names = ','.join(warning_columns)
        warning_message = " ".join([
            f"Columns ({warning_names}) have mixed types. "
            f"Specify dtype option on import or set low_memory=False."
        ])
        warnings.warn(warning_message, DtypeWarning, stacklevel=8)
    return result
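
# Mixed-dtype example for _concatenate_chunks (illustrative): if one
# low-memory chunk of a column infers int64 and a later chunk infers object,
# np.find_common_type resolves to object and the column is reported in the
# DtypeWarning above.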


# ----------------------------------------------------------------------
# NA values
def _compute_na_values():
    int64info = np.iinfo(np.int64)
    int32info = np.iinfo(np.int32)
    int16info = np.iinfo(np.int16)
    int8info = np.iinfo(np.int8)
    uint64info = np.iinfo(np.uint64)
    uint32info = np.iinfo(np.uint32)
    uint16info = np.iinfo(np.uint16)
    uint8info = np.iinfo(np.uint8)
    na_values = {
        np.float64: np.nan,
        np.int64: int64info.min,
        np.int32: int32info.min,
        np.int16: int16info.min,
        np.int8: int8info.min,
        np.uint64: uint64info.max,
        np.uint32: uint32info.max,
        np.uint16: uint16info.max,
        np.uint8: uint8info.max,
        np.bool_: uint8info.max,
        np.object_: np.nan   # oof
    }
    return na_values


na_values = _compute_na_values()

for k in list(na_values):
    na_values[np.dtype(k)] = na_values[k]


cdef _apply_converter(object f, parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end,
                      char* c_encoding):
    cdef:
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL
        char *errors = "strict"
        ndarray[object] result
        object val

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.object_)

    coliter_setup(&it, parser, col, line_start)

    if c_encoding == NULL or c_encoding == b'utf-8':
        for i in range(lines):
            COLITER_NEXT(it, word)
            val = PyUnicode_FromString(word)
            result[i] = f(val)
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            val = PyUnicode_Decode(word, strlen(word),
                                   c_encoding, errors)
            result[i] = f(val)

    return lib.maybe_convert_objects(result)


def _maybe_encode(values):
    if values is None:
        return []
    return [x.encode('utf-8') if isinstance(x, str) else x for x in values]


def sanitize_objects(ndarray[object] values, set na_values,
                     bint convert_empty=True):
    """
    Convert specified values, including the given set na_values and empty
    strings if convert_empty is True, to np.nan.

    Parameters
    ----------
    values : ndarray[object]
    na_values : set
    convert_empty : bool, default True

    Returns
    -------
    na_count : int
        Number of values replaced by np.nan.
    """
    cdef:
        Py_ssize_t i, n
        object val, onan
        Py_ssize_t na_count = 0
        dict memo = {}

    n = len(values)
    onan = np.nan

    for i in range(n):
        val = values[i]
        if (convert_empty and val == '') or (val in na_values):
            values[i] = onan
            na_count += 1
        elif val in memo:
            values[i] = memo[val]
        else:
            memo[val] = val

    return na_count
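
# Behavior sketch for sanitize_objects (illustrative):
#
#   values = np.array(["a", "", "NA"], dtype=object)
#   sanitize_objects(values, {"NA"})   # returns 2
#   # values is now ["a", nan, nan]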