# Copyright (c) 2012, Lambda Foundry, Inc.
# See LICENSE for the license
from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC
from errno import ENOENT
import sys
import time
import warnings

from libc.stdlib cimport free
from libc.string cimport strcasecmp, strlen, strncpy

import cython
from cython import Py_ssize_t

from cpython.bytes cimport PyBytes_AsString
from cpython.exc cimport PyErr_Fetch, PyErr_Occurred
from cpython.object cimport PyObject
from cpython.ref cimport Py_XDECREF
from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_Decode


cdef extern from "Python.h":
    object PyUnicode_FromString(char *v)


import numpy as np

cimport numpy as cnp
from numpy cimport float64_t, int64_t, ndarray, uint8_t, uint64_t

cnp.import_array()

from pandas._libs cimport util
from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX

import pandas._libs.lib as lib

from pandas._libs.khash cimport (
    kh_destroy_float64,
    kh_destroy_str,
    kh_destroy_str_starts,
    kh_destroy_strbox,
    kh_exist_str,
    kh_float64_t,
    kh_get_float64,
    kh_get_str,
    kh_get_str_starts_item,
    kh_get_strbox,
    kh_init_float64,
    kh_init_str,
    kh_init_str_starts,
    kh_init_strbox,
    kh_put_float64,
    kh_put_str,
    kh_put_str_starts_item,
    kh_put_strbox,
    kh_resize_float64,
    kh_resize_str_starts,
    kh_str_starts_t,
    kh_str_t,
    kh_strbox_t,
    khiter_t,
)

from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_categorical_dtype,
    is_datetime64_dtype,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer_dtype,
    is_object_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.concat import union_categoricals

cdef:
    float64_t INF = <float64_t>np.inf
    float64_t NEGINF = -INF
    int64_t DEFAULT_CHUNKSIZE = 256 * 1024
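    # i.e. 256 KiB of raw input bytes are prepared per tokenizer chunk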


cdef extern from "headers/portable.h":
    # I *think* this is here so that strcasecmp is defined on Windows
    # so we don't get
    # `parsers.obj : error LNK2001: unresolved external symbol strcasecmp`
    # in Appveyor.
    # In a sane world, the `from libc.string cimport` above would fail
    # loudly.
    pass


cdef extern from "parser/tokenizer.h":

    ctypedef enum ParserState:
        START_RECORD
        START_FIELD
        ESCAPED_CHAR
        IN_FIELD
        IN_QUOTED_FIELD
        ESCAPE_IN_QUOTED_FIELD
        QUOTE_IN_QUOTED_FIELD
        EAT_CRNL
        EAT_CRNL_NOP
        EAT_WHITESPACE
        EAT_COMMENT
        EAT_LINE_COMMENT
        WHITESPACE_LINE
        SKIP_LINE
        FINISHED

    enum: ERROR_OVERFLOW

    ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                                  int *status)
    ctypedef int (*io_cleanup)(void *src)

    ctypedef struct parser_t:
        void *source
        io_callback cb_io
        io_cleanup cb_cleanup

        int64_t chunksize  # Number of bytes to prepare for each chunk
        char *data         # pointer to data to be processed
        int64_t datalen    # amount of data available
        int64_t datapos

        # where to write out tokenized data
        char *stream
        uint64_t stream_len
        uint64_t stream_cap

        # Store words in (potentially ragged) matrix for now, hmm
        char **words
        int64_t *word_starts  # where we are in the stream
        uint64_t words_len
        uint64_t words_cap
        uint64_t max_words_cap   # maximum word cap encountered

        char *pword_start        # pointer to stream start of current field
        int64_t word_start       # position start of current field

        int64_t *line_start      # position in words for start of line
        int64_t *line_fields     # Number of fields in each line
        uint64_t lines           # Number of lines observed
        uint64_t file_lines      # Number of lines observed (with bad/skipped)
        uint64_t lines_cap       # Vector capacity

        # Tokenizing stuff
        ParserState state
        int doublequote            # is " represented by ""?
        char delimiter             # field separator
        int delim_whitespace       # consume tabs / spaces instead
        char quotechar             # quote character
        char escapechar            # escape character
        char lineterminator
        int skipinitialspace       # ignore spaces following delimiter?
        int quoting                # style of quoting to write

        char commentchar
        int allow_embedded_newline
        int strict                 # raise exception on bad CSV

        int usecols

        int expected_fields
        int error_bad_lines
        int warn_bad_lines

        # floating point options
        char decimal
        char sci

        # thousands separator (comma, period)
        char thousands

        int header                  # Boolean: 1: has header, 0: no header
        int64_t header_start        # header row start
        uint64_t header_end         # header row end

        void *skipset
        PyObject *skipfunc
        int64_t skip_first_N_rows
        int64_t skipfooter
        # pick one, depending on whether the converter requires GIL
        float64_t (*double_converter)(const char *, char **,
                                      char, char, char,
                                      int, int *, int *) nogil

        #  error handling
        char *warn_msg
        char *error_msg

        int64_t skip_empty_lines

    ctypedef struct coliter_t:
        char **words
        int64_t *line_start
        int64_t col

    ctypedef struct uint_state:
        int seen_sint
        int seen_uint
        int seen_null

    void uint_state_init(uint_state *self)
    int uint64_conflict(uint_state *self)

    void coliter_setup(coliter_t *it, parser_t *parser,
                       int64_t i, int64_t start) nogil
    void COLITER_NEXT(coliter_t, const char *) nogil

    parser_t* parser_new()

    int parser_init(parser_t *self) nogil
    void parser_free(parser_t *self) nogil
    void parser_del(parser_t *self) nogil
    int parser_add_skiprow(parser_t *self, int64_t row)

    int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)

    void parser_set_default_options(parser_t *self)

    int parser_consume_rows(parser_t *self, size_t nrows)

    int parser_trim_buffers(parser_t *self)

    int tokenize_all_rows(parser_t *self) nogil
    int tokenize_nrows(parser_t *self, size_t nrows) nogil

    int64_t str_to_int64(char *p_item, int64_t int_min,
                         int64_t int_max, int *error, char tsep) nogil
    uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
                           uint64_t uint_max, int *error, char tsep) nogil

    float64_t xstrtod(const char *p, char **q, char decimal,
                      char sci, char tsep, int skip_trailing,
                      int *error, int *maybe_int) nogil
    float64_t precise_xstrtod(const char *p, char **q, char decimal,
                              char sci, char tsep, int skip_trailing,
                              int *error, int *maybe_int) nogil
    float64_t round_trip(const char *p, char **q, char decimal,
                         char sci, char tsep, int skip_trailing,
                         int *error, int *maybe_int) nogil

    int to_boolean(const char *item, uint8_t *val) nogil


cdef extern from "parser/io.h":
    void *new_mmap(char *fname)
    int del_mmap(void *src)
    void* buffer_mmap_bytes(void *source, size_t nbytes,
                            size_t *bytes_read, int *status)

    void *new_file_source(char *fname, size_t buffer_size) except NULL

    void *new_rd_source(object obj) except NULL

    int del_file_source(void *src)
    int del_rd_source(void *src)

    void* buffer_file_bytes(void *source, size_t nbytes,
                            size_t *bytes_read, int *status)

    void* buffer_rd_bytes(void *source, size_t nbytes,
                          size_t *bytes_read, int *status)


cdef class TextReader:
    """

    # source: StringIO or file object

    .. versionchanged:: 1.2.0
        Removed the 'compression', 'memory_map', and 'encoding' arguments.
        These are now handled by CParserWrapper; 'source' must be a file
        handle.
    """
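    # A minimal usage sketch (an illustration only: TextReader is a private
    # class, the import path below assumes a compiled pandas build, and user
    # code should go through pd.read_csv instead):
    #
    #   import io
    #   from pandas._libs.parsers import TextReader
    #
    #   reader = TextReader(io.BytesIO(b"a,b\n1,2\n3,4\n"), header=0)
    #   columns = reader.read()  # dict mapping column index -> ndarray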

    cdef:
        parser_t *parser
        object na_fvalues
        object true_values, false_values
        object handle
        bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
        uint64_t parser_start
        list clocks
        char *c_encoding
        kh_str_starts_t *false_set
        kh_str_starts_t *true_set

    cdef public:
        int64_t leading_cols, table_width, skipfooter, buffer_lines
        bint allow_leading_cols, mangle_dupe_cols, low_memory
        bint delim_whitespace
        object delimiter, converters
        object na_values
        object header, orig_header, names, header_start, header_end
        object index_col
        object skiprows
        object dtype
        object usecols
        list dtype_cast_order
        set unnamed_cols
        set noconvert

    def __cinit__(self, source,
                  delimiter=b',',
                  header=0,
                  header_start=0,
                  header_end=0,
                  index_col=None,
                  names=None,
                  tokenize_chunksize=DEFAULT_CHUNKSIZE,
                  bint delim_whitespace=False,
                  converters=None,
                  bint skipinitialspace=False,
                  escapechar=None,
                  bint doublequote=True,
                  quotechar=b'"',
                  quoting=0,
                  lineterminator=None,
                  comment=None,
                  decimal=b'.',
                  thousands=None,
                  dtype=None,
                  usecols=None,
                  bint error_bad_lines=True,
                  bint warn_bad_lines=True,
                  bint na_filter=True,
                  na_values=None,
                  na_fvalues=None,
                  bint keep_default_na=True,
                  true_values=None,
                  false_values=None,
                  bint allow_leading_cols=True,
                  bint low_memory=False,
                  skiprows=None,
                  skipfooter=0,
                  bint verbose=False,
                  bint mangle_dupe_cols=True,
                  float_precision=None,
                  bint skip_blank_lines=True):

        # set encoding for native Python and C library
        self.c_encoding = NULL

        self.parser = parser_new()
        self.parser.chunksize = tokenize_chunksize

        self.mangle_dupe_cols = mangle_dupe_cols

        # For timekeeping
        self.clocks = []

        self.parser.usecols = (usecols is not None)

        self._setup_parser_source(source)
        parser_set_default_options(self.parser)

        parser_init(self.parser)

        if delim_whitespace:
            self.parser.delim_whitespace = delim_whitespace
        else:
            if len(delimiter) > 1:
                raise ValueError('only length-1 separators supported right now')
            self.parser.delimiter = ord(delimiter)

        # ----------------------------------------
        # parser options

        self.parser.doublequote = doublequote
        self.parser.skipinitialspace = skipinitialspace
        self.parser.skip_empty_lines = skip_blank_lines

        if lineterminator is not None:
            if len(lineterminator) != 1:
                raise ValueError('Only length-1 line terminators supported')
            self.parser.lineterminator = ord(lineterminator)

        if len(decimal) != 1:
            raise ValueError('Only length-1 decimal markers supported')
        self.parser.decimal = ord(decimal)

        if thousands is not None:
            if len(thousands) != 1:
                raise ValueError('Only length-1 thousands markers supported')
            self.parser.thousands = ord(thousands)

        if escapechar is not None:
            if len(escapechar) != 1:
                raise ValueError('Only length-1 escapes supported')
            self.parser.escapechar = ord(escapechar)

        self._set_quoting(quotechar, quoting)

        dtype_order = ['int64', 'float64', 'bool', 'object']
        if quoting == QUOTE_NONNUMERIC:
            # consistent with csv module semantics, cast all to float
            dtype_order = dtype_order[1:]
        self.dtype_cast_order = [np.dtype(x) for x in dtype_order]

        if comment is not None:
            if len(comment) > 1:
                raise ValueError('Only length-1 comment characters supported')
            self.parser.commentchar = ord(comment)

        # error handling of bad lines
        self.parser.error_bad_lines = int(error_bad_lines)
        self.parser.warn_bad_lines = int(warn_bad_lines)

        self.skiprows = skiprows
        if skiprows is not None:
            self._make_skiprow_set()

        self.skipfooter = skipfooter

        # suboptimal
        if usecols is not None:
            self.has_usecols = 1
            # GH-20558, validate usecols at higher level and only pass clean
            # usecols into TextReader.
            self.usecols = usecols

        # XXX
        if skipfooter > 0:
            self.parser.error_bad_lines = 0
            self.parser.warn_bad_lines = 0

        self.delimiter = delimiter
        self.delim_whitespace = delim_whitespace

        self.na_values = na_values
        if na_fvalues is None:
            na_fvalues = set()
        self.na_fvalues = na_fvalues

        self.true_values = _maybe_encode(true_values) + _true_values
        self.false_values = _maybe_encode(false_values) + _false_values

        self.true_set = kset_from_list(self.true_values)
        self.false_set = kset_from_list(self.false_values)

        self.keep_default_na = keep_default_na
        self.converters = converters
        self.na_filter = na_filter

        self.verbose = verbose
        self.low_memory = low_memory

        if float_precision == "round_trip":
            # see gh-15140
            self.parser.double_converter = round_trip
        elif float_precision == "legacy":
            self.parser.double_converter = xstrtod
        elif float_precision == "high" or float_precision is None:
            self.parser.double_converter = precise_xstrtod
        else:
            raise ValueError(f'Unrecognized float_precision option: '
                             f'{float_precision}')

        if isinstance(dtype, dict):
            dtype = {k: pandas_dtype(dtype[k])
                     for k in dtype}
        elif dtype is not None:
            dtype = pandas_dtype(dtype)

        self.dtype = dtype

        # XXX
        self.noconvert = set()

        self.index_col = index_col

        # ----------------------------------------
        # header stuff

        self.allow_leading_cols = allow_leading_cols
        self.leading_cols = 0

        # TODO: no header vs. header is not the first row
        self.has_mi_columns = 0
        self.orig_header = header
        if header is None:
            # sentinel value
            self.parser.header_start = -1
            self.parser.header_end = -1
            self.parser.header = -1
            self.parser_start = 0
            self.header = []
        else:
            if isinstance(header, list):
                if len(header) > 1:
                    # need to artificially skip the final line
                    # which is still a header line
                    header = list(header)
                    header.append(header[-1] + 1)
                    self.parser.header_end = header[-1]
                    self.has_mi_columns = 1
                else:
                    self.parser.header_end = header[0]

                self.parser_start = header[-1] + 1
                self.parser.header_start = header[0]
                self.parser.header = header[0]
                self.header = header
            else:
                self.parser.header_start = header
                self.parser.header_end = header
                self.parser_start = header + 1
                self.parser.header = header
                self.header = [header]

        self.names = names
        self.header, self.table_width, self.unnamed_cols = self._get_header()

        if not self.table_width:
            raise EmptyDataError("No columns to parse from file")

        # Compute buffer_lines as function of table width.
        heuristic = 2**20 // self.table_width
        self.buffer_lines = 1
        while self.buffer_lines * 2 < heuristic:
            self.buffer_lines *= 2
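        # e.g. with table_width=10: heuristic = 1048576 // 10 = 104857, and
        # the doubling loop stops with buffer_lines == 65536.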

    def __init__(self, *args, **kwargs):
        pass

    def __dealloc__(self):
        parser_free(self.parser)
        if self.true_set:
            kh_destroy_str_starts(self.true_set)
            self.true_set = NULL
        if self.false_set:
            kh_destroy_str_starts(self.false_set)
            self.false_set = NULL
        parser_del(self.parser)

    def close(self):
        # also preemptively free all allocated memory
        parser_free(self.parser)
        if self.true_set:
            kh_destroy_str_starts(self.true_set)
            self.true_set = NULL
        if self.false_set:
            kh_destroy_str_starts(self.false_set)
            self.false_set = NULL

    def set_error_bad_lines(self, int status):
        self.parser.error_bad_lines = status

    def _set_quoting(self, quote_char, quoting):
        if not isinstance(quoting, int):
            raise TypeError('"quoting" must be an integer')

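        # csv module constants: QUOTE_MINIMAL == 0, QUOTE_ALL == 1,
        # QUOTE_NONNUMERIC == 2, QUOTE_NONE == 3; the range check below
        # therefore accepts exactly these four values.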
        if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
            raise TypeError('bad "quoting" value')

        if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
            dtype = type(quote_char).__name__
            raise TypeError(f'"quotechar" must be string, not {dtype}')

        if quote_char is None or quote_char == '':
            if quoting != QUOTE_NONE:
                raise TypeError("quotechar must be set if quoting enabled")
            self.parser.quoting = quoting
            self.parser.quotechar = -1
        elif len(quote_char) > 1:  # 0-len case handled earlier
            raise TypeError('"quotechar" must be a 1-character string')
        else:
            self.parser.quoting = quoting
            self.parser.quotechar = ord(quote_char)

    cdef _make_skiprow_set(self):
        if util.is_integer_object(self.skiprows):
            parser_set_skipfirstnrows(self.parser, self.skiprows)
        elif not callable(self.skiprows):
            for i in self.skiprows:
                parser_add_skiprow(self.parser, i)
        else:
            self.parser.skipfunc = <PyObject *>self.skiprows

    cdef _setup_parser_source(self, source):
        cdef:
            void *ptr

        if not hasattr(source, "read"):
            raise IOError(f'Expected file path name or file-like object, '
                          f'got {type(source)} type')

        ptr = new_rd_source(source)
        self.parser.source = ptr
        self.parser.cb_io = &buffer_rd_bytes
        self.parser.cb_cleanup = &del_rd_source

    cdef _get_header(self):
        # header is now a list of lists, so field_count should use header[0]

        cdef:
            Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
            char *word
            object name, old_name
            uint64_t hr, data_line = 0
            char *errors = "strict"
            StringPath path = _string_path(self.c_encoding)
            list header = []
            set unnamed_cols = set()

        if self.parser.header_start >= 0:

            # Header is in the file
            for level, hr in enumerate(self.header):

                this_header = []

                if self.parser.lines < hr + 1:
                    self._tokenize_rows(hr + 2)

                if self.parser.lines == 0:
                    field_count = 0
                    start = self.parser.line_start[0]

                # e.g., if header=3 and file only has 2 lines
                elif (self.parser.lines < hr + 1
                      and not isinstance(self.orig_header, list)) or (
                          self.parser.lines < hr):
                    msg = self.orig_header
                    if isinstance(msg, list):
                        joined = ','.join(str(m) for m in msg)
                        msg = f"[{joined}], len of {len(msg)},"
                    raise ParserError(
                        f'Passed header={msg} but only '
                        f'{self.parser.lines} lines in file')

                else:
                    field_count = self.parser.line_fields[hr]
                    start = self.parser.line_start[hr]

                counts = {}
                unnamed_count = 0

                for i in range(field_count):
                    word = self.parser.words[start + i]

                    if path == UTF8:
                        name = PyUnicode_FromString(word)
                    elif path == ENCODED:
                        name = PyUnicode_Decode(word, strlen(word),
                                                self.c_encoding, errors)

                    # We use this later when collecting placeholder names.
                    old_name = name

                    if name == '':
                        if self.has_mi_columns:
                            name = f'Unnamed: {i}_level_{level}'
                        else:
                            name = f'Unnamed: {i}'
                        unnamed_count += 1

                    count = counts.get(name, 0)

                    if not self.has_mi_columns and self.mangle_dupe_cols:
                        while count > 0:
                            counts[name] = count + 1
                            name = f'{name}.{count}'
                            count = counts.get(name, 0)

                    if old_name == '':
                        unnamed_cols.add(name)

                    this_header.append(name)
                    counts[name] = count + 1

                if self.has_mi_columns:

                    # If we have grabbed an extra line, but it's not in our
                    # format, save it in the buffer and create a blank extra
                    # line for the rest of the parsing code.
                    if hr == self.header[-1]:
                        lc = len(this_header)
                        ic = (len(self.index_col) if self.index_col
                              is not None else 0)

                        if lc != unnamed_count and lc - ic > unnamed_count:
                            hr -= 1
                            self.parser_start -= 1
                            this_header = [None] * lc

                data_line = hr + 1
                header.append(this_header)

            if self.names is not None:
                header = [self.names]

        elif self.names is not None:
            # Enforce this unless usecols
            if not self.has_usecols:
                self.parser.expected_fields = len(self.names)

            # Names passed
            if self.parser.lines < 1:
                self._tokenize_rows(1)

            header = [self.names]

            if self.parser.lines < 1:
                field_count = len(header[0])
            else:
                field_count = self.parser.line_fields[data_line]
        else:
            # No header passed nor to be found in the file
            if self.parser.lines < 1:
                self._tokenize_rows(1)

            return None, self.parser.line_fields[0], unnamed_cols

        # Corner case, not enough lines in the file
        if self.parser.lines < data_line + 1:
            field_count = len(header[0])
        else:  # not self.has_usecols:

            field_count = self.parser.line_fields[data_line]

            # #2981
            if self.names is not None:
                field_count = max(field_count, len(self.names))

            passed_count = len(header[0])

            if (self.has_usecols and self.allow_leading_cols and
                    not callable(self.usecols)):
                nuse = len(self.usecols)
                if nuse == passed_count:
                    self.leading_cols = 0
                elif self.names is None and nuse < passed_count:
                    self.leading_cols = field_count - passed_count
                elif passed_count != field_count:
                    raise ValueError('Passed header names '
                                     'do not match usecols')
            # oh boy, #2442, #2981
            elif self.allow_leading_cols and passed_count < field_count:
                self.leading_cols = field_count - passed_count

        return header, field_count, unnamed_cols

    def read(self, rows=None):
        """
        rows=None --> read all rows
        """
        if self.low_memory:
            # Conserve intermediate space
            columns = self._read_low_memory(rows)
        else:
            # Don't care about memory usage
            columns = self._read_rows(rows, 1)

        return columns

    cdef _read_low_memory(self, rows):
        cdef:
            size_t rows_read = 0
            list chunks = []

        if rows is None:
            while True:
                try:
                    chunk = self._read_rows(self.buffer_lines, 0)
                    if len(chunk) == 0:
                        break
                except StopIteration:
                    break
                else:
                    chunks.append(chunk)
        else:
            while rows_read < rows:
                try:
                    crows = min(self.buffer_lines, rows - rows_read)

                    chunk = self._read_rows(crows, 0)
                    if len(chunk) == 0:
                        break

                    rows_read += len(list(chunk.values())[0])
                except StopIteration:
                    break
                else:
                    chunks.append(chunk)

        parser_trim_buffers(self.parser)

        if len(chunks) == 0:
            raise StopIteration

        # destructive to chunks
        return _concatenate_chunks(chunks)

    cdef _tokenize_rows(self, size_t nrows):
        cdef:
            int status

        with nogil:
            status = tokenize_nrows(self.parser, nrows)

        if self.parser.warn_msg != NULL:
            print(self.parser.warn_msg, file=sys.stderr)
            free(self.parser.warn_msg)
            self.parser.warn_msg = NULL

        if status < 0:
            raise_parser_error('Error tokenizing data', self.parser)

    cdef _read_rows(self, rows, bint trim):
        cdef:
            int64_t buffered_lines
            int64_t irows, footer = 0

        self._start_clock()

        if rows is not None:
            irows = rows
            buffered_lines = self.parser.lines - self.parser_start
            if buffered_lines < irows:
                self._tokenize_rows(irows - buffered_lines)

            if self.skipfooter > 0:
                raise ValueError('skipfooter can only be used to read '
                                 'the whole file')
        else:
            with nogil:
                status = tokenize_all_rows(self.parser)

            if self.parser.warn_msg != NULL:
                print(self.parser.warn_msg, file=sys.stderr)
                free(self.parser.warn_msg)
                self.parser.warn_msg = NULL

            if status < 0:
                raise_parser_error('Error tokenizing data', self.parser)
            footer = self.skipfooter

        if self.parser_start >= self.parser.lines:
            raise StopIteration
        self._end_clock('Tokenization')

        self._start_clock()
        columns = self._convert_column_data(rows=rows,
                                            footer=footer,
                                            upcast_na=True)
        self._end_clock('Type conversion')
        self._start_clock()
        if len(columns) > 0:
            rows_read = len(list(columns.values())[0])
            # trim
            parser_consume_rows(self.parser, rows_read)
            if trim:
                parser_trim_buffers(self.parser)
            self.parser_start -= rows_read

        self._end_clock('Parser memory cleanup')

        return columns

    cdef _start_clock(self):
        self.clocks.append(time.time())

    cdef _end_clock(self, what):
        if self.verbose:
            elapsed = time.time() - self.clocks.pop(-1)
            print(f'{what} took: {elapsed * 1000:.2f} ms')

    def set_noconvert(self, i):
        self.noconvert.add(i)

    def remove_noconvert(self, i):
        self.noconvert.remove(i)

    def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
        cdef:
            int64_t i
            int nused
            kh_str_starts_t *na_hashset = NULL
            int64_t start, end
            object name, na_flist, col_dtype = None
            bint na_filter = 0
            int64_t num_cols

        start = self.parser_start

        if rows is None:
            end = self.parser.lines
        else:
            end = min(start + rows, self.parser.lines)

        # FIXME: don't leave commented-out
        # # skip footer
        # if footer > 0:
        #     end -= footer

        num_cols = -1
        # Py_ssize_t cast prevents build warning
        for i in range(<Py_ssize_t>self.parser.lines):
            num_cols = (num_cols < self.parser.line_fields[i]) * \
                self.parser.line_fields[i] + \
                (num_cols >= self.parser.line_fields[i]) * num_cols
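        # i.e. num_cols = max(num_cols, line_fields[i]) computed without a
        # branch: exactly one of the two boolean factors equals 1 each pass.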

        if self.table_width - self.leading_cols > num_cols:
            raise ParserError(f"Too many columns specified: expected "
                              f"{self.table_width - self.leading_cols} "
                              f"and found {num_cols}")

        results = {}
        nused = 0
        for i in range(self.table_width):
            if i < self.leading_cols:
                # Pass through leading columns always
                name = i
            elif (self.usecols and not callable(self.usecols) and
                    nused == len(self.usecols)):
                # Once we've gathered all requested columns, stop. GH5766
                break
            else:
                name = self._get_column_name(i, nused)
                usecols = set()
                if callable(self.usecols):
                    if self.usecols(name):
                        usecols = {i}
                else:
                    usecols = self.usecols
                if self.has_usecols and not (i in usecols or
                                             name in usecols):
                    continue
                nused += 1

            conv = self._get_converter(i, name)

            col_dtype = None
            if self.dtype is not None:
                if isinstance(self.dtype, dict):
                    if name in self.dtype:
                        col_dtype = self.dtype[name]
                    elif i in self.dtype:
                        col_dtype = self.dtype[i]
                else:
                    if self.dtype.names:
                        # structured array
                        col_dtype = np.dtype(self.dtype.descr[i][1])
                    else:
                        col_dtype = self.dtype

            if conv:
                if col_dtype is not None:
                    warnings.warn((f"Both a converter and dtype were specified "
                                   f"for column {name} - only the converter will "
                                   f"be used"), ParserWarning,
                                  stacklevel=5)
                results[i] = _apply_converter(conv, self.parser, i, start, end,
                                              self.c_encoding)
                continue

            # Collect the list of NaN values associated with the column.
            # If we aren't supposed to do that, or none are collected,
            # we set `na_filter` to `0` (`1` otherwise).
            na_flist = set()

            if self.na_filter:
                na_list, na_flist = self._get_na_list(i, name)
                if na_list is None:
                    na_filter = 0
                else:
                    na_filter = 1
                    na_hashset = kset_from_list(na_list)
            else:
                na_filter = 0

            # Attempt to parse tokens and infer dtype of the column.
            # Should return as the desired dtype (inferred or specified).
            try:
                col_res, na_count = self._convert_tokens(
                    i, start, end, name, na_filter, na_hashset,
                    na_flist, col_dtype)
            finally:
                # gh-21353
                #
                # Cleanup the NaN hash that we generated
                # to avoid memory leaks.
                if na_filter:
                    self._free_na_set(na_hashset)

            # don't try to upcast EAs
            try_upcast = upcast_na and na_count > 0
            if try_upcast and not is_extension_array_dtype(col_dtype):
                col_res = _maybe_upcast(col_res)

            if col_res is None:
                raise ParserError(f'Unable to parse column {i}')

            results[i] = col_res

        self.parser_start += end - start

        return results

    cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end,
                                object name, bint na_filter,
                                kh_str_starts_t *na_hashset,
                                object na_flist, object col_dtype):

        if col_dtype is not None:
            col_res, na_count = self._convert_with_dtype(
                col_dtype, i, start, end, na_filter,
                1, na_hashset, na_flist)

            # Fallback on the parse (e.g. we requested int dtype,
            # but it's actually a float).
            if col_res is not None:
                return col_res, na_count

        if i in self.noconvert:
            return self._string_convert(i, start, end, na_filter, na_hashset)
        else:
            col_res = None
            for dt in self.dtype_cast_order:
                try:
                    col_res, na_count = self._convert_with_dtype(
                        dt, i, start, end, na_filter, 0, na_hashset, na_flist)
                except ValueError:
                    # This error is raised from trying to convert to uint64,
                    # and we discover that we cannot convert to any numerical
                    # dtype successfully. As a result, we leave the data
                    # column AS IS with object dtype.
                    col_res, na_count = self._convert_with_dtype(
                        np.dtype('object'), i, start, end, 0,
                        0, na_hashset, na_flist)
                except OverflowError:
                    col_res, na_count = self._convert_with_dtype(
                        np.dtype('object'), i, start, end, na_filter,
                        0, na_hashset, na_flist)

                if col_res is not None:
                    break

        # we had a fallback parse on the dtype, so now try to cast
        # only allow safe casts, e.g. with a nan you cannot safely cast to int
        if col_res is not None and col_dtype is not None:
            try:
                col_res = col_res.astype(col_dtype, casting='safe')
            except TypeError:

                # float -> int conversions can fail the above
                # even with no nans
                col_res_orig = col_res
                col_res = col_res.astype(col_dtype)
                if (col_res != col_res_orig).any():
                    raise ValueError(
                        f"cannot safely convert passed user dtype of "
                        f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in "
                        f"column {i}")

        return col_res, na_count

    cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
                             int64_t start, int64_t end,
                             bint na_filter,
                             bint user_dtype,
                             kh_str_starts_t *na_hashset,
                             object na_flist):
        if is_categorical_dtype(dtype):
            # TODO: I suspect that _categorical_convert could be
            # optimized when dtype is an instance of CategoricalDtype
            codes, cats, na_count = _categorical_convert(
                self.parser, i, start, end, na_filter,
                na_hashset, self.c_encoding)

            # Method accepts list of strings, not encoded ones.
            true_values = [x.decode() for x in self.true_values]
            array_type = dtype.construct_array_type()
            cat = array_type._from_inferred_categories(
                cats, codes, dtype, true_values=true_values)
            return cat, na_count

        elif is_extension_array_dtype(dtype):
            result, na_count = self._string_convert(i, start, end, na_filter,
                                                    na_hashset)
            array_type = dtype.construct_array_type()
            try:
                # use _from_sequence_of_strings if the class defines it
                result = array_type._from_sequence_of_strings(result,
                                                              dtype=dtype)
            except NotImplementedError:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    f"_from_sequence_of_strings in order "
                    f"to be used in parser methods")

            return result, na_count

        elif is_integer_dtype(dtype):
            try:
                result, na_count = _try_int64(self.parser, i, start,
                                              end, na_filter, na_hashset)
                if user_dtype and na_count is not None:
                    if na_count > 0:
                        raise ValueError(f"Integer column has NA values in column {i}")
            except OverflowError:
                result = _try_uint64(self.parser, i, start, end,
                                     na_filter, na_hashset)
                na_count = 0

            if result is not None and dtype != 'int64':
                result = result.astype(dtype)

            return result, na_count

        elif is_float_dtype(dtype):
            result, na_count = _try_double(self.parser, i, start, end,
                                           na_filter, na_hashset, na_flist)

            if result is not None and dtype != 'float64':
                result = result.astype(dtype)
            return result, na_count
        elif is_bool_dtype(dtype):
            result, na_count = _try_bool_flex(self.parser, i, start, end,
                                              na_filter, na_hashset,
                                              self.true_set, self.false_set)
            if user_dtype and na_count is not None:
                if na_count > 0:
                    raise ValueError(f"Bool column has NA values in column {i}")
            return result, na_count

        elif dtype.kind == 'S':
            # TODO: na handling
            width = dtype.itemsize
            if width > 0:
                result = _to_fw_string(self.parser, i, start, end, width)
                return result, 0

            # treat as regular string parsing
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        elif dtype.kind == 'U':
            width = dtype.itemsize
            if width > 0:
                raise TypeError(f"the dtype {dtype} is not supported for parsing")

            # unicode variable width
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        elif is_object_dtype(dtype):
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        elif is_datetime64_dtype(dtype):
            raise TypeError(f"the dtype {dtype} is not supported "
                            f"for parsing, pass this column "
                            f"using parse_dates instead")
        else:
            raise TypeError(f"the dtype {dtype} is not supported for parsing")

    cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
                         bint na_filter, kh_str_starts_t *na_hashset):

        cdef StringPath path = _string_path(self.c_encoding)

        if path == UTF8:
            return _string_box_utf8(self.parser, i, start, end, na_filter,
                                    na_hashset)
        elif path == ENCODED:
            return _string_box_decode(self.parser, i, start, end,
                                      na_filter, na_hashset, self.c_encoding)

    def _get_converter(self, i, name):
        if self.converters is None:
            return None

        if name is not None and name in self.converters:
            return self.converters[name]

        # Converter for position, if any
        return self.converters.get(i)

    cdef _get_na_list(self, i, name):
        if self.na_values is None:
            return None, set()

        if isinstance(self.na_values, dict):
            key = None
            values = None

            if name is not None and name in self.na_values:
                key = name
            elif i in self.na_values:
                key = i
            else:  # No na_values provided for this column.
                if self.keep_default_na:
                    return _NA_VALUES, set()

                return list(), set()

            values = self.na_values[key]
            if values is not None and not isinstance(values, list):
                values = list(values)

            fvalues = self.na_fvalues[key]
            if fvalues is not None and not isinstance(fvalues, set):
                fvalues = set(fvalues)

            return _ensure_encoded(values), fvalues
        else:
            if not isinstance(self.na_values, list):
                self.na_values = list(self.na_values)
            if not isinstance(self.na_fvalues, set):
                self.na_fvalues = set(self.na_fvalues)

            return _ensure_encoded(self.na_values), self.na_fvalues

    cdef _free_na_set(self, kh_str_starts_t *table):
        kh_destroy_str_starts(table)

    cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
        cdef int64_t j
        if self.has_usecols and self.names is not None:
            if (not callable(self.usecols) and
                    len(self.names) == len(self.usecols)):
                return self.names[nused]
            else:
                return self.names[i - self.leading_cols]
        else:
            if self.header is not None:
                j = i - self.leading_cols
                # generate extra (bogus) headers if there are more columns than headers
                if j >= len(self.header[0]):
                    return j
                else:
                    return self.header[0][j]
            else:
                return None


cdef:
    object _true_values = [b'True', b'TRUE', b'true']
    object _false_values = [b'False', b'FALSE', b'false']


def _ensure_encoded(list lst):
    cdef:
        list result = []
    for x in lst:
        if isinstance(x, str):
            x = PyUnicode_AsUTF8String(x)
        elif not isinstance(x, bytes):
            x = str(x).encode('utf-8')

        result.append(x)
    return result

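# For example, _ensure_encoded(["NA", b"null", 1.5]) returns
# [b"NA", b"null", b"1.5"].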

# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
STR_NA_VALUES = {
    "-1.#IND",
    "1.#QNAN",
    "1.#IND",
    "-1.#QNAN",
    "#N/A N/A",
    "#N/A",
    "N/A",
    "n/a",
    "NA",
    "<NA>",
    "#NA",
    "NULL",
    "null",
    "NaN",
    "-NaN",
    "nan",
    "-nan",
    "",
}
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))

def _maybe_upcast(arr):
    """
    Upcast integer arrays to float64 and boolean arrays to object so that
    NA sentinel values (looked up in the module-level ``na_values`` mapping)
    can be replaced with np.nan.
    """
    if issubclass(arr.dtype.type, np.integer):
        na_value = na_values[arr.dtype]
        arr = arr.astype(float)
        np.putmask(arr, arr == na_value, np.nan)
    elif arr.dtype == np.bool_:
        mask = arr.view(np.uint8) == na_values[np.uint8]
        arr = arr.astype(object)
        np.putmask(arr, mask, np.nan)

    return arr


cdef enum StringPath:
    UTF8
    ENCODED


# factored out logic to pick string converter
cdef inline StringPath _string_path(char *encoding):
    if encoding != NULL and encoding != b"utf-8":
        return ENCODED
    return UTF8

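# e.g. _string_path(NULL) and _string_path(b"utf-8") both select UTF8; any
# other encoding selects ENCODED, i.e. the PyUnicode_Decode path.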

# ----------------------------------------------------------------------
# Type conversions / inference support code


cdef _string_box_utf8(parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end,
                      bint na_filter, kh_str_starts_t *na_hashset):
    cdef:
        int error, na_count = 0
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result

        int ret = 0
        kh_strbox_t *table

        object pyval

        object NA = na_values[np.object_]
        khiter_t k

    table = kh_init_strbox()
    lines = line_end - line_start
    result = np.empty(lines, dtype=np.object_)
    coliter_setup(&it, parser, col, line_start)

    for i in range(lines):
        COLITER_NEXT(it, word)

        if na_filter:
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count += 1
                result[i] = NA
                continue

        k = kh_get_strbox(table, word)

        # in the hash table
        if k != table.n_buckets:
            # this increments the refcount, but need to test
            pyval = <object>table.vals[k]
        else:
            # box it. new ref?
            pyval = PyUnicode_FromString(word)

            k = kh_put_strbox(table, word, &ret)
            table.vals[k] = <PyObject *>pyval

        result[i] = pyval

    kh_destroy_strbox(table)

    return result, na_count


cdef _string_box_decode(parser_t *parser, int64_t col,
                        int64_t line_start, int64_t line_end,
                        bint na_filter, kh_str_starts_t *na_hashset,
                        char *encoding):
    cdef:
        int na_count = 0
        Py_ssize_t i, size, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result

        int ret = 0
        kh_strbox_t *table

        char *errors = "strict"

        object pyval

        object NA = na_values[np.object_]
        khiter_t k

    table = kh_init_strbox()
    lines = line_end - line_start
    result = np.empty(lines, dtype=np.object_)
    coliter_setup(&it, parser, col, line_start)

    for i in range(lines):
        COLITER_NEXT(it, word)

        if na_filter:
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count += 1
                result[i] = NA
                continue

        k = kh_get_strbox(table, word)

        # in the hash table
        if k != table.n_buckets:
            # this increments the refcount, but need to test
            pyval = <object>table.vals[k]
        else:
            # box it. new ref?
            size = strlen(word)
            pyval = PyUnicode_Decode(word, size, encoding, errors)

            k = kh_put_strbox(table, word, &ret)
            table.vals[k] = <PyObject *>pyval

        result[i] = pyval

    kh_destroy_strbox(table)

    return result, na_count


@cython.boundscheck(False)
cdef _categorical_convert(parser_t *parser, int64_t col,
                          int64_t line_start, int64_t line_end,
                          bint na_filter, kh_str_starts_t *na_hashset,
                          char *encoding):
    "Convert column data into codes, categories"
    cdef:
        int na_count = 0
        Py_ssize_t i, size, lines
        coliter_t it
        const char *word = NULL

        int64_t NA = -1
        int64_t[:] codes
        int64_t current_category = 0

        char *errors = "strict"
        StringPath path = _string_path(encoding)

        int ret = 0
        kh_str_t *table
        khiter_t k

    lines = line_end - line_start
    codes = np.empty(lines, dtype=np.int64)

    # factorize parsed values, creating a hash table
    # bytes -> category code
    with nogil:
        table = kh_init_str()
        coliter_setup(&it, parser, col, line_start)

        for i in range(lines):
            COLITER_NEXT(it, word)

            if na_filter:
                if kh_get_str_starts_item(na_hashset, word):
                    # is in NA values
                    na_count += 1
                    codes[i] = NA
                    continue

            k = kh_get_str(table, word)
            # not in the hash table
            if k == table.n_buckets:
                k = kh_put_str(table, word, &ret)
                table.vals[k] = current_category
                current_category += 1

            codes[i] = table.vals[k]

    # parse and box categories to python strings
    result = np.empty(table.n_occupied, dtype=np.object_)
    if path == ENCODED:
        for k in range(table.n_buckets):
            if kh_exist_str(table, k):
                size = strlen(table.keys[k])
                result[table.vals[k]] = PyUnicode_Decode(
                    table.keys[k], size, encoding, errors)
    elif path == UTF8:
        for k in range(table.n_buckets):
            if kh_exist_str(table, k):
                result[table.vals[k]] = PyUnicode_FromString(table.keys[k])

    kh_destroy_str(table)
    return np.asarray(codes), result, na_count

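# For example, a column with tokens ["a", "b", "a"] factorizes to codes
# [0, 1, 0] with categories ["a", "b"]; NA tokens are assigned code -1.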

cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
                   int64_t line_end, int64_t width):
    cdef:
        char *data
        ndarray result

    result = np.empty(line_end - line_start, dtype=f'|S{width}')
    data = <char*>result.data

    with nogil:
        _to_fw_string_nogil(parser, col, line_start, line_end, width, data)

    return result

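# Each token is copied into a fixed-width bytes array (dtype '|S{width}');
# strncpy truncates longer words and NUL-pads shorter ones.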
1510
1511cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
1512                                     int64_t line_start, int64_t line_end,
1513                                     size_t width, char *data) nogil:
1514    cdef:
1515        int64_t i
1516        coliter_t it
1517        const char *word = NULL
1518
1519    coliter_setup(&it, parser, col, line_start)
1520
1521    for i in range(line_end - line_start):
1522        COLITER_NEXT(it, word)
1523        strncpy(data, word, width)
1524        data += width
1525
1526
1527cdef:
1528    char* cinf = b'inf'
1529    char* cposinf = b'+inf'
1530    char* cneginf = b'-inf'
1531
1532    char* cinfty = b'Infinity'
1533    char* cposinfty = b'+Infinity'
1534    char* cneginfty = b'-Infinity'
1535
1536
cdef _try_double(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        float64_t *data
        float64_t NA = na_values[np.float64]
        kh_float64_t *na_fset
        ndarray result
        bint use_na_flist = len(na_flist) > 0

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.float64)
    data = <float64_t *>result.data
    na_fset = kset_float64_from_list(na_flist)
    with nogil:
        error = _try_double_nogil(parser, parser.double_converter,
                                  col, line_start, line_end,
                                  na_filter, na_hashset, use_na_flist,
                                  na_fset, NA, data, &na_count)

    kh_destroy_float64(na_fset)
    if error != 0:
        return None, None
    return result, na_count


cdef inline int _try_double_nogil(parser_t *parser,
                                  float64_t (*double_converter)(
                                      const char *, char **, char,
                                      char, char, int, int *, int *) nogil,
                                  int col, int line_start, int line_end,
                                  bint na_filter, kh_str_starts_t *na_hashset,
                                  bint use_na_flist,
                                  const kh_float64_t *na_flist,
                                  float64_t NA, float64_t *data,
                                  int *na_count) nogil:
    cdef:
        int error = 0
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL
        char *p_end
        khiter_t k64

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[0] = NA
            else:
                data[0] = double_converter(word, &p_end, parser.decimal,
                                           parser.sci, parser.thousands,
                                           1, &error, NULL)
                if error != 0 or p_end == word or p_end[0]:
                    error = 0
                    if (strcasecmp(word, cinf) == 0 or
                            strcasecmp(word, cposinf) == 0 or
                            strcasecmp(word, cinfty) == 0 or
                            strcasecmp(word, cposinfty) == 0):
                        data[0] = INF
                    elif (strcasecmp(word, cneginf) == 0 or
                            strcasecmp(word, cneginfty) == 0):
                        data[0] = NEGINF
                    else:
                        return 1
                if use_na_flist:
                    k64 = kh_get_float64(na_flist, data[0])
                    if k64 != na_flist.n_buckets:
                        na_count[0] += 1
                        data[0] = NA
            data += 1
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[0] = double_converter(word, &p_end, parser.decimal,
                                       parser.sci, parser.thousands,
                                       1, &error, NULL)
            if error != 0 or p_end == word or p_end[0]:
                error = 0
                if (strcasecmp(word, cinf) == 0 or
                        strcasecmp(word, cposinf) == 0 or
                        strcasecmp(word, cinfty) == 0 or
                        strcasecmp(word, cposinfty) == 0):
                    data[0] = INF
                elif (strcasecmp(word, cneginf) == 0 or
                        strcasecmp(word, cneginfty) == 0):
                    data[0] = NEGINF
                else:
                    return 1
            data += 1

    return 0
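

# A rough pure-Python rendering of the fallback above (illustrative only;
# `strict_convert` stands in for parser.double_converter and is assumed to
# raise ValueError on any token it cannot fully consume):
def _double_with_inf_fallback_sketch(word, strict_convert):
    try:
        return strict_convert(word)
    except ValueError:
        lowered = word.lower()  # strcasecmp above matches case-insensitively
        if lowered in ('inf', '+inf', 'infinity', '+infinity'):
            return float('inf')
        if lowered in ('-inf', '-infinity'):
            return float('-inf')
        raise  # the C path returns error code 1 and the caller gives up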


cdef _try_uint64(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset):
    cdef:
        int error
        Py_ssize_t lines
        coliter_t it
        uint64_t *data
        ndarray result
        uint_state state

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.uint64)
    data = <uint64_t *>result.data

    uint_state_init(&state)
    coliter_setup(&it, parser, col, line_start)
    with nogil:
        error = _try_uint64_nogil(parser, col, line_start, line_end,
                                  na_filter, na_hashset, data, &state)
    if error != 0:
        if error == ERROR_OVERFLOW:
            # The offending word is not available here, so the message
            # stays generic.
            raise OverflowError('Overflow')
        return None

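    # uint64_conflict: the column mixes values above INT64_MAX with negative
    # values or NAs, so no single integer dtype can represent all of them;
    # seen_sint alone means a negative value appeared on the uint64 path.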
    if uint64_conflict(&state):
        raise ValueError('Cannot convert to numerical dtype')

    if state.seen_sint:
        raise OverflowError('Overflow')

    return result


cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,
                                  int64_t line_start,
                                  int64_t line_end, bint na_filter,
                                  const kh_str_starts_t *na_hashset,
                                  uint64_t *data, uint_state *state) nogil:
    cdef:
        int error
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL

    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                state.seen_null = 1
                data[i] = 0
                continue

            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                    &error, parser.thousands)
            if error != 0:
                return error
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                    &error, parser.thousands)
            if error != 0:
                return error

    return 0


cdef _try_int64(parser_t *parser, int64_t col,
                int64_t line_start, int64_t line_end,
                bint na_filter, kh_str_starts_t *na_hashset):
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        coliter_t it
        int64_t *data
        ndarray result
        int64_t NA = na_values[np.int64]

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.int64)
    data = <int64_t *>result.data
    coliter_setup(&it, parser, col, line_start)
    with nogil:
        error = _try_int64_nogil(parser, col, line_start, line_end,
                                 na_filter, na_hashset, NA, data, &na_count)
    if error != 0:
        if error == ERROR_OVERFLOW:
            # The offending word is not available here, so the message
            # stays generic.
            raise OverflowError('Overflow')
        return None, None

    return result, na_count


cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
                                 int64_t line_start,
                                 int64_t line_end, bint na_filter,
                                 const kh_str_starts_t *na_hashset, int64_t NA,
                                 int64_t *data, int *na_count) nogil:
    cdef:
        int error
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[i] = NA
                continue

            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                   &error, parser.thousands)
            if error != 0:
                return error
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                   &error, parser.thousands)
            if error != 0:
                return error

    return 0


cdef _try_bool_flex(parser_t *parser, int64_t col,
                    int64_t line_start, int64_t line_end,
                    bint na_filter, const kh_str_starts_t *na_hashset,
                    const kh_str_starts_t *true_hashset,
                    const kh_str_starts_t *false_hashset):
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        uint8_t *data
        ndarray result
        uint8_t NA = na_values[np.bool_]

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.uint8)
    data = <uint8_t *>result.data
    with nogil:
        error = _try_bool_flex_nogil(parser, col, line_start, line_end,
                                     na_filter, na_hashset, true_hashset,
                                     false_hashset, NA, data, &na_count)
    if error != 0:
        return None, None
    return result.view(np.bool_), na_count


cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,
                                     int64_t line_start,
                                     int64_t line_end, bint na_filter,
                                     const kh_str_starts_t *na_hashset,
                                     const kh_str_starts_t *true_hashset,
                                     const kh_str_starts_t *false_hashset,
                                     uint8_t NA, uint8_t *data,
                                     int *na_count) nogil:
    cdef:
        int error = 0
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[0] = NA
                data += 1
                continue

            if kh_get_str_starts_item(true_hashset, word):
                data[0] = 1
                data += 1
                continue
            if kh_get_str_starts_item(false_hashset, word):
                data[0] = 0
                data += 1
                continue

            error = to_boolean(word, data)
            if error != 0:
                return error
            data += 1
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(true_hashset, word):
                data[0] = 1
                data += 1
                continue

            if kh_get_str_starts_item(false_hashset, word):
                data[0] = 0
                data += 1
                continue

            error = to_boolean(word, data)
            if error != 0:
                return error
            data += 1

    return 0


cdef kh_str_starts_t* kset_from_list(list values) except NULL:
    # caller takes responsibility for freeing the hash table
    cdef:
        Py_ssize_t i
        kh_str_starts_t *table
        int ret = 0
        object val

    table = kh_init_str_starts()

    for i in range(len(values)):
        val = values[i]

        # None sometimes creeps into user-provided values, but only
        # encoded bytes are valid here.
        if not isinstance(val, bytes):
            kh_destroy_str_starts(table)
            raise ValueError('Must be all encoded bytes')

        kh_put_str_starts_item(table, PyBytes_AsString(val), &ret)

    if table.table.n_buckets <= 128:
        # Resize the hash table so it stays mostly empty: fewer hash
        # collisions on lookup make the "key not in table" case faster.
        # Note that this trades table memory footprint for lookup speed.
        kh_resize_str_starts(table, table.table.n_buckets * 8)

    return table
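

# A toy sketch of why the oversizing above helps (illustrative only; khash
# has its own probing scheme, this just demonstrates the load-factor effect
# with linear probing):
def _miss_probe_count_sketch(keys, n_buckets, missing_key):
    assert len(keys) < n_buckets, "need at least one empty slot"
    slots = [None] * n_buckets
    for key in keys:
        j = hash(key) % n_buckets
        while slots[j] is not None:  # linear probing on collision
            j = (j + 1) % n_buckets
        slots[j] = key
    # A lookup miss probes until it hits an empty bucket; with more buckets
    # (a lower load factor) the expected probe count drops toward 1.
    j = hash(missing_key) % n_buckets
    probes = 1
    while slots[j] is not None:
        j = (j + 1) % n_buckets
        probes += 1
    return probes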


cdef kh_float64_t* kset_float64_from_list(values) except NULL:
    # caller takes responsibility for freeing the hash table
    cdef:
        khiter_t k
        kh_float64_t *table
        int ret = 0
        float64_t val
        object value

    table = kh_init_float64()

    for value in values:
        val = float(value)

        k = kh_put_float64(table, val, &ret)

    if table.n_buckets <= 128:
        # See reasoning in kset_from_list
        kh_resize_float64(table, table.n_buckets * 8)
    return table


cdef raise_parser_error(object base, parser_t *parser):
    cdef:
        object old_exc
        object exc_type
        PyObject *type
        PyObject *value
        PyObject *traceback

    if PyErr_Occurred():
        PyErr_Fetch(&type, &value, &traceback)
        Py_XDECREF(traceback)

        if value != NULL:
            old_exc = <object>value
            Py_XDECREF(value)

            # PyErr_Fetch only returned the error message in *value,
            # so the Exception class must be extracted from *type.
            if isinstance(old_exc, str):
                if type != NULL:
                    exc_type = <object>type
                else:
                    exc_type = ParserError

                Py_XDECREF(type)
                raise exc_type(old_exc)
            else:
                Py_XDECREF(type)
                raise old_exc

    message = f'{base}. C error: '
    if parser.error_msg != NULL:
        message += parser.error_msg.decode('utf-8')
    else:
        message += 'no error message set'

    raise ParserError(message)


def _concatenate_chunks(list chunks):
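    """
    Concatenate the per-column arrays from a list of chunk dicts into a
    single dict of column arrays, warning with DtypeWarning when a column's
    chunks disagree on dtype and the common type falls back to object.
    """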
    cdef:
        list names = list(chunks[0].keys())
        object name
        list warning_columns = []
        object warning_names
        object common_type

    result = {}
    for name in names:
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
        if len(numpy_dtypes) > 1:
            common_type = np.find_common_type(numpy_dtypes, [])
            if common_type == object:
                warning_columns.append(str(name))

        dtype = dtypes.pop()
        if is_categorical_dtype(dtype):
            sort_categories = isinstance(dtype, str)
            result[name] = union_categoricals(arrs,
                                              sort_categories=sort_categories)
        else:
            if is_extension_array_dtype(dtype):
                array_type = dtype.construct_array_type()
                result[name] = array_type._concat_same_type(arrs)
            else:
                result[name] = np.concatenate(arrs)

    if warning_columns:
        warning_names = ','.join(warning_columns)
        warning_message = (
            f"Columns ({warning_names}) have mixed types. "
            f"Specify dtype option on import or set low_memory=False."
        )
        warnings.warn(warning_message, DtypeWarning, stacklevel=8)
    return result
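
# For example, a column parsed as int64 in one chunk but as object (strings)
# in another yields a common type of object, so it is reported in the
# DtypeWarning above; specifying a dtype up front or reading with
# low_memory=False (a single chunk) avoids the disagreement.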


# ----------------------------------------------------------------------
# NA values
def _compute_na_values():
    int64info = np.iinfo(np.int64)
    int32info = np.iinfo(np.int32)
    int16info = np.iinfo(np.int16)
    int8info = np.iinfo(np.int8)
    uint64info = np.iinfo(np.uint64)
    uint32info = np.iinfo(np.uint32)
    uint16info = np.iinfo(np.uint16)
    uint8info = np.iinfo(np.uint8)
    na_values = {
        np.float64: np.nan,
        np.int64: int64info.min,
        np.int32: int32info.min,
        np.int16: int16info.min,
        np.int8: int8info.min,
        np.uint64: uint64info.max,
        np.uint32: uint32info.max,
        np.uint16: uint16info.max,
        np.uint8: uint8info.max,
        np.bool_: uint8info.max,
        np.object_: np.nan   # object columns fall back to NaN as well
    }
    return na_values


na_values = _compute_na_values()

for k in list(na_values):
    na_values[np.dtype(k)] = na_values[k]
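
# Both the scalar type and its dtype now key to the same sentinel, e.g.
# na_values[np.int64] and na_values[np.dtype('int64')] are both
# np.iinfo(np.int64).min.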


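# Apply a user-supplied converter `f` to every decoded token of a column,
# then let maybe_convert_objects infer a tighter dtype for the result.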
cdef _apply_converter(object f, parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end,
                      char* c_encoding):
    cdef:
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL
        char *errors = "strict"
        ndarray[object] result
        object val

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.object_)

    coliter_setup(&it, parser, col, line_start)

    if c_encoding == NULL or c_encoding == b'utf-8':
        for i in range(lines):
            COLITER_NEXT(it, word)
            val = PyUnicode_FromString(word)
            result[i] = f(val)
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            val = PyUnicode_Decode(word, strlen(word),
                                   c_encoding, errors)
            result[i] = f(val)

    return lib.maybe_convert_objects(result)


def _maybe_encode(values):
    if values is None:
        return []
    return [x.encode('utf-8') if isinstance(x, str) else x for x in values]
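
# e.g. _maybe_encode(['NA', b'null']) -> [b'NA', b'null'], and None -> [].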


def sanitize_objects(ndarray[object] values, set na_values,
                     bint convert_empty=True):
    """
    Convert the values in the set na_values, and empty strings if
    convert_empty is True, to np.nan, modifying `values` in place.

    Parameters
    ----------
    values : ndarray[object]
    na_values : set
    convert_empty : bool, default True

    Returns
    -------
    na_count : int
        The number of values replaced by np.nan.
    """
    cdef:
        Py_ssize_t i, n
        object val, onan
        Py_ssize_t na_count = 0
        dict memo = {}

    n = len(values)
    onan = np.nan

    for i in range(n):
        val = values[i]
        if (convert_empty and val == '') or (val in na_values):
            values[i] = onan
            na_count += 1
        elif val in memo:
            values[i] = memo[val]
        else:
            memo[val] = val

    return na_count
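
# A usage sketch (values is modified in place; the NA count is returned):
#
#     arr = np.array(['a', '', 'NA'], dtype=object)
#     sanitize_objects(arr, {'NA'})  # -> 2; arr becomes ['a', nan, nan]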