1 /*
2 
3 Copyright (c) 2012, Lambda Foundry, Inc., except where noted
4 
5 Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
6 BSD
7 
8 See LICENSE for the license
9 
10 */
11 
12 #ifndef PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
13 #define PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
14 
15 #define PY_SSIZE_T_CLEAN
16 #include <Python.h>
17 
/*
 * Error codes for string-to-number conversion.
 * NOTE(review): presumably these are the values stored through the `error`
 * out-parameters of the str_to_int64()/str_to_uint64() converters declared
 * later in this header -- confirm against the implementation.
 */
#define ERROR_NO_DIGITS     1
#define ERROR_OVERFLOW      2
#define ERROR_INVALID_CHARS 3
21 
22 #include "../headers/stdint.h"
23 #include "../inline_helper.h"
24 #include "../headers/portable.h"
25 
26 #include "khash.h"
27 
/*
 * Initial size constant for the token stream.
 * NOTE(review): the unit (bytes vs. elements) is not visible from this
 * header -- confirm where the stream buffer is allocated.
 */
#define STREAM_INIT_SIZE 32

/*
 * I/O status codes.
 * NOTE(review): presumably reported through the `status` out-parameter of
 * an io_callback -- confirm against the callback implementations.
 */
#define REACHED_EOF         1
#define CALLING_READ_FAILED 2
32 
33 
34 /*
35 
36   C flat file parsing low level code for pandas / NumPy
37 
38  */
39 
40 /*
41  *  Common set of error types for the read_rows() and tokenize()
42  *  functions.
43  */
44 
/*
 * Debug tracing.
 *
 * When VERBOSE is defined (e.g. by uncommenting the line below),
 * TRACE((fmt, ...)) expands to `printf (fmt, ...);`.  Otherwise it expands
 * to nothing, so its arguments are never evaluated.
 *
 * The argument must carry its own parentheses.  The expansion already ends
 * in a semicolon, so call sites do not need to supply one.
 */
// #define VERBOSE
#ifdef VERBOSE
#define TRACE(X) printf X;
#else
#define TRACE(X)
#endif  // VERBOSE
51 
/*
 * Status value signalling that a memory allocation failed.
 * Parenthesized so the negative literal always expands as a single operand,
 * whatever expression the macro is used in.
 */
#define PARSER_OUT_OF_MEMORY (-1)
53 
54 /*
55  *  TODO: Might want to couple count_rows() with read_rows() to avoid
56  *        duplication of some file I/O.
57  */
58 
/*
 * States of the tokenizer state machine (the type of parser_t.state).
 *
 * The enumerators are numbered consecutively from 0 in declaration order;
 * keep the order stable.
 * NOTE(review): the grouping below is inferred from the enumerator names;
 * the actual transitions live in the tokenizer implementation, not in this
 * header.
 */
typedef enum {
    /* record / field boundaries */
    START_RECORD = 0,
    START_FIELD,
    ESCAPED_CHAR,

    /* inside a field */
    IN_FIELD,
    IN_QUOTED_FIELD,
    ESCAPE_IN_QUOTED_FIELD,
    QUOTE_IN_QUOTED_FIELD,

    /* consuming line endings, whitespace and comments */
    EAT_CRNL,
    EAT_CRNL_NOP,
    EAT_WHITESPACE,
    EAT_COMMENT,
    EAT_LINE_COMMENT,
    WHITESPACE_LINE,

    /* counterparts used while inside a skipped line */
    START_FIELD_IN_SKIP_LINE,
    IN_FIELD_IN_SKIP_LINE,
    IN_QUOTED_FIELD_IN_SKIP_LINE,
    QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,

    FINISHED
} ParserState;
79 
/*
 * Quoting styles, numbered 0..3 in declaration order.  These values coincide
 * with Python's csv.QUOTE_MINIMAL, csv.QUOTE_ALL, csv.QUOTE_NONNUMERIC and
 * csv.QUOTE_NONE.
 * NOTE(review): parser_t.quoting is a plain int that presumably holds one of
 * these values -- confirm against the callers.
 */
typedef enum {
    QUOTE_MINIMAL = 0,
    QUOTE_ALL,
    QUOTE_NONNUMERIC,
    QUOTE_NONE
} QuoteStyle;
86 
/*
 * Callback type for reading input.
 *
 *   src        - the source object to read from
 *   nbytes     - byte count for the request
 *   bytes_read - out-parameter
 *   status     - out-parameter
 *
 * NOTE(review): presumably returns a pointer to the data read, stores the
 * number of bytes obtained in *bytes_read, and sets *status to REACHED_EOF
 * or CALLING_READ_FAILED on those conditions -- confirm against the
 * callback implementations.
 */
typedef void *(*io_callback)(void *src,
                             size_t nbytes,
                             size_t *bytes_read,
                             int *status);

/*
 * Callback type for releasing a source object; returns an int.
 * NOTE(review): the meaning of the return value is not visible here.
 */
typedef int (*io_cleanup)(void *src);
90 
91 typedef struct parser_t {
92     void *source;
93     io_callback cb_io;
94     io_cleanup cb_cleanup;
95 
96     int64_t chunksize;      // Number of bytes to prepare for each chunk
97     char *data;             // pointer to data to be processed
98     int64_t datalen;        // amount of data available
99     int64_t datapos;
100 
101     // where to write out tokenized data
102     char *stream;
103     uint64_t stream_len;
104     uint64_t stream_cap;
105 
106     // Store words in (potentially ragged) matrix for now, hmm
107     char **words;
108     int64_t *word_starts;   // where we are in the stream
109     uint64_t words_len;
110     uint64_t words_cap;
111     uint64_t max_words_cap;  // maximum word cap encountered
112 
113     char *pword_start;      // pointer to stream start of current field
114     int64_t word_start;     // position start of current field
115 
116     int64_t *line_start;    // position in words for start of line
117     int64_t *line_fields;   // Number of fields in each line
118     uint64_t lines;         // Number of (good) lines observed
119     uint64_t file_lines;    // Number of lines (including bad or skipped)
120     uint64_t lines_cap;     // Vector capacity
121 
122     // Tokenizing stuff
123     ParserState state;
124     int doublequote;      /* is " represented by ""? */
125     char delimiter;       /* field separator */
126     int delim_whitespace; /* delimit by consuming space/tabs instead */
127     char quotechar;       /* quote character */
128     char escapechar;      /* escape character */
129     char lineterminator;
130     int skipinitialspace; /* ignore spaces following delimiter? */
131     int quoting;          /* style of quoting to write */
132 
133     char commentchar;
134     int allow_embedded_newline;
135     int strict; /* raise exception on bad CSV */
136 
137     int usecols;  // Boolean: 1: usecols provided, 0: none provided
138 
139     int expected_fields;
140     int error_bad_lines;
141     int warn_bad_lines;
142 
143     // floating point options
144     char decimal;
145     char sci;
146 
147     // thousands separator (comma, period)
148     char thousands;
149 
150     int header;            // Boolean: 1: has header, 0: no header
151     int64_t header_start;  // header row start
152     uint64_t header_end;   // header row end
153 
154     void *skipset;
155     PyObject *skipfunc;
156     int64_t skip_first_N_rows;
157     int64_t skip_footer;
158     double (*double_converter)(const char *, char **,
159                                char, char, char, int, int *, int *);
160 
161     // error handling
162     char *warn_msg;
163     char *error_msg;
164 
165     int skip_empty_lines;
166 } parser_t;
167 
/*
 * Cursor for walking one column across lines; advanced by COLITER_NEXT,
 * which reads `words[*line_start + col]` and then steps `line_start`.
 */
typedef struct coliter_t {
    char **words;         // word table that is indexed
    int64_t *line_start;  // current position in an array of line starts
    int64_t col;          // offset added to each line start
} coliter_t;
173 
174 void coliter_setup(coliter_t *self, parser_t *parser, int i, int start);
175 
/*
 * COLITER_NEXT(iter, word)
 *
 * Store in `word` the entry of `iter`'s column for the current line, then
 * advance `iter` to the next line.
 *
 *   iter - a coliter_t lvalue (any lvalue expression, e.g. `it` or `*p`)
 *   word - an assignable `const char *` / `char *` lvalue
 *
 * The index is `*line_start + col`.  After line_start is advanced, the index
 * is compared with the NEXT line's start: if it reaches or passes it, this
 * line has no such column and `word` is set to "".  The line_start array
 * must therefore contain one readable entry beyond the current line.
 *
 * `iter` is expanded several times, so it must not have side effects.
 * Both arguments are parenthesized and the temporary has a reserved-looking
 * name so that callers may pass expressions such as `*p` or use a variable
 * named `i` without breaking the expansion.
 */
#define COLITER_NEXT(iter, word)                                          \
    do {                                                                  \
        const int64_t coliter_idx_ = *(iter).line_start++ + (iter).col;   \
        (word) = coliter_idx_ >= *(iter).line_start                       \
                     ? ""                                                 \
                     : (iter).words[coliter_idx_];                        \
    } while (0)
181 
182 parser_t *parser_new(void);
183 
184 int parser_init(parser_t *self);
185 
186 int parser_consume_rows(parser_t *self, size_t nrows);
187 
188 int parser_trim_buffers(parser_t *self);
189 
190 int parser_add_skiprow(parser_t *self, int64_t row);
191 
192 int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
193 
194 void parser_free(parser_t *self);
195 
196 void parser_del(parser_t *self);
197 
198 void parser_set_default_options(parser_t *self);
199 
200 int tokenize_nrows(parser_t *self, size_t nrows);
201 
202 int tokenize_all_rows(parser_t *self);
203 
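// NOTE(review): the remark below does not describe the declaration that
// follows it (uint_state); it appears to be left over from a declaration
// that is no longer present here.
// Have parsed / type-converted a chunk of data
// and want to free memory from the token stream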
206 
/*
 * Flags carried across unsigned-integer conversions; passed to
 * uint_state_init(), uint64_conflict() and str_to_uint64().
 * NOTE(review): presumably they record whether a signed (negative) value,
 * a value requiring uint64, and a null have been seen -- confirm against
 * the implementation.
 */
typedef struct uint_state {
    int seen_sint;
    int seen_uint;
    int seen_null;
} uint_state;
212 
213 void uint_state_init(uint_state *self);
214 
215 int uint64_conflict(uint_state *self);
216 
217 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
218                        uint64_t uint_max, int *error, char tsep);
/*
 * Convert the string `p_item` to int64_t.
 *
 *   int_min, int_max - bounds supplied by the caller
 *   error            - out-parameter (presumably one of the ERROR_* codes,
 *                      0 on success -- confirm)
 *   tsep             - thousands separator character
 */
int64_t str_to_int64(const char *p_item,
                     int64_t int_min,
                     int64_t int_max,
                     int *error,
                     char tsep);

/*
 * String-to-double converters.  All three share one signature, which is
 * also the type of parser_t.double_converter.
 *
 *   p             - input string
 *   q             - out-parameter (presumably the end-of-parse position, as
 *                   with strtod -- confirm)
 *   decimal       - decimal point character
 *   sci           - exponent marker character
 *   tsep          - thousands separator character
 *   skip_trailing - flag (presumably: skip trailing whitespace -- confirm)
 *   error         - out-parameter
 *   maybe_int     - out-parameter
 */
double xstrtod(const char *p, char **q,
               char decimal, char sci, char tsep,
               int skip_trailing, int *error, int *maybe_int);

double precise_xstrtod(const char *p, char **q,
                       char decimal, char sci, char tsep,
                       int skip_trailing, int *error, int *maybe_int);

// GH-15140 - round_trip requires and acquires the GIL on its own
double round_trip(const char *p, char **q,
                  char decimal, char sci, char tsep,
                  int skip_trailing, int *error, int *maybe_int);

/*
 * Parse `item` as a boolean and store the result in *val.
 * NOTE(review): the meaning of the int return value is not visible here.
 */
int to_boolean(const char *item, uint8_t *val);
231 
232 #endif  // PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
233