1 /* 2 3 Copyright (c) 2012, Lambda Foundry, Inc., except where noted 4 5 Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause 6 BSD 7 8 See LICENSE for the license 9 10 */ 11 12 #ifndef PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ 13 #define PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ 14 15 #define PY_SSIZE_T_CLEAN 16 #include <Python.h> 17 18 #define ERROR_NO_DIGITS 1 19 #define ERROR_OVERFLOW 2 20 #define ERROR_INVALID_CHARS 3 21 22 #include "../headers/stdint.h" 23 #include "../inline_helper.h" 24 #include "../headers/portable.h" 25 26 #include "khash.h" 27 28 #define STREAM_INIT_SIZE 32 29 30 #define REACHED_EOF 1 31 #define CALLING_READ_FAILED 2 32 33 34 /* 35 36 C flat file parsing low level code for pandas / NumPy 37 38 */ 39 40 /* 41 * Common set of error types for the read_rows() and tokenize() 42 * functions. 43 */ 44 45 // #define VERBOSE 46 #if defined(VERBOSE) 47 #define TRACE(X) printf X; 48 #else 49 #define TRACE(X) 50 #endif // VERBOSE 51 52 #define PARSER_OUT_OF_MEMORY -1 53 54 /* 55 * TODO: Might want to couple count_rows() with read_rows() to avoid 56 * duplication of some file I/O. 57 */ 58 59 typedef enum { 60 START_RECORD, 61 START_FIELD, 62 ESCAPED_CHAR, 63 IN_FIELD, 64 IN_QUOTED_FIELD, 65 ESCAPE_IN_QUOTED_FIELD, 66 QUOTE_IN_QUOTED_FIELD, 67 EAT_CRNL, 68 EAT_CRNL_NOP, 69 EAT_WHITESPACE, 70 EAT_COMMENT, 71 EAT_LINE_COMMENT, 72 WHITESPACE_LINE, 73 START_FIELD_IN_SKIP_LINE, 74 IN_FIELD_IN_SKIP_LINE, 75 IN_QUOTED_FIELD_IN_SKIP_LINE, 76 QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE, 77 FINISHED 78 } ParserState; 79 80 typedef enum { 81 QUOTE_MINIMAL, 82 QUOTE_ALL, 83 QUOTE_NONNUMERIC, 84 QUOTE_NONE 85 } QuoteStyle; 86 87 typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, 88 int *status); 89 typedef int (*io_cleanup)(void *src); 90 91 typedef struct parser_t { 92 void *source; 93 io_callback cb_io; 94 io_cleanup cb_cleanup; 95 96 int64_t chunksize; // Number of bytes to prepare for each chunk 97 char *data; // pointer to data to be processed 98 int64_t datalen; // amount of data available 99 int64_t datapos; 100 101 // where to write out tokenized data 102 char *stream; 103 uint64_t stream_len; 104 uint64_t stream_cap; 105 106 // Store words in (potentially ragged) matrix for now, hmm 107 char **words; 108 int64_t *word_starts; // where we are in the stream 109 uint64_t words_len; 110 uint64_t words_cap; 111 uint64_t max_words_cap; // maximum word cap encountered 112 113 char *pword_start; // pointer to stream start of current field 114 int64_t word_start; // position start of current field 115 116 int64_t *line_start; // position in words for start of line 117 int64_t *line_fields; // Number of fields in each line 118 uint64_t lines; // Number of (good) lines observed 119 uint64_t file_lines; // Number of lines (including bad or skipped) 120 uint64_t lines_cap; // Vector capacity 121 122 // Tokenizing stuff 123 ParserState state; 124 int doublequote; /* is " represented by ""? */ 125 char delimiter; /* field separator */ 126 int delim_whitespace; /* delimit by consuming space/tabs instead */ 127 char quotechar; /* quote character */ 128 char escapechar; /* escape character */ 129 char lineterminator; 130 int skipinitialspace; /* ignore spaces following delimiter? */ 131 int quoting; /* style of quoting to write */ 132 133 char commentchar; 134 int allow_embedded_newline; 135 int strict; /* raise exception on bad CSV */ 136 137 int usecols; // Boolean: 1: usecols provided, 0: none provided 138 139 int expected_fields; 140 int error_bad_lines; 141 int warn_bad_lines; 142 143 // floating point options 144 char decimal; 145 char sci; 146 147 // thousands separator (comma, period) 148 char thousands; 149 150 int header; // Boolean: 1: has header, 0: no header 151 int64_t header_start; // header row start 152 uint64_t header_end; // header row end 153 154 void *skipset; 155 PyObject *skipfunc; 156 int64_t skip_first_N_rows; 157 int64_t skip_footer; 158 double (*double_converter)(const char *, char **, 159 char, char, char, int, int *, int *); 160 161 // error handling 162 char *warn_msg; 163 char *error_msg; 164 165 int skip_empty_lines; 166 } parser_t; 167 168 typedef struct coliter_t { 169 char **words; 170 int64_t *line_start; 171 int64_t col; 172 } coliter_t; 173 174 void coliter_setup(coliter_t *self, parser_t *parser, int i, int start); 175 176 #define COLITER_NEXT(iter, word) \ 177 do { \ 178 const int64_t i = *iter.line_start++ + iter.col; \ 179 word = i >= *iter.line_start ? "" : iter.words[i]; \ 180 } while (0) 181 182 parser_t *parser_new(void); 183 184 int parser_init(parser_t *self); 185 186 int parser_consume_rows(parser_t *self, size_t nrows); 187 188 int parser_trim_buffers(parser_t *self); 189 190 int parser_add_skiprow(parser_t *self, int64_t row); 191 192 int parser_set_skipfirstnrows(parser_t *self, int64_t nrows); 193 194 void parser_free(parser_t *self); 195 196 void parser_del(parser_t *self); 197 198 void parser_set_default_options(parser_t *self); 199 200 int tokenize_nrows(parser_t *self, size_t nrows); 201 202 int tokenize_all_rows(parser_t *self); 203 204 // Have parsed / type-converted a chunk of data 205 // and want to free memory from the token stream 206 207 typedef struct uint_state { 208 int seen_sint; 209 int seen_uint; 210 int seen_null; 211 } uint_state; 212 213 void uint_state_init(uint_state *self); 214 215 int uint64_conflict(uint_state *self); 216 217 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, 218 uint64_t uint_max, int *error, char tsep); 219 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, 220 int *error, char tsep); 221 double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, 222 int skip_trailing, int *error, int *maybe_int); 223 double precise_xstrtod(const char *p, char **q, char decimal, 224 char sci, char tsep, int skip_trailing, 225 int *error, int *maybe_int); 226 227 // GH-15140 - round_trip requires and acquires the GIL on its own 228 double round_trip(const char *p, char **q, char decimal, char sci, char tsep, 229 int skip_trailing, int *error, int *maybe_int); 230 int to_boolean(const char *item, uint8_t *val); 231 232 #endif // PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ 233