1 #ifndef dt_FREAD_H 2 #define dt_FREAD_H 3 #include "dt_stdio.h" // PRId64 4 #include <stdint.h> // uint32_t 5 #include <stdlib.h> // size_t 6 #include <stdbool.h> // bool 7 #include "myomp.h" 8 #ifdef DTPY 9 #include "py_fread.h" 10 #define ENC2NATIVE(s) (s) 11 #else 12 #include "freadR.h" 13 extern cetype_t ienc; 14 // R's message functions only take C's char pointer not SEXP, where encoding info can't be stored 15 // so must convert the error message char to native encoding first in order to correctly display in R 16 #define ENC2NATIVE(s) translateChar(mkCharCE(s, ienc)) 17 #endif 18 19 // Ordered hierarchy of types 20 typedef enum { 21 NEG = -1, // dummy to force signed type; sign bit used for out-of-sample type bump management 22 CT_DROP = 0, // skip column requested by user; it is navigated as a string column with the prevailing quoteRule 23 CT_BOOL8_N, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. 24 CT_BOOL8_U, 25 CT_BOOL8_T, 26 CT_BOOL8_L, 27 CT_INT32, // int32_t 28 CT_INT64, // int64_t 29 CT_FLOAT64, // double (64-bit IEEE 754 float) 30 CT_FLOAT64_EXT, // double, with NAN/INF literals 31 CT_FLOAT64_HEX, // double, in hexadecimal format 32 CT_ISO8601_DATE, // integer, as read from a date in ISO-8601 format 33 CT_ISO8601_TIME, // double, as read from a timestamp in ISO-8601 time 34 CT_STRING, // lenOff struct below 35 NUMTYPE // placeholder for the number of types including drop; used for allocation and loop bounds 36 } colType; 37 38 extern int8_t typeSize[NUMTYPE]; 39 extern const char typeName[NUMTYPE][10]; 40 extern const long double pow10lookup[601]; 41 extern const uint8_t hexdigits[256]; 42 43 44 // Strings are pushed by fread_main using an offset from an anchor address plus 45 // string length; fread_impl.c then manages strings appropriately 46 typedef struct { 47 int32_t len; // signed to distinguish NA vs empty "" 48 int32_t off; 49 } lenOff; 50 51 52 #define NA_BOOL8 INT8_MIN 53 #define NA_INT32 INT32_MIN 54 #define NA_INT64 INT64_MIN 55 #define NA_FLOAT64_I64 0x7FF00000000007A2 56 #define NA_LENOFF INT32_MIN // lenOff.len only; lenOff.off undefined for NA 57 58 59 60 // ***************************************************************************** 61 62 typedef struct freadMainArgs 63 { 64 // Name of the file to open (a \0-terminated C string). If the file name 65 // contains non-ASCII characters, it should be UTF-8 encoded (however fread 66 // will not validate the encoding). 67 const char *filename; 68 69 // Data buffer: a \0-terminated C string. When this parameter is given, 70 // fread() will read from the provided string. This parameter is exclusive 71 // with `filename`. 72 const char *input; 73 74 // Maximum number of rows to read, or INT64_MAX to read the entire dataset. 75 // Note that even if `nrowLimit = 0`, fread() will scan a sample of rows in 76 // the file to detect column names and types (and other parsing settings). 77 int64_t nrowLimit; 78 79 // Number of input lines to skip when reading the file. 80 int64_t skipNrow; 81 82 // Skip to the line containing this string. This parameter cannot be used 83 // with `skipLines`. 84 const char *skipString; 85 86 // NULL-terminated list of strings that should be converted into NA values. 87 // The last entry in this array is NULL (sentinel), which lets us know where 88 // the array ends. 89 const char * const* NAstrings; 90 91 // Maximum number of threads. If 0, then fread will use the maximum possible 92 // number of threads, as determined by omp_get_max_threads(). If negative, 93 // then fread will use that many threads less than allowed maximum (but 94 // always at least 1). 95 int32_t nth; 96 97 // Character to use for a field separator. Multi-character separators are not 98 // supported. If `sep` is '\0', then fread will autodetect it. A quotation 99 // mark '"' is not allowed as field separator. 100 char sep; 101 102 // Decimal separator for numbers (usually '.'). This may coincide with `sep`, 103 // in which case floating-point numbers will have to be quoted. Multi-char 104 // (or non-ASCII) decimal separators are not supported. A quotation mark '"' 105 // is not allowed as decimal separator. 106 // See: https://en.wikipedia.org/wiki/Decimal_mark 107 char dec; 108 109 // Character to use as a quotation mark (usually '"'). Pass '\0' to disable 110 // field quoting. This parameter cannot be auto-detected. Multi-character, 111 // non-ASCII, or different open/closing quotation marks are not supported. 112 char quote; 113 114 // Is there a header at the beginning of the file? 115 // 0 = no, 1 = yes, -128 = autodetect 116 int8_t header; 117 118 // Strip the whitespace from fields (usually True). 119 bool stripWhite; 120 121 // If True, empty lines in the file will be skipped. Otherwise empty lines 122 // will produce rows of NAs. 123 bool skipEmptyLines; 124 125 // If True, then rows are allowed to have variable number of columns, and 126 // all ragged rows will be filled with NAs on the right. 127 bool fill; 128 129 // If True, then emit progress messages during the parsing. 130 bool showProgress; 131 132 // Emit extra debug-level information. 133 bool verbose; 134 135 // If true, then this field instructs `fread` to treat warnings as errors. In 136 // particular in R this setting is turned on whenever `option(warn=2)` is set, 137 // in which case calling the standard `warning()` raises an exception. 138 // However `fread` still needs to know that the exception will be raised, so 139 // that it can do proper cleanup / resource deallocation -- otherwise memory 140 // leaks would occur. 141 bool warningsAreErrors; 142 143 // If true, then column of 0s and 1s will be read as logical, otherwise it 144 // will become integer. 145 bool logical01; 146 147 bool keepLeadingZeros; 148 149 // should datetime with no Z or UTZ-offset be read as UTC? 150 bool noTZasUTC; 151 152 char _padding[1]; 153 154 // Any additional implementation-specific parameters. 155 FREAD_MAIN_ARGS_EXTRA_FIELDS 156 157 } freadMainArgs; 158 159 160 161 // ***************************************************************************** 162 163 typedef struct ThreadLocalFreadParsingContext 164 { 165 // Pointer that serves as a starting point for all offsets within the `lenOff` 166 // structs. 167 const char *__restrict__ anchor; 168 169 // Output buffers for values with different alignment requirements. For 170 // example all `lenOff` columns, `double` columns and `int64` columns will be 171 // written to buffer `buff8`; at the same time `bool` and `int8` columns will 172 // be stored in memory buffer `buff1`. 173 // Within each buffer the data is stored in row-major order, i.e. in the same 174 // order as in the original CSV file. 175 void *__restrict__ buff8; 176 void *__restrict__ buff4; 177 void *__restrict__ buff1; 178 179 // Size (in bytes) for a single row of data within the buffers `buff8`, 180 // `buff4` and `buff1` correspondingly. 181 size_t rowSize8; 182 size_t rowSize4; 183 size_t rowSize1; 184 185 // Starting row index within the output DataTable for the current data chunk. 186 size_t DTi; 187 188 // Number of rows currently being stored within the buffers. The allocation 189 // size of each `buffX` is thus at least `nRows * rowSizeX`. 190 size_t nRows; 191 192 // Reference to the flag that controls the parser's execution. Setting this 193 // flag to true will force parsing of the CSV file to terminate in the near 194 // future. 195 bool *stopTeam; 196 197 int threadn; 198 199 int quoteRule; 200 201 // Any additional implementation-specific parameters. 202 FREAD_PUSH_BUFFERS_EXTRA_FIELDS 203 204 } ThreadLocalFreadParsingContext; 205 206 207 208 // ***************************************************************************** 209 210 /** 211 * Fast parallel reading of CSV files with intelligent guessing of parse 212 * parameters. 213 * 214 * It should have been called just "fread", but that name is already defined in 215 * the system libraries... 216 */ 217 int freadMain(freadMainArgs args); 218 219 220 /** 221 * This callback is invoked by `freadMain` after the initial pre-scan of the 222 * file, when all parsing parameters have been determined; most importantly the 223 * column names and their types. 224 * 225 * This function serves two purposes: first, it tells the upstream code what the 226 * detected column names are; and secondly what is the expected type of each 227 * column. The upstream code then has an opportunity to upcast the column types 228 * if requested by the user, or mark some columns as skipped. 229 * 230 * @param types 231 * type codes of each column in the CSV file. Possible type codes are 232 * described by the `colType` enum. The function may modify this array 233 * setting some types to 0 (CT_DROP), or upcasting the types. Downcasting is 234 * not allowed and will trigger an error from `freadMain` later on. 235 * 236 * @param colNames 237 * array of `lenOff` structures (offsets are relative to the `anchor`) 238 * describing the column names. If the CSV file had no header row, then this 239 * array will be filled with 0s. 240 * 241 * @param anchor 242 * pointer to a string buffer (usually somewhere inside the memory-mapped 243 * file) within which the column names are located, as described by the 244 * `colNames` array. 245 * 246 * @param ncol 247 * total number of columns. This is the length of arrays `types` and 248 * `colNames`. 249 * 250 * @return 251 * this function may return `false` to request that fread abort reading 252 * the CSV file. Normally, this function should return `true`. 253 */ 254 bool userOverride(int8_t *types, lenOff *colNames, const char *anchor, 255 const int ncol); 256 257 258 /** 259 * This function is invoked by `freadMain` before the main scan of the input 260 * file. It should allocate the resulting `DataTable` structure and prepare 261 * to receive the data in chunks. 262 * 263 * Additionally, this function will be invoked if the main scan was 264 * unsuccessful. This may happen either because there were out-of-sample type 265 * exceptions (i.e. a value was found in one of the columns that wasn't 266 * acceptable for that column's type), or if the initial estimate of the file's 267 * number of rows turned out to be too conservative, and more rows has to be 268 * appended to the DataTable. 269 * 270 * @param types 271 * array of type codes for each column. Same as in the `userOverride` 272 * function. 273 * 274 * @param sizes 275 * the size (in bytes) of each column within the buffer(s) that will be 276 * passed to `pushBuffer()` during the scan. This array should be saved for 277 * later use. It exists mostly for convenience, since the size of each 278 * non-skipped column may be determined from that column's type. 279 * 280 * @param ncols 281 * number of columns in the CSV file. This is the size of arrays `types` and 282 * `sizes`. 283 * 284 * @param ndrop 285 * count of columns with type CT_DROP. This parameter is provided for 286 * convenience, since it can always be computed from `types`. The resulting 287 * datatable will have `ncols - ndrop` columns. 288 * 289 * @param nrows 290 * the number of rows to allocate for the datatable. This number of rows is 291 * estimated during the initial pre-scan, and then adjusted upwards to 292 * account for possible variation. It is very unlikely that this number 293 * underestimates the final row count. 294 * 295 * @return 296 * this function should return the total size of the Datatable created (for 297 * reporting purposes). If the return value is 0, then it indicates an error 298 * and `fread` will abort. 299 */ 300 size_t allocateDT(int8_t *types, int8_t *sizes, int ncols, int ndrop, 301 size_t nrows); 302 303 304 /** 305 * Called once at the beginning of each thread before it starts scanning the 306 * input file. If the file needs to be rescanned because of out-of-type 307 * exceptions, this will be called again before the second scan. 308 */ 309 void prepareThreadContext(ThreadLocalFreadParsingContext *ctx); 310 311 312 /** 313 * Give upstream the chance to modify the scanned buffers after the thread 314 * finished reading its chunk but before it enters the "ordered" section. 315 * Variable `ctx.DTi` is not available at this moment. 316 */ 317 void postprocessBuffer(ThreadLocalFreadParsingContext *ctx); 318 319 320 /** 321 * Callback invoked within the "ordered" section for each thread. Only 322 * lightweight processing should be performed here, since this section stalls 323 * execution of any other thread! 324 */ 325 void orderBuffer(ThreadLocalFreadParsingContext *ctx); 326 327 328 /** 329 * This function transfers the scanned input data into the final DataTable 330 * structure. It will be called many times, and from parallel threads (thus 331 * it should not attempt to modify any global variables). Its primary job is 332 * to transpose the data: convert from row-major order within each buffer 333 * into the column-major order for the resulting DataTable. 334 */ 335 void pushBuffer(ThreadLocalFreadParsingContext *ctx); 336 337 338 /** 339 * Called at the end to specify what the actual number of rows in the datatable 340 * was. The function should adjust the datatable, reallocing the buffers if 341 * necessary. 342 * If the input file needs to be rescanned due to some columns having wrong 343 * column types, then this function will be called once after the file is 344 * finished scanning but before any calls to `reallocColType()`, and then the 345 * second time after the entire input file was scanned again. 346 */ 347 void setFinalNrow(size_t nrows); 348 349 350 /** 351 * Free any srtuctures associated with the thread-local parsing context. 352 */ 353 void freeThreadContext(ThreadLocalFreadParsingContext *ctx); 354 355 356 /** 357 * Progress-reporting function. 358 */ 359 void progress(int percent/*[0,100]*/, int ETA/*secs*/); 360 361 362 bool freadCleanup(void); 363 double wallclock(void); 364 365 #endif 366