1 #ifndef dt_FREAD_H
2 #define dt_FREAD_H
3 #include "dt_stdio.h"  // PRId64
4 #include <stdint.h>    // uint32_t
5 #include <stdlib.h>    // size_t
6 #include <stdbool.h>   // bool
7 #include "myomp.h"
8 #ifdef DTPY
9   #include "py_fread.h"
10   #define ENC2NATIVE(s) (s)
11 #else
12   #include "freadR.h"
13   extern cetype_t ienc;
14   // R's message functions only take C's char pointer not SEXP, where encoding info can't be stored
15   // so must convert the error message char to native encoding first in order to correctly display in R
16   #define ENC2NATIVE(s) translateChar(mkCharCE(s, ienc))
17 #endif
18 
// Ordered hierarchy of types. Enumerators are listed in ascending order, so a
// numerically larger code is a later step in the hierarchy.
// NOTE(review): the CT_BOOL8_N/_U/_T/_L variants are not described here;
// presumably they are alternative spellings of booleans accepted by the parser
// -- confirm against the parser implementation.
typedef enum {
  NEG = -1,        // dummy to force signed type; sign bit used for out-of-sample type bump management
  CT_DROP = 0,     // skip column requested by user; it is navigated as a string column with the prevailing quoteRule
  CT_BOOL8_N,      // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1.
  CT_BOOL8_U,
  CT_BOOL8_T,
  CT_BOOL8_L,
  CT_INT32,        // int32_t
  CT_INT64,        // int64_t
  CT_FLOAT64,      // double (64-bit IEEE 754 float)
  CT_FLOAT64_EXT,  // double, with NAN/INF literals
  CT_FLOAT64_HEX,  // double, in hexadecimal format
  CT_ISO8601_DATE, // integer, as read from a date in ISO-8601 format
  CT_ISO8601_TIME, // double, as read from a timestamp in ISO-8601 time
  CT_STRING,       // lenOff struct below
  NUMTYPE          // placeholder for the number of types including drop; used for allocation and loop bounds
} colType;

38 extern int8_t typeSize[NUMTYPE];
39 extern const char typeName[NUMTYPE][10];
40 extern const long double pow10lookup[601];
41 extern const uint8_t hexdigits[256];
42 
43 
// Strings are pushed by fread_main using an offset from an anchor address plus
// string length; fread_impl.c then manages strings appropriately.
// A string field is thus described by a (length, offset) pair rather than by a
// copy of its characters.
typedef struct {
  int32_t len;  // signed to distinguish NA vs empty ""
  int32_t off;  // offset from the anchor address
} lenOff;


// Sentinel values that represent a missing value (NA) for each storage type.
// Integer types use the most negative value of the type.
#define NA_BOOL8         INT8_MIN
#define NA_INT32         INT32_MIN
#define NA_INT64         INT64_MIN
// Bit pattern of the double NA, expressed as a 64-bit integer: all exponent
// bits set with a non-zero mantissa (low word 0x7A2 == 1954), i.e. a NaN.
#define NA_FLOAT64_I64   0x7FF00000000007A2
#define NA_LENOFF        INT32_MIN  // lenOff.len only; lenOff.off undefined for NA



60 // *****************************************************************************
61 
62 typedef struct freadMainArgs
63 {
64   // Name of the file to open (a \0-terminated C string). If the file name
65   // contains non-ASCII characters, it should be UTF-8 encoded (however fread
66   // will not validate the encoding).
67   const char *filename;
68 
69   // Data buffer: a \0-terminated C string. When this parameter is given,
70   // fread() will read from the provided string. This parameter is exclusive
71   // with `filename`.
72   const char *input;
73 
74   // Maximum number of rows to read, or INT64_MAX to read the entire dataset.
75   // Note that even if `nrowLimit = 0`, fread() will scan a sample of rows in
76   // the file to detect column names and types (and other parsing settings).
77   int64_t nrowLimit;
78 
79   // Number of input lines to skip when reading the file.
80   int64_t skipNrow;
81 
82   // Skip to the line containing this string. This parameter cannot be used
83   // with `skipLines`.
84   const char *skipString;
85 
86   // NULL-terminated list of strings that should be converted into NA values.
87   // The last entry in this array is NULL (sentinel), which lets us know where
88   // the array ends.
89   const char * const* NAstrings;
90 
91   // Maximum number of threads. If 0, then fread will use the maximum possible
92   // number of threads, as determined by omp_get_max_threads(). If negative,
93   // then fread will use that many threads less than allowed maximum (but
94   // always at least 1).
95   int32_t nth;
96 
97   // Character to use for a field separator. Multi-character separators are not
98   // supported. If `sep` is '\0', then fread will autodetect it. A quotation
99   // mark '"' is not allowed as field separator.
100   char sep;
101 
102   // Decimal separator for numbers (usually '.'). This may coincide with `sep`,
103   // in which case floating-point numbers will have to be quoted. Multi-char
104   // (or non-ASCII) decimal separators are not supported. A quotation mark '"'
105   // is not allowed as decimal separator.
106   // See: https://en.wikipedia.org/wiki/Decimal_mark
107   char dec;
108 
109   // Character to use as a quotation mark (usually '"'). Pass '\0' to disable
110   // field quoting. This parameter cannot be auto-detected. Multi-character,
111   // non-ASCII, or different open/closing quotation marks are not supported.
112   char quote;
113 
114   // Is there a header at the beginning of the file?
115   // 0 = no, 1 = yes, -128 = autodetect
116   int8_t header;
117 
118   // Strip the whitespace from fields (usually True).
119   bool stripWhite;
120 
121   // If True, empty lines in the file will be skipped. Otherwise empty lines
122   // will produce rows of NAs.
123   bool skipEmptyLines;
124 
125   // If True, then rows are allowed to have variable number of columns, and
126   // all ragged rows will be filled with NAs on the right.
127   bool fill;
128 
129   // If True, then emit progress messages during the parsing.
130   bool showProgress;
131 
132   // Emit extra debug-level information.
133   bool verbose;
134 
135   // If true, then this field instructs `fread` to treat warnings as errors. In
136   // particular in R this setting is turned on whenever `option(warn=2)` is set,
137   // in which case calling the standard `warning()` raises an exception.
138   // However `fread` still needs to know that the exception will be raised, so
139   // that it can do proper cleanup / resource deallocation -- otherwise memory
140   // leaks would occur.
141   bool warningsAreErrors;
142 
143   // If true, then column of 0s and 1s will be read as logical, otherwise it
144   // will become integer.
145   bool logical01;
146 
147   bool keepLeadingZeros;
148 
149   // should datetime with no Z or UTZ-offset be read as UTC?
150   bool noTZasUTC;
151 
152   char _padding[1];
153 
154   // Any additional implementation-specific parameters.
155   FREAD_MAIN_ARGS_EXTRA_FIELDS
156 
157 } freadMainArgs;
158 
159 
160 
161 // *****************************************************************************
162 
163 typedef struct ThreadLocalFreadParsingContext
164 {
165   // Pointer that serves as a starting point for all offsets within the `lenOff`
166   // structs.
167   const char *__restrict__ anchor;
168 
169   // Output buffers for values with different alignment requirements. For
170   // example all `lenOff` columns, `double` columns and `int64` columns will be
171   // written to buffer `buff8`; at the same time `bool` and `int8` columns will
172   // be stored in memory buffer `buff1`.
173   // Within each buffer the data is stored in row-major order, i.e. in the same
174   // order as in the original CSV file.
175   void *__restrict__ buff8;
176   void *__restrict__ buff4;
177   void *__restrict__ buff1;
178 
179   // Size (in bytes) for a single row of data within the buffers `buff8`,
180   // `buff4` and `buff1` correspondingly.
181   size_t rowSize8;
182   size_t rowSize4;
183   size_t rowSize1;
184 
185   // Starting row index within the output DataTable for the current data chunk.
186   size_t DTi;
187 
188   // Number of rows currently being stored within the buffers. The allocation
189   // size of each `buffX` is thus at least `nRows * rowSizeX`.
190   size_t nRows;
191 
192   // Reference to the flag that controls the parser's execution. Setting this
193   // flag to true will force parsing of the CSV file to terminate in the near
194   // future.
195   bool *stopTeam;
196 
197   int threadn;
198 
199   int quoteRule;
200 
201   // Any additional implementation-specific parameters.
202   FREAD_PUSH_BUFFERS_EXTRA_FIELDS
203 
204 } ThreadLocalFreadParsingContext;
205 
206 
207 
208 // *****************************************************************************
209 
210 /**
211  * Fast parallel reading of CSV files with intelligent guessing of parse
212  * parameters.
213  *
214  * It should have been called just "fread", but that name is already defined in
215  * the system libraries...
216  */
217 int freadMain(freadMainArgs args);
218 
219 
220 /**
221  * This callback is invoked by `freadMain` after the initial pre-scan of the
222  * file, when all parsing parameters have been determined; most importantly the
223  * column names and their types.
224  *
225  * This function serves two purposes: first, it tells the upstream code what the
226  * detected column names are; and secondly what is the expected type of each
227  * column. The upstream code then has an opportunity to upcast the column types
228  * if requested by the user, or mark some columns as skipped.
229  *
230  * @param types
231  *    type codes of each column in the CSV file. Possible type codes are
232  *    described by the `colType` enum. The function may modify this array
233  *    setting some types to 0 (CT_DROP), or upcasting the types. Downcasting is
234  *    not allowed and will trigger an error from `freadMain` later on.
235  *
236  * @param colNames
237  *    array of `lenOff` structures (offsets are relative to the `anchor`)
238  *    describing the column names. If the CSV file had no header row, then this
239  *    array will be filled with 0s.
240  *
241  * @param anchor
242  *    pointer to a string buffer (usually somewhere inside the memory-mapped
243  *    file) within which the column names are located, as described by the
244  *    `colNames` array.
245  *
246  * @param ncol
247  *    total number of columns. This is the length of arrays `types` and
248  *    `colNames`.
249  *
250  * @return
251  *    this function may return `false` to request that fread abort reading
252  *    the CSV file. Normally, this function should return `true`.
253  */
254 bool userOverride(int8_t *types, lenOff *colNames, const char *anchor,
255                   const int ncol);
256 
257 
/**
 * This function is invoked by `freadMain` before the main scan of the input
 * file. It should allocate the resulting `DataTable` structure and prepare
 * to receive the data in chunks.
 *
 * Additionally, this function will be invoked if the main scan was
 * unsuccessful. This may happen either because there were out-of-sample type
 * exceptions (i.e. a value was found in one of the columns that wasn't
 * acceptable for that column's type), or if the initial estimate of the file's
 * number of rows turned out to be too conservative, and more rows have to be
 * appended to the DataTable.
 *
 * @param types
 *    array of type codes for each column. Same as in the `userOverride`
 *    function.
 *
 * @param sizes
 *    the size (in bytes) of each column within the buffer(s) that will be
 *    passed to `pushBuffer()` during the scan. This array should be saved for
 *    later use. It exists mostly for convenience, since the size of each
 *    non-skipped column may be determined from that column's type.
 *
 * @param ncols
 *    number of columns in the CSV file. This is the size of arrays `types` and
 *    `sizes`.
 *
 * @param ndrop
 *    count of columns with type CT_DROP. This parameter is provided for
 *    convenience, since it can always be computed from `types`. The resulting
 *    datatable will have `ncols - ndrop` columns.
 *
 * @param nrows
 *    the number of rows to allocate for the datatable. This number of rows is
 *    estimated during the initial pre-scan, and then adjusted upwards to
 *    account for possible variation. It is very unlikely that this number
 *    underestimates the final row count.
 *
 * @return
 *    this function should return the total size of the Datatable created (for
 *    reporting purposes). If the return value is 0, then it indicates an error
 *    and `fread` will abort.
 */
size_t allocateDT(int8_t *types, int8_t *sizes, int ncols, int ndrop,
                  size_t nrows);


304 /**
305  * Called once at the beginning of each thread before it starts scanning the
306  * input file. If the file needs to be rescanned because of out-of-type
307  * exceptions, this will be called again before the second scan.
308  */
309 void prepareThreadContext(ThreadLocalFreadParsingContext *ctx);
310 
311 
312 /**
313  * Give upstream the chance to modify the scanned buffers after the thread
314  * finished reading its chunk but before it enters the "ordered" section.
315  * Variable `ctx.DTi` is not available at this moment.
316  */
317 void postprocessBuffer(ThreadLocalFreadParsingContext *ctx);
318 
319 
320 /**
321  * Callback invoked within the "ordered" section for each thread. Only
322  * lightweight processing should be performed here, since this section stalls
323  * execution of any other thread!
324  */
325 void orderBuffer(ThreadLocalFreadParsingContext *ctx);
326 
327 
328 /**
329  * This function transfers the scanned input data into the final DataTable
330  * structure. It will be called many times, and from parallel threads (thus
331  * it should not attempt to modify any global variables). Its primary job is
332  * to transpose the data: convert from row-major order within each buffer
333  * into the column-major order for the resulting DataTable.
334  */
335 void pushBuffer(ThreadLocalFreadParsingContext *ctx);
336 
337 
/**
 * Called at the end to specify what the actual number of rows in the datatable
 * was. The function should adjust the datatable, reallocating the buffers if
 * necessary.
 * If the input file needs to be rescanned due to some columns having wrong
 * column types, then this function will be called once after the file is
 * finished scanning but before any calls to `reallocColType()`, and then the
 * second time after the entire input file was scanned again.
 *
 * @param nrows
 *    the actual number of rows in the datatable.
 */
void setFinalNrow(size_t nrows);


350 /**
351  * Free any srtuctures associated with the thread-local parsing context.
352  */
353 void freeThreadContext(ThreadLocalFreadParsingContext *ctx);
354 
355 
/**
 * Progress-reporting function.
 *
 * @param percent
 *    completion percentage, in the range [0, 100].
 *
 * @param ETA
 *    estimated time remaining, in seconds.
 */
void progress(int percent/*[0,100]*/, int ETA/*secs*/);


/**
 * NOTE(review): undocumented in the original header. Judging by the name, this
 * releases resources held by fread; the meaning of the returned flag should be
 * confirmed against the definition.
 */
bool freadCleanup(void);

/**
 * NOTE(review): undocumented in the original header; presumably returns the
 * current wall-clock time in seconds -- confirm against the definition.
 */
double wallclock(void);

365 #endif
366