1 #include "fread.h"
2 #include "freadR.h"
3 #include "data.table.h"
4
5 /***** TO DO *****
6 Restore test 1339 (balanced embedded quotes, see ?fread already updated).
7 Confirm: http://stackoverflow.com/questions/23833294/data-tablefread-doesnt-like-missing-values-in-first-column
8 construct test and investigate skip for completeness here: http://stackoverflow.com/questions/22086780/data-table-fread-error
9 http://stackoverflow.com/questions/22229109/r-data-table-fread-command-how-to-read-large-files-with-irregular-separators
10 And even more diagnostics to verbose=true so we can see where crashes are.
11 Detect and coerce dates and times. By searching for - and :, and dateTtime etc, or R's own method or fasttime. POSIXct default, for microseconds? : http://stackoverflow.com/questions/14056370/cast-string-to-idatetime
12 Add as.colClasses to fread.R after return from C level (e.g. for colClasses "Date", although as slow as read.csv via character)
13 Allow comment char to ignore. Important in format detection. But require valid line data before comment character in the read loop? See http://stackoverflow.com/a/18922269/403310
14 Deal with row.names e.g. http://stackoverflow.com/questions/15448732/reading-csv-with-row-names-by-fread
15 Test Garrett's two files again (wrap around ,,,,,, and different row lengths that the wc -l now fixes)
16 Post from patricknik on 5 Jan re ""b"" in a field. And Aykut Firat on email.
17 Save repeated ch<eof checking in main read step. Last line might still be tricky if last line has no eol.
18 Test using at least "grep read.table ...Rtrunk/tests/
19 Look for any non-alpha-numeric characters in the output and try each of them. That way can handle bell character as well and save testing separators which aren't there.
20 Just one "NA" current default but empty in future when numerics handle NA string variants directly.
21 ---
22 Secondary separator for list() columns, such as columns 11 and 12 in BED (no need for strsplit).
23 *****/
24
25 #define NUT NUMTYPE+2 // +1 for "numeric" alias for "double"; +1 for CLASS fallback using as.class() at R level afterwards
26
27 static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP };
28 static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" };
29 static int typeEnum[NUT] = {CT_DROP, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING};
30 static colType readInt64As=CT_INT64;
31 static SEXP selectSxp;
32 static SEXP dropSxp;
33 static SEXP colClassesSxp;
34 static bool selectColClasses = false;
35 cetype_t ienc = CE_NATIVE;
36 static SEXP RCHK;
37 static SEXP DT;
38 static SEXP colNamesSxp;
39 static SEXP colClassesAs; // the classes like factor, POSIXct which are currently done afterwards at R level: strings don't match typeRName above => NUT / "CLASS"
40 static SEXP selectRank; // C level returns the column reording vector to be done by setcolorder() at R level afterwards
41 static int8_t *type;
42 static int8_t *size;
43 static int ncol = 0;
44 static int64_t dtnrows = 0;
45 static bool verbose = false;
46 static bool warningsAreErrors = false;
47 static bool oldNoDateTime = false;
48
49
freadR(SEXP inputArg,SEXP sepArg,SEXP decArg,SEXP quoteArg,SEXP headerArg,SEXP nrowLimitArg,SEXP skipArg,SEXP NAstringsArg,SEXP stripWhiteArg,SEXP skipEmptyLinesArg,SEXP fillArg,SEXP showProgressArg,SEXP nThreadArg,SEXP verboseArg,SEXP warnings2errorsArg,SEXP logical01Arg,SEXP selectArg,SEXP dropArg,SEXP colClassesArg,SEXP integer64Arg,SEXP encodingArg,SEXP keepLeadingZerosArgs,SEXP noTZasUTC)50 SEXP freadR(
51 // params passed to freadMain
52 SEXP inputArg,
53 SEXP sepArg,
54 SEXP decArg,
55 SEXP quoteArg,
56 SEXP headerArg,
57 SEXP nrowLimitArg,
58 SEXP skipArg,
59 SEXP NAstringsArg,
60 SEXP stripWhiteArg,
61 SEXP skipEmptyLinesArg,
62 SEXP fillArg,
63 SEXP showProgressArg,
64 SEXP nThreadArg,
65 SEXP verboseArg,
66 SEXP warnings2errorsArg,
67 SEXP logical01Arg,
68
69 // extras needed by callbacks from freadMain
70 SEXP selectArg,
71 SEXP dropArg,
72 SEXP colClassesArg,
73 SEXP integer64Arg,
74 SEXP encodingArg,
75 SEXP keepLeadingZerosArgs,
76 SEXP noTZasUTC
77 ) {
78 verbose = LOGICAL(verboseArg)[0];
79 warningsAreErrors = LOGICAL(warnings2errorsArg)[0];
80
81 freadMainArgs args;
82 ncol = 0;
83 dtnrows = 0;
84 const char *ch, *ch2;
85 if (!isString(inputArg) || LENGTH(inputArg)!=1)
86 error(_("Internal error: freadR input not a single character string: a filename or the data itself. Should have been caught at R level.")); // # nocov
87 ch = ch2 = (const char *)CHAR(STRING_ELT(inputArg,0));
88 while (*ch2!='\n' && *ch2!='\r' && *ch2!='\0') ch2++;
89 args.input = (*ch2=='\0') ? R_ExpandFileName(ch) : ch; // for convenience so user doesn't have to call path.expand()
90
91 ch = args.input;
92 while (*ch!='\0' && *ch!='\n' && *ch!='\r') ch++;
93 if (*ch!='\0' || args.input[0]=='\0') {
94 if (verbose) DTPRINT(_("Input contains a \\n or is \")\". Taking this to be text input (not a filename)\n"));
95 args.filename = NULL;
96 } else {
97 if (verbose) DTPRINT(_("Input contains no \\n. Taking this to be a filename to open\n"));
98 args.filename = args.input;
99 args.input = NULL;
100 }
101
102 if (!isString(sepArg) || LENGTH(sepArg)!=1 || strlen(CHAR(STRING_ELT(sepArg,0)))>1)
103 error(_("Internal error: freadR sep not a single character. R level catches this.")); // # nocov
104 args.sep = CHAR(STRING_ELT(sepArg,0))[0]; // '\0' when default "auto" was replaced by "" at R level
105
106 if (!(isString(decArg) && LENGTH(decArg)==1 && strlen(CHAR(STRING_ELT(decArg,0)))==1))
107 error(_("Internal error: freadR dec not a single character. R level catches this.")); // # nocov
108 args.dec = CHAR(STRING_ELT(decArg,0))[0];
109
110 if (IS_FALSE(quoteArg)) {
111 args.quote = '\0';
112 } else {
113 if (!isString(quoteArg) || LENGTH(quoteArg)!=1 || strlen(CHAR(STRING_ELT(quoteArg,0))) > 1)
114 error(_("quote= must be a single character, blank \"\", or FALSE"));
115 args.quote = CHAR(STRING_ELT(quoteArg,0))[0];
116 }
117
118 // header is the only boolean where NA is valid and means 'auto'.
119 // LOGICAL in R is signed 32 bits with NA_LOGICAL==INT_MIN, currently.
120 args.header = false;
121 if (LOGICAL(headerArg)[0]==NA_LOGICAL) args.header = NA_BOOL8;
122 else if (LOGICAL(headerArg)[0]==TRUE) args.header = true;
123
124 args.nrowLimit = INT64_MAX;
125 // checked at R level
126 if (isReal(nrowLimitArg)) {
127 if (R_FINITE(REAL(nrowLimitArg)[0]) && REAL(nrowLimitArg)[0]>=0.0) args.nrowLimit = (int64_t)(REAL(nrowLimitArg)[0]);
128 } else {
129 if (INTEGER(nrowLimitArg)[0]>=1) args.nrowLimit = (int64_t)INTEGER(nrowLimitArg)[0];
130 }
131
132 args.logical01 = LOGICAL(logical01Arg)[0];
133 {
134 SEXP tt = PROTECT(GetOption(sym_old_fread_datetime_character, R_NilValue));
135 args.oldNoDateTime = oldNoDateTime = isLogical(tt) && LENGTH(tt)==1 && LOGICAL(tt)[0]==TRUE;
136 UNPROTECT(1);
137 }
138 args.skipNrow=-1;
139 args.skipString=NULL;
140 if (isString(skipArg)) {
141 args.skipString = CHAR(STRING_ELT(skipArg,0)); // LENGTH==1 was checked at R level
142 } else if (isInteger(skipArg)) {
143 args.skipNrow = (int64_t)INTEGER(skipArg)[0];
144 } else error(_("Internal error: skip not integer or string in freadR.c")); // # nocov
145
146 if (!isNull(NAstringsArg) && !isString(NAstringsArg))
147 error(_("Internal error: NAstringsArg is type '%s'. R level catches this"), type2char(TYPEOF(NAstringsArg))); // # nocov
148 int nnas = length(NAstringsArg);
149 const char **NAstrings = (const char **)R_alloc((nnas + 1), sizeof(char*)); // +1 for the final NULL to save a separate nna variable
150 for (int i=0; i<nnas; i++)
151 NAstrings[i] = CHAR(STRING_ELT(NAstringsArg,i));
152 NAstrings[nnas] = NULL;
153 args.NAstrings = NAstrings;
154
155 // here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL
156 args.stripWhite = LOGICAL(stripWhiteArg)[0];
157 args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0];
158 args.fill = LOGICAL(fillArg)[0];
159 args.showProgress = LOGICAL(showProgressArg)[0];
160 if (INTEGER(nThreadArg)[0]<1) error(_("nThread(%d)<1"), INTEGER(nThreadArg)[0]);
161 args.nth = (uint32_t)INTEGER(nThreadArg)[0];
162 args.verbose = verbose;
163 args.warningsAreErrors = warningsAreErrors;
164 args.keepLeadingZeros = LOGICAL(keepLeadingZerosArgs)[0];
165 args.noTZasUTC = LOGICAL(noTZasUTC)[0];
166
167 // === extras used for callbacks ===
168 if (!isString(integer64Arg) || LENGTH(integer64Arg)!=1) error(_("'integer64' must be a single character string"));
169 const char *tt = CHAR(STRING_ELT(integer64Arg,0));
170 if (strcmp(tt, "integer64")==0) {
171 readInt64As = CT_INT64;
172 } else if (strcmp(tt, "character")==0) {
173 readInt64As = CT_STRING;
174 } else if (strcmp(tt,"double")==0 || strcmp(tt,"numeric")==0) {
175 readInt64As = CT_FLOAT64;
176 } else STOP(_("Invalid value integer64='%s'. Must be 'integer64', 'character', 'double' or 'numeric'"), tt);
177
178 colClassesSxp = colClassesArg;
179
180 selectSxp = selectArg;
181 dropSxp = dropArg;
182 selectColClasses = false;
183 if (!isNull(selectSxp)) {
184 if (!isNull(dropSxp)) STOP(_("Use either select= or drop= but not both."));
185 if (isNewList(selectArg)) {
186 if (!isNull(colClassesSxp))
187 STOP(_("select= is type list for specifying types in select=, but colClasses= has been provided as well. Please remove colClasses=."));
188 if (!length(getAttrib(selectArg, R_NamesSymbol)))
189 STOP(_("select= is type list but has no names; expecting list(type1=cols1, type2=cols2, ...)"));
190 colClassesSxp = selectArg;
191 selectColClasses = true;
192 selectSxp = R_NilValue;
193 } else {
194 if (!isNull(getAttrib(selectArg, R_NamesSymbol))) {
195 if (!isNull(colClassesSxp))
196 STOP(_("select= is a named vector specifying the columns to select and their types, but colClasses= has been provided as well. Please remove colClasses=."));
197 colClassesSxp = selectArg;
198 selectSxp = getAttrib(selectArg, R_NamesSymbol);
199 selectColClasses = true;
200 }
201 }
202 } else {
203 if (TYPEOF(colClassesSxp)==VECSXP && !length(getAttrib(colClassesSxp, R_NamesSymbol)))
204 STOP(_("colClasses is type list but has no names"));
205 }
206
207 // Encoding, #563: Borrowed from do_setencoding from base R
208 // https://github.com/wch/r-source/blob/ca5348f0b5e3f3c2b24851d7aff02de5217465eb/src/main/util.c#L1115
209 // Check for mkCharLenCE function to locate as to where where this is implemented.
210 tt = CHAR(STRING_ELT(encodingArg, 0));
211 if (strcmp(tt, "unknown")==0) ienc = CE_NATIVE;
212 else if (strcmp(tt, "Latin-1")==0) ienc = CE_LATIN1;
213 else if (strcmp(tt, "UTF-8")==0) ienc = CE_UTF8;
214 else STOP(_("encoding='%s' invalid. Must be 'unknown', 'Latin-1' or 'UTF-8'"), tt); // # nocov
215 // === end extras ===
216
217 RCHK = PROTECT(allocVector(VECSXP, 4));
218 // see kalibera/rchk#9 and Rdatatable/data.table#2865. To avoid rchk false positives.
219 // allocateDT() assigns DT to position 0. userOverride() assigns colNamesSxp to position 1 and colClassesAs to position 2 (both used in allocateDT())
220 freadMain(args);
221 UNPROTECT(1);
222 return DT;
223 }
224
applyDrop(SEXP items,int8_t * type,int ncol,int dropSource)225 static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) {
226 if (!length(items)) return;
227 SEXP itemsInt = PROTECT( isString(items) ? chmatch(items, colNamesSxp, NA_INTEGER) : coerceVector(items, INTSXP) );
228 const int *itemsD = INTEGER(itemsInt), n=LENGTH(itemsInt);
229 for (int j=0; j<n; ++j) {
230 int k = itemsD[j];
231 if (k==NA_INTEGER || k<1 || k>ncol) {
232 static char buff[51];
233 if (dropSource==-1) snprintf(buff, 50, "drop[%d]", j+1);
234 else snprintf(buff, 50, "colClasses[[%d]][%d]", dropSource+1, j+1);
235 if (k==NA_INTEGER) {
236 if (isString(items))
237 DTWARN(_("Column name '%s' (%s) not found"), CHAR(STRING_ELT(items, j)), buff);
238 else
239 DTWARN(_("%s is NA"), buff);
240 } else {
241 DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol);
242 }
243 } else {
244 type[k-1] = CT_DROP;
245 // aside: dropping the same column several times is acceptable with no warning. This could arise via duplicates in the drop= vector,
246 // or specifying the same column to drop using NULLs in colClasses and drop= too.
247 }
248 }
249 UNPROTECT(1);
250 }
251
userOverride(int8_t * type,lenOff * colNames,const char * anchor,const int ncol)252 bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int ncol)
253 {
254 // use typeSize superfluously to avoid not-used warning; otherwise could move typeSize from fread.h into fread.c
255 if (typeSize[CT_BOOL8_N]!=1) STOP(_("Internal error: typeSize[CT_BOOL8_N] != 1")); // # nocov
256 if (typeSize[CT_STRING]!=8) STOP(_("Internal error: typeSize[CT_STRING] != 1")); // # nocov
257 colNamesSxp = R_NilValue;
258 SET_VECTOR_ELT(RCHK, 1, colNamesSxp=allocVector(STRSXP, ncol));
259 for (int i=0; i<ncol; i++) {
260 SEXP elem;
261 if (colNames==NULL || colNames[i].len<=0) {
262 char buff[12];
263 snprintf(buff,12,"V%d",i+1);
264 elem = mkChar(buff); // no PROTECT as passed immediately to SET_STRING_ELT
265 } else {
266 elem = mkCharLenCE(anchor+colNames[i].off, colNames[i].len, ienc); // no PROTECT as passed immediately to SET_STRING_ELT
267 }
268 SET_STRING_ELT(colNamesSxp, i, elem);
269 }
270 // "use either select= or drop= but not both" was checked earlier in freadR
271 applyDrop(dropSxp, type, ncol, /*dropSource=*/-1);
272 if (TYPEOF(colClassesSxp)==VECSXP) { // not isNewList() because that returns true for NULL
273 SEXP listNames = PROTECT(getAttrib(colClassesSxp, R_NamesSymbol)); // rchk wanted this protected
274 for (int i=0; i<LENGTH(colClassesSxp); ++i) {
275 if (STRING_ELT(listNames, i) == char_NULL) {
276 SEXP items = VECTOR_ELT(colClassesSxp,i);
277 applyDrop(items, type, ncol, /*dropSource=*/i);
278 }
279 }
280 UNPROTECT(1); // listNames
281 }
282 selectRank = NULL;
283 const int *selectInts = NULL; // if select is provided this will point to 1-based ints of the column numbers (which might already be the input as-is)
284 int nprotect = 0; // just used for select; other protects are specifically balanced within loops to save the protection stack, whereas select is long-lived or no-alloc.
285 if (length(selectSxp)) {
286 const int n = length(selectSxp);
287 if (isString(selectSxp)) {
288 selectInts = INTEGER(PROTECT(chmatch(selectSxp, colNamesSxp, NA_INTEGER))); nprotect++;
289 for (int i=0; i<n; ++i) if (selectInts[i]==NA_INTEGER)
290 DTWARN(_("Column name '%s' not found in column name header (case sensitive), skipping."), CHAR(STRING_ELT(selectSxp, i)));
291 } else {
292 if (!isInteger(selectSxp)) { selectSxp=PROTECT(coerceVector(selectSxp, INTSXP)); nprotect++; } // coerce numeric to int
293 selectInts = INTEGER(selectSxp);
294 }
295 SET_VECTOR_ELT(RCHK, 3, selectRank=allocVector(INTSXP, ncol));
296 int *selectRankD = INTEGER(selectRank), rank = 1;
297 for (int i=0; i<n; ++i) {
298 int k = selectInts[i];
299 if (k==NA_INTEGER) continue; // missing column name warned above and skipped
300 if (k<0) STOP(_("Column number %d (select[%d]) is negative but should be in the range [1,ncol=%d]. Consider drop= for column exclusion."),k,i+1,ncol);
301 if (k==0) STOP(_("select = 0 (select[%d]) has no meaning. All values of select should be in the range [1,ncol=%d]."),i+1,ncol);
302 if (k>ncol) STOP(_("Column number %d (select[%d]) is too large for this table, which only has %d columns."),k,i+1,ncol);
303 if (type[k-1]<0) STOP(_("Column number %d ('%s') has been selected twice by select="), k, CHAR(STRING_ELT(colNamesSxp,k-1)));
304 type[k-1] *= -1; // detect and error on duplicates on all types without calling duplicated() at all
305 selectRankD[k-1] = rank++; // rank not i to skip missing column names
306 }
307 for (int i=0; i<ncol; ++i) {
308 if (type[i]<0) type[i] *= -1;
309 else type[i]=CT_DROP;
310 }
311 }
312 colClassesAs = NULL;
313 if (length(colClassesSxp)) {
314 SEXP typeRName_sxp = PROTECT(allocVector(STRSXP, NUT));
315 for (int i=0; i<NUT; i++) SET_STRING_ELT(typeRName_sxp, i, mkChar(typeRName[i]));
316 if (oldNoDateTime) {
317 // prevent colClasses="IDate"/"POSIXct" being recognized so that colClassesAs is assigned here ready for type massage after reading at R level; test 2150.14
318 SET_STRING_ELT(typeRName_sxp, CT_ISO8601_DATE, R_BlankString);
319 SET_STRING_ELT(typeRName_sxp, CT_ISO8601_TIME, R_BlankString);
320 }
321 SET_VECTOR_ELT(RCHK, 2, colClassesAs=allocVector(STRSXP, ncol)); // if any, this attached to the DT for R level to call as_ methods on
322 if (isString(colClassesSxp)) {
323 SEXP typeEnum_idx = PROTECT(chmatch(colClassesSxp, typeRName_sxp, NUT));
324 if (selectColClasses==false) {
325 if (LENGTH(colClassesSxp)!=ncol && LENGTH(colClassesSxp)!=1)
326 STOP(_("colClasses= is an unnamed vector of types, length %d, but there are %d columns in the input. To specify types for a subset of columns, you can use "
327 "a named vector, list format, or specify types using select= instead of colClasses=. Please see examples in ?fread."), LENGTH(colClassesSxp), ncol);
328 const int mask = LENGTH(colClassesSxp)==1 ? 0 : INT_MAX; // to have one consistent loop/logic for the length-1 recycling case too; #4237
329 for (int i=0; i<ncol; ++i) {
330 if (type[i]==CT_DROP) continue; // user might have specified the type of all columns including those dropped with drop=
331 const SEXP tt = STRING_ELT(colClassesSxp, i&mask); // mask recycles colClassesSxp when it's length-1
332 if (tt==NA_STRING || tt==R_BlankString) continue; // user is ok with inherent type for this column
333 int w = INTEGER(typeEnum_idx)[i&mask];
334 if (tt==char_POSIXct) {
335 // from v1.13.0, POSIXct is a built in type, but if the built-in doesn't support (e.g. test 1743.25 has missing tzone) then we still dispatch to as.POSIXct afterwards
336 if (type[i]!=CT_ISO8601_TIME) {
337 type[i]=CT_STRING; // e.g. CT_ISO8601_DATE changed to character here so that as.POSIXct treats the date-only as local time in tests 1743.122 and 2150.11
338 SET_STRING_ELT(colClassesAs, i, tt);
339 }
340 } else {
341 type[i] = typeEnum[w-1]; // freadMain checks bump up only not down
342 if (w==NUT) SET_STRING_ELT(colClassesAs, i, tt);
343 }
344 }
345 } else { // selectColClasses==true
346 if (!selectInts) STOP(_("Internal error: selectInts is NULL but selectColClasses is true"));
347 const int n = length(colClassesSxp);
348 if (length(selectSxp)!=n) STOP(_("Internal error: length(selectSxp)!=length(colClassesSxp) but selectColClasses is true"));
349 for (int i=0; i<n; ++i) {
350 SEXP tt = STRING_ELT(colClassesSxp,i);
351 if (tt==NA_STRING || tt==R_BlankString) continue;
352 int w = INTEGER(typeEnum_idx)[i];
353 int y = selectInts[i];
354 if (y==NA_INTEGER) continue;
355 if (tt==char_POSIXct) {
356 if (type[y-1]!=CT_ISO8601_TIME) {
357 type[y-1]=CT_STRING;
358 SET_STRING_ELT(colClassesAs, y-1, tt);
359 }
360 } else {
361 type[y-1] = typeEnum[w-1];
362 if (w==NUT) SET_STRING_ELT(colClassesAs, y-1, tt);
363 }
364 }
365 }
366 UNPROTECT(1); // typeEnum_idx
367 } else {
368 if (!isNewList(colClassesSxp)) STOP(_("colClasses is type '%s' but should be list or character"), type2char(TYPEOF(colClassesSxp)));
369 SEXP listNames = PROTECT(getAttrib(colClassesSxp, R_NamesSymbol)); // rchk wanted this protected
370 if (!length(listNames)) STOP(_("colClasses is type list but has no names"));
371 SEXP typeEnum_idx = PROTECT(chmatch(listNames, typeRName_sxp, NUT));
372
373 int *selectRankD = NULL, rank = 1;
374 if (selectColClasses) {
375 SET_VECTOR_ELT(RCHK, 3, selectRank=allocVector(INTSXP, ncol)); // column order changed in setFinalNRow
376 selectRankD = INTEGER(selectRank);
377 }
378
379 for (int i=0; i<LENGTH(colClassesSxp); i++) {
380 const int w = INTEGER(typeEnum_idx)[i];
381 signed char thisType = typeEnum[w-1];
382 if (thisType==CT_DROP) continue; // was dealt with earlier above
383 SEXP items = VECTOR_ELT(colClassesSxp,i);
384 SEXP itemsInt;
385 if (isString(items)) itemsInt = PROTECT(chmatch(items, colNamesSxp, NA_INTEGER));
386 else itemsInt = PROTECT(coerceVector(items, INTSXP));
387 // UNPROTECTed directly just after this for loop. No nprotect++ here is correct.
388 for (int j=0; j<LENGTH(items); j++) {
389 int k = INTEGER(itemsInt)[j];
390 if (k==NA_INTEGER) {
391 if (isString(items))
392 DTWARN(_("Column name '%s' (colClasses[[%d]][%d]) not found"), CHAR(STRING_ELT(items, j)), i+1, j+1);
393 else
394 DTWARN(_("colClasses[[%d]][%d] is NA"), i+1, j+1);
395 } else {
396 if (k>=1 && k<=ncol) {
397 if (type[k-1]<0)
398 DTWARN(_("Column %d ('%s') appears more than once in colClasses. The second time is colClasses[[%d]][%d]."), k, CHAR(STRING_ELT(colNamesSxp,k-1)), i+1, j+1);
399 else if (type[k-1]!=CT_DROP) {
400 if (thisType==CT_ISO8601_TIME && type[k-1]!=CT_ISO8601_TIME) {
401 type[k-1] = -CT_STRING; // don't use in-built UTC parser, defer to character and as.POSIXct afterwards which reads in local time
402 SET_STRING_ELT(colClassesAs, k-1, STRING_ELT(listNames,i));
403 } else {
404 type[k-1] = -thisType; // freadMain checks bump up only not down. Deliberately don't catch here to test freadMain; e.g. test 959
405 if (w==NUT) SET_STRING_ELT(colClassesAs, k-1, STRING_ELT(listNames,i));
406 }
407 if (selectRankD) selectRankD[k-1] = rank++;
408 }
409 } else {
410 DTWARN(_("Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]"), k, i+1, j+1, ncol);
411 }
412 }
413 }
414 UNPROTECT(1); // UNPROTECTing itemsInt inside loop to save protection stack
415 }
416 for (int i=0; i<ncol; i++) {
417 if (type[i]<0) type[i] *= -1; // undo sign; was used to detect duplicates
418 else if (selectColClasses) type[i] = CT_DROP; // reading will proceed in order of columns in file; reorder happens afterwards at R level
419 }
420 UNPROTECT(2); // listNames and typeEnum_idx
421 }
422 UNPROTECT(1); // typeRName_sxp
423 }
424 UNPROTECT(nprotect);
425 if (readInt64As != CT_INT64) {
426 for (int i=0; i<ncol; i++) if (type[i]==CT_INT64) type[i] = readInt64As;
427 }
428 return true;
429 }
430
431
allocateDT(int8_t * typeArg,int8_t * sizeArg,int ncolArg,int ndrop,size_t allocNrow)432 size_t allocateDT(int8_t *typeArg, int8_t *sizeArg, int ncolArg, int ndrop, size_t allocNrow) {
433 // save inputs for use by pushBuffer
434 size = sizeArg;
435 type = typeArg;
436 int newDT = (ncol == 0);
437 if (newDT) {
438 ncol = ncolArg;
439 dtnrows = allocNrow;
440 SET_VECTOR_ELT(RCHK, 0, DT=allocVector(VECSXP, ncol-ndrop));
441 if (ndrop==0) {
442 setAttrib(DT, R_NamesSymbol, colNamesSxp); // colNames mkChar'd in userOverride step
443 if (colClassesAs) setAttrib(DT, sym_colClassesAs, colClassesAs);
444 } else {
445 int nprotect = 0;
446 SEXP tt, ss=R_NilValue;
447 setAttrib(DT, R_NamesSymbol, tt=PROTECT(allocVector(STRSXP, ncol-ndrop))); nprotect++;
448 if (colClassesAs) {
449 setAttrib(DT, sym_colClassesAs, ss=PROTECT(allocVector(STRSXP, ncol-ndrop))); nprotect++;
450 }
451 for (int i=0,resi=0; i<ncol; i++) if (type[i]!=CT_DROP) {
452 if (colClassesAs) SET_STRING_ELT(ss, resi, STRING_ELT(colClassesAs,i));
453 SET_STRING_ELT(tt, resi++, STRING_ELT(colNamesSxp,i));
454 }
455 UNPROTECT(nprotect);
456 }
457 if (selectRank) {
458 SEXP tt = PROTECT(allocVector(INTSXP, ncol-ndrop));
459 int *ttD = INTEGER(tt), *rankD = INTEGER(selectRank), rank=1;
460 for (int i=0; i<ncol; ++i) if (type[i]!=CT_DROP) ttD[ rankD[i]-1 ] = rank++;
461 SET_VECTOR_ELT(RCHK, 3, selectRank = tt);
462 // selectRank now holds the order not the rank (so its name is now misleading). setFinalNRow passes it to setcolorder
463 // we can't change column order now because they might be reallocated in the reread
464 UNPROTECT(1); // tt
465 }
466 colClassesAs = getAttrib(DT, sym_colClassesAs);
467 bool none = true;
468 const int n = length(colClassesAs);
469 for (int i=0; i<n; ++i) if (STRING_ELT(colClassesAs,i) != R_BlankString) { none=false; break; }
470 if (none) setAttrib(DT, sym_colClassesAs, R_NilValue);
471 else if (selectRank) setAttrib(DT, sym_colClassesAs, subsetVector(colClassesAs, selectRank)); // reorder the colClassesAs
472 }
473 // TODO: move DT size calculation into a separate function (since the final size is different from the initial size anyways)
474 size_t DTbytes = SIZEOF(DT)*(ncol-ndrop)*2; // the VECSXP and its column names (exclude global character cache usage)
475
476 // For each column we could have one of the following cases:
477 // * if the DataTable is "new", then make a new vector
478 // * if the column's type has changed, then replace it with a new vector
479 // (however if column's type[i] is negative, then it means we're skipping
480 // the column in the rerun, and its type hasn't actually changed).
481 // * if dtnrows≠allocNrow and the column's type has not changed, then that
482 // column needs to be re-alloced (using growVector).
483 // * otherwise leave the column as-is.
484 for (int i=0, resi=0; i<ncol; i++) {
485 if (type[i] == CT_DROP) continue;
486 SEXP col = VECTOR_ELT(DT, resi);
487 int oldIsInt64 = newDT? 0 : INHERITS(col, char_integer64);
488 int newIsInt64 = type[i] == CT_INT64;
489 int typeChanged = (type[i] > 0) && (newDT || TYPEOF(col) != typeSxp[type[i]] || oldIsInt64 != newIsInt64);
490 int nrowChanged = (allocNrow != dtnrows);
491 if (typeChanged || nrowChanged) {
492 SEXP thiscol = typeChanged ? allocVector(typeSxp[type[i]], allocNrow) // no need to PROTECT, passed immediately to SET_VECTOR_ELT, see R-exts 5.9.1
493 : growVector(col, allocNrow);
494 SET_VECTOR_ELT(DT,resi,thiscol);
495 if (type[i]==CT_INT64) {
496 SEXP tt = PROTECT(ScalarString(char_integer64));
497 setAttrib(thiscol, R_ClassSymbol, tt);
498 UNPROTECT(1);
499 } else if (type[i] == CT_ISO8601_DATE) {
500 SEXP tt = PROTECT(allocVector(STRSXP, 2));
501 SET_STRING_ELT(tt, 0, char_IDate);
502 SET_STRING_ELT(tt, 1, char_Date);
503 setAttrib(thiscol, R_ClassSymbol, tt);
504 UNPROTECT(1);
505 } else if (type[i] == CT_ISO8601_TIME) {
506 SEXP tt = PROTECT(allocVector(STRSXP, 2));
507 SET_STRING_ELT(tt, 0, char_POSIXct);
508 SET_STRING_ELT(tt, 1, char_POSIXt);
509 setAttrib(thiscol, R_ClassSymbol, tt);
510 UNPROTECT(1);
511
512 setAttrib(thiscol, sym_tzone, ScalarString(char_UTC)); // see news for v1.13.0
513 }
514 SET_TRUELENGTH(thiscol, allocNrow);
515 DTbytes += SIZEOF(thiscol)*allocNrow;
516 }
517 resi++;
518 }
519 dtnrows = allocNrow;
520 return DTbytes;
521 }
522
523
setFinalNrow(size_t nrow)524 void setFinalNrow(size_t nrow) {
525 if (selectRank) setcolorder(DT, selectRank); // selectRank was changed to contain order (not rank) in allocateDT above
526 if (length(DT)) {
527 if (nrow == dtnrows)
528 return;
529 for (int i=0; i<LENGTH(DT); i++) {
530 SETLENGTH(VECTOR_ELT(DT,i), nrow); // TODO: realloc
531 SET_TRUELENGTH(VECTOR_ELT(DT,i), nrow);
532 }
533 }
534 R_FlushConsole(); // # 2481. Just a convenient place; nothing per se to do with setFinalNrow()
535 }
536
537
pushBuffer(ThreadLocalFreadParsingContext * ctx)538 void pushBuffer(ThreadLocalFreadParsingContext *ctx)
539 {
540 const void *buff8 = ctx->buff8;
541 const void *buff4 = ctx->buff4;
542 const void *buff1 = ctx->buff1;
543 const char *anchor = ctx->anchor;
544 int nRows = (int) ctx->nRows;
545 size_t DTi = ctx->DTi;
546 int rowSize8 = (int) ctx->rowSize8;
547 int rowSize4 = (int) ctx->rowSize4;
548 int rowSize1 = (int) ctx->rowSize1;
549 int nStringCols = ctx->nStringCols;
550 int nNonStringCols = ctx->nNonStringCols;
551
552 // Do all the string columns first so as to minimize and concentrate the time inside the single critical.
553 // While the string columns are happening other threads before me can be copying their non-string buffers to the
554 // final DT and other threads after me can be filling their buffers too.
555 // rowSize is passed in because it will be different (much smaller) on the reread covering any type exception columns
556 // locals passed in on stack so openmp knows that no synchronization is required
557
558 // the byte position of this column in the first row of the row-major buffer
559 if (nStringCols) {
560 #pragma omp critical
561 {
562 int off8 = 0;
563 int cnt8 = rowSize8 / 8;
564 lenOff *buff8_lenoffs = (lenOff*) buff8;
565 for (int j=0, resj=-1, done=0; done<nStringCols && j<ncol; j++) {
566 if (type[j] == CT_DROP) continue;
567 resj++;
568 if (type[j] == CT_STRING) {
569 SEXP dest = VECTOR_ELT(DT, resj);
570 lenOff *source = buff8_lenoffs + off8;
571 for (int i=0; i<nRows; i++) {
572 int strLen = source->len;
573 if (strLen<=0) {
574 // stringLen == INT_MIN => NA, otherwise not a NAstring was checked inside fread_mean
575 if (strLen<0) SET_STRING_ELT(dest, DTi+i, NA_STRING); // else leave the "" in place that was initialized by allocVector()
576 } else {
577 const char *str = anchor + source->off;
578 int c=0;
579 while (c<strLen && str[c]) c++;
580 if (c<strLen) {
581 // embedded nul found; any at the beginning or the end of the field should have already been excluded but this will strip those too if present just in case
582 char *last = (char *)str+c; // obtain write access to (const char *)anchor;
583 while (c<strLen) {
584 if (str[c]) *last++=str[c]; // cow page write: saves allocation and management of a temp that would need to thread-safe in future.
585 c++; // This is only thread accessing this region. For non-mmap direct input nul are not possible (R would not have accepted nul earlier).
586 }
587 strLen = last-str;
588 }
589 SET_STRING_ELT(dest, DTi+i, mkCharLenCE(str, strLen, ienc));
590 }
591 source += cnt8;
592 }
593 done++; // if just one string col near the start, don't loop over the other 10,000 cols. TODO? start on first too
594 }
595 off8 += (size[j] == 8);
596 }
597 }
598 }
599
600 int off1 = 0, off4 = 0, off8 = 0;
601 for (int j=0, resj=-1, done=0; done<nNonStringCols && j<ncol; j++) {
602 if (type[j]==CT_DROP) continue;
603 int thisSize = size[j];
604 resj++;
605 if (type[j]!=CT_STRING && type[j]>0) {
606 if (thisSize == 8) {
607 double *dest = (double *)REAL(VECTOR_ELT(DT, resj)) + DTi;
608 const char *src8 = (char*)buff8 + off8;
609 for (int i=0; i<nRows; ++i) {
610 *dest = *(double *)src8;
611 src8 += rowSize8;
612 dest++;
613 }
614 } else
615 if (thisSize == 4) {
616 int *dest = (int *)INTEGER(VECTOR_ELT(DT, resj)) + DTi;
617 const char *src4 = (char*)buff4 + off4;
618 // debug line for #3369 ... if (DTi>2638000) printf("freadR.c:460: thisSize==4, resj=%d, %"PRIu64", %d, %d, j=%d, done=%d\n", resj, (uint64_t)DTi, off4, rowSize4, j, done);
619 for (int i=0; i<nRows; ++i) {
620 *dest = *(int *)src4;
621 src4 += rowSize4;
622 dest++;
623 }
624 } else
625 if (thisSize == 1) {
626 if (type[j] > CT_BOOL8_L) STOP(_("Field size is 1 but the field is of type %d\n"), type[j]);
627 Rboolean *dest = (Rboolean *)LOGICAL(VECTOR_ELT(DT, resj)) + DTi;
628 const char *src1 = (char*)buff1 + off1;
629 for (int i=0; i<nRows; ++i) {
630 int8_t v = *(int8_t *)src1;
631 *dest = (v==INT8_MIN ? NA_INTEGER : v);
632 src1 += rowSize1;
633 dest++;
634 }
635 } else STOP(_("Internal error: unexpected field of size %d\n"), thisSize); // # nocov
636 done++;
637 }
638 off8 += (size[j] & 8);
639 off4 += (size[j] & 4);
640 off1 += (size[j] & 1);
641 }
642 }
643
644 // # nocov start
progress(int p,int eta)645 void progress(int p, int eta) {
646 // called from thread 0 only
647 // p between 0 and 100
648 // eta in seconds
649 // Initialized the first time it is called with p>0
650 // Must be called at the end with p==100 to finish off and reset
651 // If it's called twice at the end with p=100, that's ok
652
653 // REprinf to avoid Rprintf's call to R_CheckUserInterrupt() every 100 lines, issue #2457
654 // It's the R_CheckUserInterrupt() that has caused crashes before when called from OpenMP parallel region
655 // even when called only from master thread. Update: can now retry within critical.
656 // fwrite.c has some comments about how it might be possible to call R_CheckUserInterrupt() here so that
657 // a long running fread can be stopped by user with Ctrl-C (or ESC on Windows).
658 // Could try R_ProcessEvents() too as per
659 // https://cran.r-project.org/bin/windows/base/rw-FAQ.html#The-console-freezes-when-my-compiled-code-is-running
660
661 // No use of \r to avoid bug in RStudio, linked in the same issue #2457
662 static int displayed = -1; // -1 means not yet displayed, otherwise [0,50] '=' are displayed
663 static char bar[] = "================================================== "; // 50 marks for each 2%
664 if (displayed==-1) {
665 if (eta<3 || p>50) return;
666 #pragma omp critical
667 {
668 REprintf("|--------------------------------------------------|\n|");
669 R_FlushConsole();
670 }
671 displayed = 0;
672 }
673 p/=2;
674 int toPrint = p-displayed;
675 if (toPrint==0) return;
676 bar[toPrint] = '\0';
677 #pragma omp critical
678 {
679 REprintf("%s", bar);
680 bar[toPrint] = '=';
681 displayed = p;
682 if (p==50) {
683 REprintf("|\n");
684 displayed = -1;
685 }
686 R_FlushConsole();
687 }
688 }
689 // # nocov end
690
__halt(bool warn,const char * format,...)691 void __halt(bool warn, const char *format, ...) {
692 // Solves: http://stackoverflow.com/questions/18597123/fread-data-table-locks-files
693 // TODO: always include fnam in the STOP message. For log files etc.
694 va_list args;
695 va_start(args, format);
696 char msg[2000];
697 vsnprintf(msg, 2000, format, args);
698 va_end(args);
699 freadCleanup(); // this closes mmp hence why we just copied substrings from mmp to msg[] first since mmp is now invalid
700 // if (warn) warning(_("%s"), msg);
701 // this warning() call doesn't seem to honor warn=2 straight away in R 3.6, so now always call error() directly to be sure
702 // we were going via warning() before to get the (converted from warning) prefix in the message (which we could mimic in future)
703 error(_("%s"), msg); // include "%s" because data in msg might include '%'
704 }
705
prepareThreadContext(ThreadLocalFreadParsingContext * ctx)706 void prepareThreadContext(ThreadLocalFreadParsingContext *ctx) {}
postprocessBuffer(ThreadLocalFreadParsingContext * ctx)707 void postprocessBuffer(ThreadLocalFreadParsingContext *ctx) {}
orderBuffer(ThreadLocalFreadParsingContext * ctx)708 void orderBuffer(ThreadLocalFreadParsingContext *ctx) {}
freeThreadContext(ThreadLocalFreadParsingContext * ctx)709 void freeThreadContext(ThreadLocalFreadParsingContext *ctx) {}
710