1 #include "fread.h"
2 #include "freadR.h"
3 #include "data.table.h"
4 
5 /*****    TO DO    *****
6 Restore test 1339 (balanced embedded quotes, see ?fread already updated).
7 Confirm: http://stackoverflow.com/questions/23833294/data-tablefread-doesnt-like-missing-values-in-first-column
8 construct test and investigate skip for completeness here: http://stackoverflow.com/questions/22086780/data-table-fread-error
9 http://stackoverflow.com/questions/22229109/r-data-table-fread-command-how-to-read-large-files-with-irregular-separators
10 And even more diagnostics to verbose=true so we can see where crashes are.
11 Detect and coerce dates and times. By searching for - and :, and dateTtime etc, or R's own method or fasttime. POSIXct default, for microseconds? : http://stackoverflow.com/questions/14056370/cast-string-to-idatetime
12 Add as.colClasses to fread.R after return from C level (e.g. for colClasses "Date", although as slow as read.csv via character)
13 Allow comment char to ignore. Important in format detection. But require valid line data before comment character in the read loop? See http://stackoverflow.com/a/18922269/403310
14 Deal with row.names e.g. http://stackoverflow.com/questions/15448732/reading-csv-with-row-names-by-fread
15 Test Garrett's two files again (wrap around ,,,,,, and different row lengths that the wc -l now fixes)
16 Post from patricknik on 5 Jan re ""b"" in a field. And Aykut Firat on email.
17 Save repeated ch<eof checking in main read step. Last line might still be tricky if last line has no eol.
18 Test using at least "grep read.table ...Rtrunk/tests/
19 Look for any non-alpha-numeric characters in the output and try each of them. That way can handle bell character as well and save testing separators which aren't there.
20 Just one "NA" current default but empty in future when numerics handle NA string variants directly.
21 ---
22 Secondary separator for list() columns, such as columns 11 and 12 in BED (no need for strsplit).
23 *****/
24 
25 #define NUT  NUMTYPE+2  // +1 for "numeric" alias for "double"; +1 for CLASS fallback using as.class() at R level afterwards
26 
27 static int  typeSxp[NUT] =     {NILSXP,  LGLSXP,     LGLSXP,     LGLSXP,     LGLSXP,     INTSXP,    REALSXP,     REALSXP,    REALSXP,        REALSXP,        INTSXP,          REALSXP,         STRSXP,      REALSXP,    STRSXP   };
28 static char typeRName[NUT][10]={"NULL",  "logical",  "logical",  "logical",  "logical",  "integer", "integer64", "double",   "double",       "double",       "IDate",         "POSIXct",       "character", "numeric",  "CLASS"  };
29 static int  typeEnum[NUT] =    {CT_DROP, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32,  CT_INT64,    CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING,   CT_FLOAT64, CT_STRING};
30 static colType readInt64As=CT_INT64;
31 static SEXP selectSxp;
32 static SEXP dropSxp;
33 static SEXP colClassesSxp;
34 static bool selectColClasses = false;
35 cetype_t ienc = CE_NATIVE;
36 static SEXP RCHK;
37 static SEXP DT;
38 static SEXP colNamesSxp;
39 static SEXP colClassesAs; // the classes like factor, POSIXct which are currently done afterwards at R level: strings don't match typeRName above => NUT / "CLASS"
40 static SEXP selectRank;   // C level returns the column reording vector to be done by setcolorder() at R level afterwards
41 static int8_t *type;
42 static int8_t *size;
43 static int ncol = 0;
44 static int64_t dtnrows = 0;
45 static bool verbose = false;
46 static bool warningsAreErrors = false;
47 static bool oldNoDateTime = false;
48 
49 
freadR(SEXP inputArg,SEXP sepArg,SEXP decArg,SEXP quoteArg,SEXP headerArg,SEXP nrowLimitArg,SEXP skipArg,SEXP NAstringsArg,SEXP stripWhiteArg,SEXP skipEmptyLinesArg,SEXP fillArg,SEXP showProgressArg,SEXP nThreadArg,SEXP verboseArg,SEXP warnings2errorsArg,SEXP logical01Arg,SEXP selectArg,SEXP dropArg,SEXP colClassesArg,SEXP integer64Arg,SEXP encodingArg,SEXP keepLeadingZerosArgs,SEXP noTZasUTC)50 SEXP freadR(
51   // params passed to freadMain
52   SEXP inputArg,
53   SEXP sepArg,
54   SEXP decArg,
55   SEXP quoteArg,
56   SEXP headerArg,
57   SEXP nrowLimitArg,
58   SEXP skipArg,
59   SEXP NAstringsArg,
60   SEXP stripWhiteArg,
61   SEXP skipEmptyLinesArg,
62   SEXP fillArg,
63   SEXP showProgressArg,
64   SEXP nThreadArg,
65   SEXP verboseArg,
66   SEXP warnings2errorsArg,
67   SEXP logical01Arg,
68 
69   // extras needed by callbacks from freadMain
70   SEXP selectArg,
71   SEXP dropArg,
72   SEXP colClassesArg,
73   SEXP integer64Arg,
74   SEXP encodingArg,
75   SEXP keepLeadingZerosArgs,
76   SEXP noTZasUTC
77 ) {
78   verbose = LOGICAL(verboseArg)[0];
79   warningsAreErrors = LOGICAL(warnings2errorsArg)[0];
80 
81   freadMainArgs args;
82   ncol = 0;
83   dtnrows = 0;
84   const char *ch, *ch2;
85   if (!isString(inputArg) || LENGTH(inputArg)!=1)
86     error(_("Internal error: freadR input not a single character string: a filename or the data itself. Should have been caught at R level."));  // # nocov
87   ch = ch2 = (const char *)CHAR(STRING_ELT(inputArg,0));
88   while (*ch2!='\n' && *ch2!='\r' && *ch2!='\0') ch2++;
89   args.input = (*ch2=='\0') ? R_ExpandFileName(ch) : ch; // for convenience so user doesn't have to call path.expand()
90 
91   ch = args.input;
92   while (*ch!='\0' && *ch!='\n' && *ch!='\r') ch++;
93   if (*ch!='\0' || args.input[0]=='\0') {
94     if (verbose) DTPRINT(_("Input contains a \\n or is \")\". Taking this to be text input (not a filename)\n"));
95     args.filename = NULL;
96   } else {
97     if (verbose) DTPRINT(_("Input contains no \\n. Taking this to be a filename to open\n"));
98     args.filename = args.input;
99     args.input = NULL;
100   }
101 
102   if (!isString(sepArg) || LENGTH(sepArg)!=1 || strlen(CHAR(STRING_ELT(sepArg,0)))>1)
103     error(_("Internal error: freadR sep not a single character. R level catches this."));  // # nocov
104   args.sep = CHAR(STRING_ELT(sepArg,0))[0];   // '\0' when default "auto" was replaced by "" at R level
105 
106   if (!(isString(decArg) && LENGTH(decArg)==1 && strlen(CHAR(STRING_ELT(decArg,0)))==1))
107     error(_("Internal error: freadR dec not a single character. R level catches this."));  // # nocov
108   args.dec = CHAR(STRING_ELT(decArg,0))[0];
109 
110   if (IS_FALSE(quoteArg)) {
111     args.quote = '\0';
112   } else {
113     if (!isString(quoteArg) || LENGTH(quoteArg)!=1 || strlen(CHAR(STRING_ELT(quoteArg,0))) > 1)
114       error(_("quote= must be a single character, blank \"\", or FALSE"));
115     args.quote = CHAR(STRING_ELT(quoteArg,0))[0];
116   }
117 
118   // header is the only boolean where NA is valid and means 'auto'.
119   // LOGICAL in R is signed 32 bits with NA_LOGICAL==INT_MIN, currently.
120   args.header = false;
121   if (LOGICAL(headerArg)[0]==NA_LOGICAL) args.header = NA_BOOL8;
122   else if (LOGICAL(headerArg)[0]==TRUE) args.header = true;
123 
124   args.nrowLimit = INT64_MAX;
125   // checked at R level
126   if (isReal(nrowLimitArg)) {
127     if (R_FINITE(REAL(nrowLimitArg)[0]) && REAL(nrowLimitArg)[0]>=0.0) args.nrowLimit = (int64_t)(REAL(nrowLimitArg)[0]);
128   } else {
129     if (INTEGER(nrowLimitArg)[0]>=1) args.nrowLimit = (int64_t)INTEGER(nrowLimitArg)[0];
130   }
131 
132   args.logical01 = LOGICAL(logical01Arg)[0];
133   {
134     SEXP tt = PROTECT(GetOption(sym_old_fread_datetime_character, R_NilValue));
135     args.oldNoDateTime = oldNoDateTime = isLogical(tt) && LENGTH(tt)==1 && LOGICAL(tt)[0]==TRUE;
136     UNPROTECT(1);
137   }
138   args.skipNrow=-1;
139   args.skipString=NULL;
140   if (isString(skipArg)) {
141     args.skipString = CHAR(STRING_ELT(skipArg,0));  // LENGTH==1 was checked at R level
142   } else if (isInteger(skipArg)) {
143     args.skipNrow = (int64_t)INTEGER(skipArg)[0];
144   } else error(_("Internal error: skip not integer or string in freadR.c")); // # nocov
145 
146   if (!isNull(NAstringsArg) && !isString(NAstringsArg))
147     error(_("Internal error: NAstringsArg is type '%s'. R level catches this"), type2char(TYPEOF(NAstringsArg)));  // # nocov
148   int nnas = length(NAstringsArg);
149   const char **NAstrings = (const char **)R_alloc((nnas + 1), sizeof(char*));  // +1 for the final NULL to save a separate nna variable
150   for (int i=0; i<nnas; i++)
151     NAstrings[i] = CHAR(STRING_ELT(NAstringsArg,i));
152   NAstrings[nnas] = NULL;
153   args.NAstrings = NAstrings;
154 
155   // here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL
156   args.stripWhite = LOGICAL(stripWhiteArg)[0];
157   args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0];
158   args.fill = LOGICAL(fillArg)[0];
159   args.showProgress = LOGICAL(showProgressArg)[0];
160   if (INTEGER(nThreadArg)[0]<1) error(_("nThread(%d)<1"), INTEGER(nThreadArg)[0]);
161   args.nth = (uint32_t)INTEGER(nThreadArg)[0];
162   args.verbose = verbose;
163   args.warningsAreErrors = warningsAreErrors;
164   args.keepLeadingZeros = LOGICAL(keepLeadingZerosArgs)[0];
165   args.noTZasUTC = LOGICAL(noTZasUTC)[0];
166 
167   // === extras used for callbacks ===
168   if (!isString(integer64Arg) || LENGTH(integer64Arg)!=1) error(_("'integer64' must be a single character string"));
169   const char *tt = CHAR(STRING_ELT(integer64Arg,0));
170   if (strcmp(tt, "integer64")==0) {
171     readInt64As = CT_INT64;
172   } else if (strcmp(tt, "character")==0) {
173     readInt64As = CT_STRING;
174   } else if (strcmp(tt,"double")==0 || strcmp(tt,"numeric")==0) {
175     readInt64As = CT_FLOAT64;
176   } else STOP(_("Invalid value integer64='%s'. Must be 'integer64', 'character', 'double' or 'numeric'"), tt);
177 
178   colClassesSxp = colClassesArg;
179 
180   selectSxp = selectArg;
181   dropSxp = dropArg;
182   selectColClasses = false;
183   if (!isNull(selectSxp)) {
184     if (!isNull(dropSxp)) STOP(_("Use either select= or drop= but not both."));
185     if (isNewList(selectArg)) {
186       if (!isNull(colClassesSxp))
187         STOP(_("select= is type list for specifying types in select=, but colClasses= has been provided as well. Please remove colClasses=."));
188       if (!length(getAttrib(selectArg, R_NamesSymbol)))
189         STOP(_("select= is type list but has no names; expecting list(type1=cols1, type2=cols2, ...)"));
190       colClassesSxp = selectArg;
191       selectColClasses = true;
192       selectSxp = R_NilValue;
193     } else {
194       if (!isNull(getAttrib(selectArg, R_NamesSymbol))) {
195         if (!isNull(colClassesSxp))
196           STOP(_("select= is a named vector specifying the columns to select and their types, but colClasses= has been provided as well. Please remove colClasses=."));
197         colClassesSxp = selectArg;
198         selectSxp = getAttrib(selectArg, R_NamesSymbol);
199         selectColClasses = true;
200       }
201     }
202   } else {
203     if (TYPEOF(colClassesSxp)==VECSXP && !length(getAttrib(colClassesSxp, R_NamesSymbol)))
204        STOP(_("colClasses is type list but has no names"));
205   }
206 
207   // Encoding, #563: Borrowed from do_setencoding from base R
208   // https://github.com/wch/r-source/blob/ca5348f0b5e3f3c2b24851d7aff02de5217465eb/src/main/util.c#L1115
209   // Check for mkCharLenCE function to locate as to where where this is implemented.
210   tt = CHAR(STRING_ELT(encodingArg, 0));
211   if (strcmp(tt, "unknown")==0) ienc = CE_NATIVE;
212   else if (strcmp(tt, "Latin-1")==0) ienc = CE_LATIN1;
213   else if (strcmp(tt, "UTF-8")==0) ienc = CE_UTF8;
214   else STOP(_("encoding='%s' invalid. Must be 'unknown', 'Latin-1' or 'UTF-8'"), tt);  // # nocov
215   // === end extras ===
216 
217   RCHK = PROTECT(allocVector(VECSXP, 4));
218   // see kalibera/rchk#9 and Rdatatable/data.table#2865.  To avoid rchk false positives.
219   // allocateDT() assigns DT to position 0. userOverride() assigns colNamesSxp to position 1 and colClassesAs to position 2 (both used in allocateDT())
220   freadMain(args);
221   UNPROTECT(1);
222   return DT;
223 }
224 
applyDrop(SEXP items,int8_t * type,int ncol,int dropSource)225 static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) {
226   if (!length(items)) return;
227   SEXP itemsInt = PROTECT( isString(items) ? chmatch(items, colNamesSxp, NA_INTEGER) : coerceVector(items, INTSXP) );
228   const int *itemsD = INTEGER(itemsInt), n=LENGTH(itemsInt);
229   for (int j=0; j<n; ++j) {
230     int k = itemsD[j];
231     if (k==NA_INTEGER || k<1 || k>ncol) {
232       static char buff[51];
233       if (dropSource==-1) snprintf(buff, 50, "drop[%d]", j+1);
234       else snprintf(buff, 50, "colClasses[[%d]][%d]", dropSource+1, j+1);
235       if (k==NA_INTEGER) {
236         if (isString(items))
237           DTWARN(_("Column name '%s' (%s) not found"), CHAR(STRING_ELT(items, j)), buff);
238         else
239           DTWARN(_("%s is NA"), buff);
240       } else {
241         DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol);
242       }
243     } else {
244       type[k-1] = CT_DROP;
245       // aside: dropping the same column several times is acceptable with no warning. This could arise via duplicates in the drop= vector,
246       // or specifying the same column to drop using NULLs in colClasses and drop= too.
247     }
248   }
249   UNPROTECT(1);
250 }
251 
userOverride(int8_t * type,lenOff * colNames,const char * anchor,const int ncol)252 bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int ncol)
253 {
254   // use typeSize superfluously to avoid not-used warning; otherwise could move typeSize from fread.h into fread.c
255   if (typeSize[CT_BOOL8_N]!=1) STOP(_("Internal error: typeSize[CT_BOOL8_N] != 1")); // # nocov
256   if (typeSize[CT_STRING]!=8) STOP(_("Internal error: typeSize[CT_STRING] != 1")); // # nocov
257   colNamesSxp = R_NilValue;
258   SET_VECTOR_ELT(RCHK, 1, colNamesSxp=allocVector(STRSXP, ncol));
259   for (int i=0; i<ncol; i++) {
260     SEXP elem;
261     if (colNames==NULL || colNames[i].len<=0) {
262       char buff[12];
263       snprintf(buff,12,"V%d",i+1);
264       elem = mkChar(buff);  // no PROTECT as passed immediately to SET_STRING_ELT
265     } else {
266       elem = mkCharLenCE(anchor+colNames[i].off, colNames[i].len, ienc);  // no PROTECT as passed immediately to SET_STRING_ELT
267     }
268     SET_STRING_ELT(colNamesSxp, i, elem);
269   }
270   // "use either select= or drop= but not both" was checked earlier in freadR
271   applyDrop(dropSxp, type, ncol, /*dropSource=*/-1);
272   if (TYPEOF(colClassesSxp)==VECSXP) {  // not isNewList() because that returns true for NULL
273     SEXP listNames = PROTECT(getAttrib(colClassesSxp, R_NamesSymbol));  // rchk wanted this protected
274     for (int i=0; i<LENGTH(colClassesSxp); ++i) {
275       if (STRING_ELT(listNames, i) == char_NULL) {
276         SEXP items = VECTOR_ELT(colClassesSxp,i);
277         applyDrop(items, type, ncol, /*dropSource=*/i);
278       }
279     }
280     UNPROTECT(1);  // listNames
281   }
282   selectRank = NULL;
283   const int *selectInts = NULL; // if select is provided this will point to 1-based ints of the column numbers (which might already be the input as-is)
284   int nprotect = 0;  // just used for select; other protects are specifically balanced within loops to save the protection stack, whereas select is long-lived or no-alloc.
285   if (length(selectSxp)) {
286     const int n = length(selectSxp);
287     if (isString(selectSxp)) {
288       selectInts = INTEGER(PROTECT(chmatch(selectSxp, colNamesSxp, NA_INTEGER))); nprotect++;
289       for (int i=0; i<n; ++i) if (selectInts[i]==NA_INTEGER)
290         DTWARN(_("Column name '%s' not found in column name header (case sensitive), skipping."), CHAR(STRING_ELT(selectSxp, i)));
291     } else {
292       if (!isInteger(selectSxp)) { selectSxp=PROTECT(coerceVector(selectSxp, INTSXP)); nprotect++; }  // coerce numeric to int
293       selectInts = INTEGER(selectSxp);
294     }
295     SET_VECTOR_ELT(RCHK, 3, selectRank=allocVector(INTSXP, ncol));
296     int *selectRankD = INTEGER(selectRank), rank = 1;
297     for (int i=0; i<n; ++i) {
298       int k = selectInts[i];
299       if (k==NA_INTEGER) continue; // missing column name warned above and skipped
300       if (k<0) STOP(_("Column number %d (select[%d]) is negative but should be in the range [1,ncol=%d]. Consider drop= for column exclusion."),k,i+1,ncol);
301       if (k==0) STOP(_("select = 0 (select[%d]) has no meaning. All values of select should be in the range [1,ncol=%d]."),i+1,ncol);
302       if (k>ncol) STOP(_("Column number %d (select[%d]) is too large for this table, which only has %d columns."),k,i+1,ncol);
303       if (type[k-1]<0) STOP(_("Column number %d ('%s') has been selected twice by select="), k, CHAR(STRING_ELT(colNamesSxp,k-1)));
304       type[k-1] *= -1; // detect and error on duplicates on all types without calling duplicated() at all
305       selectRankD[k-1] = rank++;  // rank not i to skip missing column names
306     }
307     for (int i=0; i<ncol; ++i) {
308       if (type[i]<0) type[i] *= -1;
309       else type[i]=CT_DROP;
310     }
311   }
312   colClassesAs = NULL;
313   if (length(colClassesSxp)) {
314     SEXP typeRName_sxp = PROTECT(allocVector(STRSXP, NUT));
315     for (int i=0; i<NUT; i++) SET_STRING_ELT(typeRName_sxp, i, mkChar(typeRName[i]));
316     if (oldNoDateTime) {
317       // prevent colClasses="IDate"/"POSIXct" being recognized so that colClassesAs is assigned here ready for type massage after reading at R level; test 2150.14
318       SET_STRING_ELT(typeRName_sxp, CT_ISO8601_DATE, R_BlankString);
319       SET_STRING_ELT(typeRName_sxp, CT_ISO8601_TIME, R_BlankString);
320     }
321     SET_VECTOR_ELT(RCHK, 2, colClassesAs=allocVector(STRSXP, ncol));  // if any, this attached to the DT for R level to call as_ methods on
322     if (isString(colClassesSxp)) {
323       SEXP typeEnum_idx = PROTECT(chmatch(colClassesSxp, typeRName_sxp, NUT));
324       if (selectColClasses==false) {
325         if (LENGTH(colClassesSxp)!=ncol && LENGTH(colClassesSxp)!=1)
326           STOP(_("colClasses= is an unnamed vector of types, length %d, but there are %d columns in the input. To specify types for a subset of columns, you can use "
327                  "a named vector, list format, or specify types using select= instead of colClasses=. Please see examples in ?fread."), LENGTH(colClassesSxp), ncol);
328         const int mask = LENGTH(colClassesSxp)==1 ? 0 : INT_MAX;  // to have one consistent loop/logic for the length-1 recycling case too; #4237
329         for (int i=0; i<ncol; ++i) {
330           if (type[i]==CT_DROP) continue;                    // user might have specified the type of all columns including those dropped with drop=
331           const SEXP tt = STRING_ELT(colClassesSxp, i&mask); // mask recycles colClassesSxp when it's length-1
332           if (tt==NA_STRING || tt==R_BlankString) continue;  // user is ok with inherent type for this column
333           int w = INTEGER(typeEnum_idx)[i&mask];
334           if (tt==char_POSIXct) {
335             // from v1.13.0, POSIXct is a built in type, but if the built-in doesn't support (e.g. test 1743.25 has missing tzone) then we still dispatch to as.POSIXct afterwards
336             if (type[i]!=CT_ISO8601_TIME) {
337               type[i]=CT_STRING; // e.g. CT_ISO8601_DATE changed to character here so that as.POSIXct treats the date-only as local time in tests 1743.122 and 2150.11
338               SET_STRING_ELT(colClassesAs, i, tt);
339             }
340           } else {
341             type[i] = typeEnum[w-1];                           // freadMain checks bump up only not down
342             if (w==NUT) SET_STRING_ELT(colClassesAs, i, tt);
343           }
344         }
345       } else { // selectColClasses==true
346         if (!selectInts) STOP(_("Internal error: selectInts is NULL but selectColClasses is true"));
347         const int n = length(colClassesSxp);
348         if (length(selectSxp)!=n) STOP(_("Internal error: length(selectSxp)!=length(colClassesSxp) but selectColClasses is true"));
349         for (int i=0; i<n; ++i) {
350           SEXP tt = STRING_ELT(colClassesSxp,i);
351           if (tt==NA_STRING || tt==R_BlankString) continue;
352           int w = INTEGER(typeEnum_idx)[i];
353           int y = selectInts[i];
354           if (y==NA_INTEGER) continue;
355           if (tt==char_POSIXct) {
356             if (type[y-1]!=CT_ISO8601_TIME) {
357               type[y-1]=CT_STRING;
358               SET_STRING_ELT(colClassesAs, y-1, tt);
359             }
360           } else {
361             type[y-1] = typeEnum[w-1];
362             if (w==NUT) SET_STRING_ELT(colClassesAs, y-1, tt);
363           }
364         }
365       }
366       UNPROTECT(1); // typeEnum_idx
367     } else {
368       if (!isNewList(colClassesSxp)) STOP(_("colClasses is type '%s' but should be list or character"), type2char(TYPEOF(colClassesSxp)));
369       SEXP listNames = PROTECT(getAttrib(colClassesSxp, R_NamesSymbol));  // rchk wanted this protected
370       if (!length(listNames)) STOP(_("colClasses is type list but has no names"));
371       SEXP typeEnum_idx = PROTECT(chmatch(listNames, typeRName_sxp, NUT));
372 
373       int *selectRankD = NULL, rank = 1;
374       if (selectColClasses) {
375         SET_VECTOR_ELT(RCHK, 3, selectRank=allocVector(INTSXP, ncol));  // column order changed in setFinalNRow
376         selectRankD = INTEGER(selectRank);
377       }
378 
379       for (int i=0; i<LENGTH(colClassesSxp); i++) {
380         const int w = INTEGER(typeEnum_idx)[i];
381         signed char thisType = typeEnum[w-1];
382         if (thisType==CT_DROP) continue;  // was dealt with earlier above
383         SEXP items = VECTOR_ELT(colClassesSxp,i);
384         SEXP itemsInt;
385         if (isString(items)) itemsInt = PROTECT(chmatch(items, colNamesSxp, NA_INTEGER));
386         else                 itemsInt = PROTECT(coerceVector(items, INTSXP));
387         // UNPROTECTed directly just after this for loop. No nprotect++ here is correct.
388         for (int j=0; j<LENGTH(items); j++) {
389           int k = INTEGER(itemsInt)[j];
390           if (k==NA_INTEGER) {
391             if (isString(items))
392               DTWARN(_("Column name '%s' (colClasses[[%d]][%d]) not found"), CHAR(STRING_ELT(items, j)), i+1, j+1);
393             else
394               DTWARN(_("colClasses[[%d]][%d] is NA"), i+1, j+1);
395           } else {
396             if (k>=1 && k<=ncol) {
397               if (type[k-1]<0)
398                 DTWARN(_("Column %d ('%s') appears more than once in colClasses. The second time is colClasses[[%d]][%d]."), k, CHAR(STRING_ELT(colNamesSxp,k-1)), i+1, j+1);
399               else if (type[k-1]!=CT_DROP) {
400                 if (thisType==CT_ISO8601_TIME && type[k-1]!=CT_ISO8601_TIME) {
401                   type[k-1] = -CT_STRING; // don't use in-built UTC parser, defer to character and as.POSIXct afterwards which reads in local time
402                   SET_STRING_ELT(colClassesAs, k-1, STRING_ELT(listNames,i));
403                 } else {
404                   type[k-1] = -thisType;     // freadMain checks bump up only not down.  Deliberately don't catch here to test freadMain; e.g. test 959
405                   if (w==NUT) SET_STRING_ELT(colClassesAs, k-1, STRING_ELT(listNames,i));
406                 }
407                 if (selectRankD) selectRankD[k-1] = rank++;
408               }
409             } else {
410               DTWARN(_("Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]"), k, i+1, j+1, ncol);
411             }
412           }
413         }
414         UNPROTECT(1); // UNPROTECTing itemsInt inside loop to save protection stack
415       }
416       for (int i=0; i<ncol; i++) {
417         if (type[i]<0) type[i] *= -1;                  // undo sign; was used to detect duplicates
418         else if (selectColClasses) type[i] = CT_DROP;  // reading will proceed in order of columns in file; reorder happens afterwards at R level
419       }
420       UNPROTECT(2);  // listNames and typeEnum_idx
421     }
422     UNPROTECT(1);  // typeRName_sxp
423   }
424   UNPROTECT(nprotect);
425   if (readInt64As != CT_INT64) {
426     for (int i=0; i<ncol; i++) if (type[i]==CT_INT64) type[i] = readInt64As;
427   }
428   return true;
429 }
430 
431 
allocateDT(int8_t * typeArg,int8_t * sizeArg,int ncolArg,int ndrop,size_t allocNrow)432 size_t allocateDT(int8_t *typeArg, int8_t *sizeArg, int ncolArg, int ndrop, size_t allocNrow) {
433   // save inputs for use by pushBuffer
434   size = sizeArg;
435   type = typeArg;
436   int newDT = (ncol == 0);
437   if (newDT) {
438     ncol = ncolArg;
439     dtnrows = allocNrow;
440     SET_VECTOR_ELT(RCHK, 0, DT=allocVector(VECSXP, ncol-ndrop));
441     if (ndrop==0) {
442       setAttrib(DT, R_NamesSymbol, colNamesSxp);  // colNames mkChar'd in userOverride step
443       if (colClassesAs) setAttrib(DT, sym_colClassesAs, colClassesAs);
444     } else {
445       int nprotect = 0;
446       SEXP tt, ss=R_NilValue;
447       setAttrib(DT, R_NamesSymbol, tt=PROTECT(allocVector(STRSXP, ncol-ndrop))); nprotect++;
448       if (colClassesAs) {
449         setAttrib(DT, sym_colClassesAs, ss=PROTECT(allocVector(STRSXP, ncol-ndrop))); nprotect++;
450       }
451       for (int i=0,resi=0; i<ncol; i++) if (type[i]!=CT_DROP) {
452         if (colClassesAs) SET_STRING_ELT(ss, resi, STRING_ELT(colClassesAs,i));
453         SET_STRING_ELT(tt, resi++, STRING_ELT(colNamesSxp,i));
454       }
455       UNPROTECT(nprotect);
456     }
457     if (selectRank) {
458       SEXP tt = PROTECT(allocVector(INTSXP, ncol-ndrop));
459       int *ttD = INTEGER(tt), *rankD = INTEGER(selectRank), rank=1;
460       for (int i=0; i<ncol; ++i) if (type[i]!=CT_DROP) ttD[ rankD[i]-1 ] = rank++;
461       SET_VECTOR_ELT(RCHK, 3, selectRank = tt);
462       // selectRank now holds the order not the rank (so its name is now misleading). setFinalNRow passes it to setcolorder
463       // we can't change column order now because they might be reallocated in the reread
464       UNPROTECT(1); // tt
465     }
466     colClassesAs = getAttrib(DT, sym_colClassesAs);
467     bool none = true;
468     const int n = length(colClassesAs);
469     for (int i=0; i<n; ++i) if (STRING_ELT(colClassesAs,i) != R_BlankString) { none=false; break; }
470     if (none) setAttrib(DT, sym_colClassesAs, R_NilValue);
471     else if (selectRank) setAttrib(DT, sym_colClassesAs, subsetVector(colClassesAs, selectRank));  // reorder the colClassesAs
472   }
473   // TODO: move DT size calculation into a separate function (since the final size is different from the initial size anyways)
474   size_t DTbytes = SIZEOF(DT)*(ncol-ndrop)*2; // the VECSXP and its column names (exclude global character cache usage)
475 
476   // For each column we could have one of the following cases:
477   //   * if the DataTable is "new", then make a new vector
478   //   * if the column's type has changed, then replace it with a new vector
479   //     (however if column's type[i] is negative, then it means we're skipping
480   //     the column in the rerun, and its type hasn't actually changed).
481   //   * if dtnrows≠allocNrow and the column's type has not changed, then that
482   //     column needs to be re-alloced (using growVector).
483   //   * otherwise leave the column as-is.
484   for (int i=0, resi=0; i<ncol; i++) {
485     if (type[i] == CT_DROP) continue;
486     SEXP col = VECTOR_ELT(DT, resi);
487     int oldIsInt64 = newDT? 0 : INHERITS(col, char_integer64);
488     int newIsInt64 = type[i] == CT_INT64;
489     int typeChanged = (type[i] > 0) && (newDT || TYPEOF(col) != typeSxp[type[i]] || oldIsInt64 != newIsInt64);
490     int nrowChanged = (allocNrow != dtnrows);
491     if (typeChanged || nrowChanged) {
492       SEXP thiscol = typeChanged ? allocVector(typeSxp[type[i]], allocNrow)  // no need to PROTECT, passed immediately to SET_VECTOR_ELT, see R-exts 5.9.1
493                                  : growVector(col, allocNrow);
494       SET_VECTOR_ELT(DT,resi,thiscol);
495       if (type[i]==CT_INT64) {
496         SEXP tt = PROTECT(ScalarString(char_integer64));
497         setAttrib(thiscol, R_ClassSymbol, tt);
498         UNPROTECT(1);
499       } else if (type[i] == CT_ISO8601_DATE) {
500         SEXP tt = PROTECT(allocVector(STRSXP, 2));
501         SET_STRING_ELT(tt, 0, char_IDate);
502         SET_STRING_ELT(tt, 1, char_Date);
503         setAttrib(thiscol, R_ClassSymbol, tt);
504         UNPROTECT(1);
505       } else if (type[i] == CT_ISO8601_TIME) {
506         SEXP tt = PROTECT(allocVector(STRSXP, 2));
507         SET_STRING_ELT(tt, 0, char_POSIXct);
508         SET_STRING_ELT(tt, 1, char_POSIXt);
509         setAttrib(thiscol, R_ClassSymbol, tt);
510         UNPROTECT(1);
511 
512         setAttrib(thiscol, sym_tzone, ScalarString(char_UTC)); // see news for v1.13.0
513       }
514       SET_TRUELENGTH(thiscol, allocNrow);
515       DTbytes += SIZEOF(thiscol)*allocNrow;
516     }
517     resi++;
518   }
519   dtnrows = allocNrow;
520   return DTbytes;
521 }
522 
523 
setFinalNrow(size_t nrow)524 void setFinalNrow(size_t nrow) {
525   if (selectRank) setcolorder(DT, selectRank);  // selectRank was changed to contain order (not rank) in allocateDT above
526   if (length(DT)) {
527     if (nrow == dtnrows)
528       return;
529     for (int i=0; i<LENGTH(DT); i++) {
530       SETLENGTH(VECTOR_ELT(DT,i), nrow);  // TODO: realloc
531       SET_TRUELENGTH(VECTOR_ELT(DT,i), nrow);
532     }
533   }
534   R_FlushConsole(); // # 2481. Just a convenient place; nothing per se to do with setFinalNrow()
535 }
536 
537 
pushBuffer(ThreadLocalFreadParsingContext * ctx)538 void pushBuffer(ThreadLocalFreadParsingContext *ctx)
539 {
540   const void *buff8 = ctx->buff8;
541   const void *buff4 = ctx->buff4;
542   const void *buff1 = ctx->buff1;
543   const char *anchor = ctx->anchor;
544   int nRows = (int) ctx->nRows;
545   size_t DTi = ctx->DTi;
546   int rowSize8 = (int) ctx->rowSize8;
547   int rowSize4 = (int) ctx->rowSize4;
548   int rowSize1 = (int) ctx->rowSize1;
549   int nStringCols = ctx->nStringCols;
550   int nNonStringCols = ctx->nNonStringCols;
551 
552   // Do all the string columns first so as to minimize and concentrate the time inside the single critical.
553   // While the string columns are happening other threads before me can be copying their non-string buffers to the
554   // final DT and other threads after me can be filling their buffers too.
555   // rowSize is passed in because it will be different (much smaller) on the reread covering any type exception columns
556   // locals passed in on stack so openmp knows that no synchronization is required
557 
558   // the byte position of this column in the first row of the row-major buffer
559   if (nStringCols) {
560     #pragma omp critical
561     {
562       int off8 = 0;
563       int cnt8 = rowSize8 / 8;
564       lenOff *buff8_lenoffs = (lenOff*) buff8;
565       for (int j=0, resj=-1, done=0; done<nStringCols && j<ncol; j++) {
566         if (type[j] == CT_DROP) continue;
567         resj++;
568         if (type[j] == CT_STRING) {
569           SEXP dest = VECTOR_ELT(DT, resj);
570           lenOff *source = buff8_lenoffs + off8;
571           for (int i=0; i<nRows; i++) {
572             int strLen = source->len;
573             if (strLen<=0) {
574               // stringLen == INT_MIN => NA, otherwise not a NAstring was checked inside fread_mean
575               if (strLen<0) SET_STRING_ELT(dest, DTi+i, NA_STRING); // else leave the "" in place that was initialized by allocVector()
576             } else {
577               const char *str = anchor + source->off;
578               int c=0;
579               while (c<strLen && str[c]) c++;
580               if (c<strLen) {
581                 // embedded nul found; any at the beginning or the end of the field should have already been excluded but this will strip those too if present just in case
582                 char *last = (char *)str+c;    // obtain write access to (const char *)anchor;
583                 while (c<strLen) {
584                   if (str[c]) *last++=str[c];  // cow page write: saves allocation and management of a temp that would need to thread-safe in future.
585                   c++;                         //   This is only thread accessing this region. For non-mmap direct input nul are not possible (R would not have accepted nul earlier).
586                 }
587                 strLen = last-str;
588               }
589               SET_STRING_ELT(dest, DTi+i, mkCharLenCE(str, strLen, ienc));
590             }
591             source += cnt8;
592           }
593           done++; // if just one string col near the start, don't loop over the other 10,000 cols. TODO? start on first too
594         }
595         off8 += (size[j] == 8);
596       }
597     }
598   }
599 
600   int off1 = 0, off4 = 0, off8 = 0;
601   for (int j=0, resj=-1, done=0; done<nNonStringCols && j<ncol; j++) {
602     if (type[j]==CT_DROP) continue;
603     int thisSize = size[j];
604     resj++;
605     if (type[j]!=CT_STRING && type[j]>0) {
606       if (thisSize == 8) {
607         double *dest = (double *)REAL(VECTOR_ELT(DT, resj)) + DTi;
608         const char *src8 = (char*)buff8 + off8;
609         for (int i=0; i<nRows; ++i) {
610           *dest = *(double *)src8;
611           src8 += rowSize8;
612           dest++;
613         }
614       } else
615       if (thisSize == 4) {
616         int *dest = (int *)INTEGER(VECTOR_ELT(DT, resj)) + DTi;
617         const char *src4 = (char*)buff4 + off4;
618         // debug line for #3369 ... if (DTi>2638000) printf("freadR.c:460: thisSize==4, resj=%d, %"PRIu64", %d, %d, j=%d, done=%d\n", resj, (uint64_t)DTi, off4, rowSize4, j, done);
619         for (int i=0; i<nRows; ++i) {
620           *dest = *(int *)src4;
621           src4 += rowSize4;
622           dest++;
623         }
624       } else
625       if (thisSize == 1) {
626         if (type[j] > CT_BOOL8_L) STOP(_("Field size is 1 but the field is of type %d\n"), type[j]);
627         Rboolean *dest = (Rboolean *)LOGICAL(VECTOR_ELT(DT, resj)) + DTi;
628         const char *src1 = (char*)buff1 + off1;
629         for (int i=0; i<nRows; ++i) {
630           int8_t v = *(int8_t *)src1;
631           *dest = (v==INT8_MIN ? NA_INTEGER : v);
632           src1 += rowSize1;
633           dest++;
634         }
635       } else STOP(_("Internal error: unexpected field of size %d\n"), thisSize);  // # nocov
636       done++;
637     }
638     off8 += (size[j] & 8);
639     off4 += (size[j] & 4);
640     off1 += (size[j] & 1);
641   }
642 }
643 
644 // # nocov start
progress(int p,int eta)645 void progress(int p, int eta) {
646   // called from thread 0 only
647   // p between 0 and 100
648   // eta in seconds
649   // Initialized the first time it is called with p>0
650   // Must be called at the end with p==100 to finish off and reset
651   // If it's called twice at the end with p=100, that's ok
652 
653   // REprinf to avoid Rprintf's call to R_CheckUserInterrupt() every 100 lines, issue #2457
654   // It's the R_CheckUserInterrupt() that has caused crashes before when called from OpenMP parallel region
655   // even when called only from master thread. Update: can now retry within critical.
656   // fwrite.c has some comments about how it might be possible to call R_CheckUserInterrupt() here so that
657   // a long running fread can be stopped by user with Ctrl-C (or ESC on Windows).
658   // Could try R_ProcessEvents() too as per
659   // https://cran.r-project.org/bin/windows/base/rw-FAQ.html#The-console-freezes-when-my-compiled-code-is-running
660 
661   // No use of \r to avoid bug in RStudio, linked in the same issue #2457
662   static int displayed = -1;  // -1 means not yet displayed, otherwise [0,50] '=' are displayed
663   static char bar[] = "================================================== ";  // 50 marks for each 2%
664   if (displayed==-1) {
665     if (eta<3 || p>50) return;
666     #pragma omp critical
667     {
668       REprintf("|--------------------------------------------------|\n|");
669       R_FlushConsole();
670     }
671     displayed = 0;
672   }
673   p/=2;
674   int toPrint = p-displayed;
675   if (toPrint==0) return;
676   bar[toPrint] = '\0';
677   #pragma omp critical
678   {
679     REprintf("%s", bar);
680     bar[toPrint] = '=';
681     displayed = p;
682     if (p==50) {
683       REprintf("|\n");
684       displayed = -1;
685     }
686     R_FlushConsole();
687   }
688 }
689 // # nocov end
690 
__halt(bool warn,const char * format,...)691 void __halt(bool warn, const char *format, ...) {
692   // Solves: http://stackoverflow.com/questions/18597123/fread-data-table-locks-files
693   // TODO: always include fnam in the STOP message. For log files etc.
694   va_list args;
695   va_start(args, format);
696   char msg[2000];
697   vsnprintf(msg, 2000, format, args);
698   va_end(args);
699   freadCleanup(); // this closes mmp hence why we just copied substrings from mmp to msg[] first since mmp is now invalid
700   // if (warn) warning(_("%s"), msg);
701   //   this warning() call doesn't seem to honor warn=2 straight away in R 3.6, so now always call error() directly to be sure
702   //   we were going via warning() before to get the (converted from warning) prefix in the message (which we could mimic in future)
703   error(_("%s"), msg); // include "%s" because data in msg might include '%'
704 }
705 
prepareThreadContext(ThreadLocalFreadParsingContext * ctx)706 void prepareThreadContext(ThreadLocalFreadParsingContext *ctx) {}
postprocessBuffer(ThreadLocalFreadParsingContext * ctx)707 void postprocessBuffer(ThreadLocalFreadParsingContext *ctx) {}
orderBuffer(ThreadLocalFreadParsingContext * ctx)708 void orderBuffer(ThreadLocalFreadParsingContext *ctx) {}
freeThreadContext(ThreadLocalFreadParsingContext * ctx)709 void freeThreadContext(ThreadLocalFreadParsingContext *ctx) {}
710