1 #include "data.table.h"
2 
isRealReallyInt(SEXP x)3 bool isRealReallyInt(SEXP x) {
4   if (!isReal(x)) return(false);
5   R_xlen_t n=xlength(x), i=0;
6   double *dx = REAL(x);
7   while (i<n &&
8          ( ISNA(dx[i]) ||
9          ( R_FINITE(dx[i]) && dx[i] == (int)(dx[i])))) {
10     i++;
11   }
12   return i==n;
13 }
14 
isReallyReal(SEXP x)15 SEXP isReallyReal(SEXP x) {
16   SEXP ans = PROTECT(allocVector(INTSXP, 1));
17   INTEGER(ans)[0] = 0;
18   // return 0 (FALSE) when not type double, or is type double but contains integers
19   // used to error if not passed type double but this needed extra is.double() calls in calling R code
20   // which needed a repeat of the argument. Hence simpler and more robust to return 0 when not type double.
21   if (isReal(x)) {
22     int n=length(x), i=0;
23     double *dx = REAL(x);
24     while (i<n &&
25         ( ISNA(dx[i]) ||
26         ( R_FINITE(dx[i]) && dx[i] == (int)(dx[i])))) {
27       i++;
28     }
29     if (i<n) INTEGER(ans)[0] = i+1;  // return the location of first element which is really real; i.e. not an integer
30   }
31   UNPROTECT(1);
32   return(ans);
33 }
34 
allNA(SEXP x,bool errorForBadType)35 bool allNA(SEXP x, bool errorForBadType) {
36   // less space and time than all(is.na(x)) at R level because that creates full size is.na(x) first before all()
37   // whereas this allNA can often return early on testing the first value without reading the rest
38   const int n = length(x);
39   if (n==0) // empty vectors (including raw(), NULL, and list()) same as R's all(is.na()) true result; tests 2116.*
40     return true;
41   switch (TYPEOF(x)) {
42   case RAWSXP: // raw doesn't support NA so always false (other than length 0 case above)
43     return false;
44   case LGLSXP:
45   case INTSXP: {
46     const int *xd = INTEGER(x);
47     for (int i=0; i<n; ++i)    if (xd[i]!=NA_INTEGER) {
48       return false;
49     }
50     return true;
51   }
52   case REALSXP:
53     if (Rinherits(x,char_integer64)) {
54       const int64_t *xd = (int64_t *)REAL(x);
55       for (int i=0; i<n; ++i)  if (xd[i]!=NA_INTEGER64) {
56         return false;
57       }
58     } else {
59       const double *xd = REAL(x);
60       for (int i=0; i<n; ++i)  if (!ISNAN(xd[i])) {
61         return false;
62       }
63     }
64     return true;
65   case CPLXSXP: {
66     const Rcomplex *xd = COMPLEX(x);
67     for (int i=0; i<n; ++i) if (!ISNAN_COMPLEX(xd[i])) {
68       return false;
69     }
70     return true;
71   }
72   case STRSXP: {
73     const SEXP *xd = STRING_PTR(x);
74     for (int i=0; i<n; ++i)    if (xd[i]!=NA_STRING) {
75       return false;
76     }
77     return true;
78   }}
79   if (!errorForBadType) return false;
80   error(_("Unsupported type '%s' passed to allNA()"), type2char(TYPEOF(x)));  // e.g. VECSXP; tests 2116.16-18
81   // turned off allNA list support for now to avoid accidentally using it internally where we did not intend; allNA not yet exported
82   //   https://github.com/Rdatatable/data.table/pull/3909#discussion_r329065950
83 }
84 
allNAR(SEXP x)85 SEXP allNAR(SEXP x) {
86   return ScalarLogical(allNA(x, /*errorForBadType=*/true));
87 }
88 
89 /* colnamesInt
90  * for provided data.table (or a list-like) and a subset of its columns, it returns integer positions of those columns in DT
91  * handle columns input as: integer, double, character and NULL (handled as seq_along(x))
92  * adds validation for:
93  *   correct range [1,ncol], and if type real checks whole integer
94  *   existing columns for character
95  *   optionally check for no duplicates
96  */
colnamesInt(SEXP x,SEXP cols,SEXP check_dups)97 SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups) {
98   if (!isNewList(x))
99     error(_("'x' argument must be data.table compatible"));
100   if (!IS_TRUE_OR_FALSE(check_dups))
101     error(_("'check_dups' argument must be TRUE or FALSE"));
102   int protecti = 0;
103   R_len_t nx = length(x);
104   R_len_t nc = length(cols);
105   SEXP ricols = R_NilValue;
106   if (isNull(cols)) { // seq_along(x)
107     ricols = PROTECT(allocVector(INTSXP, nx)); protecti++;
108     int *icols = INTEGER(ricols);
109     for (int i=0; i<nx; i++) icols[i] = i+1;
110   } else if (length(cols)==0) { // integer(0)
111     ricols = PROTECT(allocVector(INTSXP, 0)); protecti++;
112   } else if (isInteger(cols) || isReal(cols)) {
113     if (isInteger(cols)) {
114       ricols = cols;
115     } else if (isReal(cols)) {
116       if (!isRealReallyInt(cols))
117         error(_("argument specifying columns is type 'double' and one or more items in it are not whole integers"));
118       ricols = PROTECT(coerceVector(cols, INTSXP)); protecti++;
119     }
120     int *icols = INTEGER(ricols);
121     for (int i=0; i<nc; i++) {
122       if ((icols[i]>nx) || (icols[i]<1))
123         error(_("argument specifying columns specify non existing column(s): cols[%d]=%d"), i+1, icols[i]); // handles NAs also
124     }
125   } else if (isString(cols)) {
126     SEXP xnames = PROTECT(getAttrib(x, R_NamesSymbol)); protecti++;
127     if (isNull(xnames))
128       error(_("'x' argument data.table has no names"));
129     ricols = PROTECT(chmatch(cols, xnames, 0)); protecti++;
130     int *icols = INTEGER(ricols);
131     for (int i=0; i<nc; i++) {
132       if (icols[i]==0)
133         error(_("argument specifying columns specify non existing column(s): cols[%d]='%s'"), i+1, CHAR(STRING_ELT(cols, i))); // handles NAs also
134     }
135   } else {
136     error(_("argument specifying columns must be character or numeric"));
137   }
138   if (LOGICAL(check_dups)[0] && any_duplicated(ricols, FALSE))
139     error(_("argument specifying columns specify duplicated column(s)"));
140   UNPROTECT(protecti);
141   return ricols;
142 }
143 
coerceFill(SEXP fill,double * dfill,int32_t * ifill,int64_t * i64fill)144 void coerceFill(SEXP fill, double *dfill, int32_t *ifill, int64_t *i64fill) {
145   if (xlength(fill) != 1) error(_("%s: fill argument must be length 1"), __func__);
146   if (isInteger(fill)) {
147     if (INTEGER(fill)[0]==NA_INTEGER) {
148       ifill[0] = NA_INTEGER; dfill[0] = NA_REAL; i64fill[0] = NA_INTEGER64;
149     } else {
150       ifill[0] = INTEGER(fill)[0];
151       dfill[0] = (double)(INTEGER(fill)[0]);
152       i64fill[0] = (int64_t)(INTEGER(fill)[0]);
153     }
154   } else if (isReal(fill)) {
155     if (Rinherits(fill,char_integer64)) {  // Rinherits true for nanotime
156       int64_t rfill = ((int64_t *)REAL(fill))[0];
157       if (rfill==NA_INTEGER64) {
158         ifill[0] = NA_INTEGER; dfill[0] = NA_REAL; i64fill[0] = NA_INTEGER64;
159       } else {
160         ifill[0] = (rfill>INT32_MAX || rfill<=INT32_MIN) ? NA_INTEGER : (int32_t)rfill;
161         dfill[0] = (double)rfill;
162         i64fill[0] = rfill;
163       }
164     } else {
165       double rfill = REAL(fill)[0];
166       if (ISNAN(rfill)) {
167         // NA -> NA, NaN -> NaN
168         ifill[0] = NA_INTEGER; dfill[0] = rfill; i64fill[0] = NA_INTEGER64;
169       } else {
170         ifill[0] = (!R_FINITE(rfill) || rfill>INT32_MAX || rfill<=INT32_MIN) ? NA_INTEGER : (int32_t)rfill;
171         dfill[0] = rfill;
172         i64fill[0] = (!R_FINITE(rfill) || rfill>(double)INT64_MAX || rfill<=(double)INT64_MIN) ? NA_INTEGER64 : (int64_t)rfill;
173       }
174     }
175   } else if (isLogical(fill) && LOGICAL(fill)[0]==NA_LOGICAL) {
176     ifill[0] = NA_INTEGER; dfill[0] = NA_REAL; i64fill[0] = NA_INTEGER64;
177   } else {
178     error(_("%s: fill argument must be numeric"), __func__);
179   }
180 }
coerceFillR(SEXP fill)181 SEXP coerceFillR(SEXP fill) {
182   int protecti=0;
183   double dfill=NA_REAL;
184   int32_t ifill=NA_INTEGER;
185   int64_t i64fill=NA_INTEGER64;
186   coerceFill(fill, &dfill, &ifill, &i64fill);
187   SEXP ans = PROTECT(allocVector(VECSXP, 3)); protecti++;
188   SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 1));
189   SET_VECTOR_ELT(ans, 1, allocVector(REALSXP, 1));
190   SET_VECTOR_ELT(ans, 2, allocVector(REALSXP, 1));
191   INTEGER(VECTOR_ELT(ans, 0))[0] = ifill;
192   REAL(VECTOR_ELT(ans, 1))[0] = dfill;
193   ((int64_t *)REAL(VECTOR_ELT(ans, 2)))[0] = i64fill;
194   setAttrib(VECTOR_ELT(ans, 2), R_ClassSymbol, ScalarString(char_integer64));
195   UNPROTECT(protecti);
196   return ans;
197 }
198 
INHERITS(SEXP x,SEXP char_)199 inline bool INHERITS(SEXP x, SEXP char_) {
200   // Thread safe inherits() by pre-calling install() in init.c and then
201   // passing those char_* in here for simple and fast non-API pointer compare.
202   // The thread-safety aspect here is only currently actually needed for list columns in
203   // fwrite() where the class of the cell's vector is tested; the class of the column
204   // itself is pre-stored by fwrite (for example in isInteger64[] and isITime[]).
205   // Thread safe in the limited sense of correct and intended usage :
206   // i) no API call such as install() or mkChar() must be passed in.
207   // ii) no attrib writes must be possible in other threads.
208   SEXP klass;
209   if (isString(klass = getAttrib(x, R_ClassSymbol))) {
210     for (int i=0; i<LENGTH(klass); i++) {
211       if (STRING_ELT(klass, i) == char_) return true;
212     }
213   }
214   return false;
215 }
216 
Rinherits(SEXP x,SEXP char_)217 bool Rinherits(SEXP x, SEXP char_) {
218   // motivation was nanotime which is S4 and inherits from integer64 via S3 extends
219   // R's C API inherits() does not cover S4 and returns FALSE for nanotime, as does our own INHERITS above.
220   // R's R-level inherits() calls objects.c:inherits2 which calls attrib.c:R_data_class2 and
221   // then attrib.c:S4_extends which itself calls R level methods:::.extendsForS3 which then calls R level methods::extends.
222   // Since that chain of calls is so complicated and involves evaluating R level anyway, let's just reuse it.
223   // Rinherits prefix with 'R' to signify i) it may call R level and is therefore not thread safe, and ii) includes R level inherits which covers S4.
224   bool ans = INHERITS(x, char_);        // try standard S3 class character vector first
225   if (!ans && char_==char_integer64)    // save the eval() for known S4 classes that inherit from integer64
226     ans = INHERITS(x, char_nanotime);   // comment this out to test the eval() works for nanotime
227   if (!ans && IS_S4_OBJECT(x)) {        // if it's not S4 we can save the overhead of R eval()
228     SEXP vec = PROTECT(ScalarString(char_));           // TODO: cover this branch by making two new test S4 classes: one that
229     SEXP call = PROTECT(lang3(sym_inherits, x, vec));  //       does inherit from integer64 and one that doesn't
230     ans = LOGICAL(eval(call, R_GlobalEnv))[0]==1;
231     UNPROTECT(2);
232   }
233   return ans;
234 }
235 
copyAsPlain(SEXP x)236 SEXP copyAsPlain(SEXP x) {
237   // v1.12.2 and before used standard R duplicate() to do this. But duplicate() is not guaranteed to not return an ALTREP.
238   // e.g. ALTREP 'wrapper' on factor column (with materialized INTSXP) in package VIM under example(hotdeck)
239   //      .Internal(inspect(x[[5]]))
240   //      @558adf4d9508 13 INTSXP g0c0 [OBJ,NAM(7),ATT]  wrapper [srt=-2147483648,no_na=0]
241   // 'AsPlain' is intended to convey unALTREP-ing; i.e. materializing and removing any ALTREP wrappers/attributes
242   // For non-ALTREP this should do the same as R's duplicate().
243   // Intended for use on columns; to either un-ALTREP them or duplicate shared memory columns; see copySharedColumns() below
244   // Not intended to be called on a DT VECSXP where a concept of 'deep' might refer to whether the columns are copied
245 
246   if (isNull(x)) {
247     // deal with up front because isNewList(R_NilValue) is true
248     return R_NilValue;
249   }
250   if (!isVectorAtomic(x) && !isNewList(x)) {
251     // e.g. defer to R the CLOSXP in test 173.3 where a list column item is the function 'mean'
252     return duplicate(x);
253   }
254   const int64_t n = XLENGTH(x);
255   SEXP ans = PROTECT(allocVector(TYPEOF(x), n));
256   switch (TYPEOF(x)) {
257   case RAWSXP:
258     memcpy(RAW(ans),     RAW(x),     n*sizeof(Rbyte));
259     break;
260   case LGLSXP:
261     memcpy(LOGICAL(ans), LOGICAL(x), n*sizeof(Rboolean));
262     break;
263   case INTSXP:
264     memcpy(INTEGER(ans), INTEGER(x), n*sizeof(int));             // covered by 10:1 after test 178
265     break;
266   case REALSXP:
267     memcpy(REAL(ans),    REAL(x),    n*sizeof(double));          // covered by as.Date("2013-01-01")+seq(1,1000,by=10) after test 1075
268     break;
269   case CPLXSXP:
270     memcpy(COMPLEX(ans), COMPLEX(x), n*sizeof(Rcomplex));
271     break;
272   case STRSXP: {
273     const SEXP *xp=STRING_PTR(x);                                // covered by as.character(as.hexmode(1:500)) after test 642
274     for (int64_t i=0; i<n; ++i) SET_STRING_ELT(ans, i, xp[i]);
275   } break;
276   case VECSXP: {
277     const SEXP *xp=SEXPPTR_RO(x);
278     for (int64_t i=0; i<n; ++i) SET_VECTOR_ELT(ans, i, copyAsPlain(xp[i]));
279   } break;
280   default:                                                                                           // # nocov
281     error(_("Internal error: unsupported type '%s' passed to copyAsPlain()"), type2char(TYPEOF(x))); // # nocov
282   }
283   DUPLICATE_ATTRIB(ans, x);
284   // aside: unlike R's duplicate we do not copy truelength here; important for dogroups.c which uses negative truelenth to mark its specials
285   if (ALTREP(ans))
286     error(_("Internal error: copyAsPlain returning ALTREP for type '%s'"), type2char(TYPEOF(x))); // # nocov
287   UNPROTECT(1);
288   return ans;
289 }
290 
copySharedColumns(SEXP x)291 void copySharedColumns(SEXP x) {
292   const int ncol = length(x);
293   if (!isNewList(x) || ncol==1) return;
294   bool *shared = (bool *)R_alloc(ncol, sizeof(bool)); // on R heap in case alloc fails
295   int *savetl = (int *)R_alloc(ncol, sizeof(int));  // on R heap for convenience but could be a calloc
296   const SEXP *xp = SEXPPTR_RO(x);
297   // first save the truelength, which may be negative on specials in dogroups, and set to zero; test 2157
298   // the savetl() function elsewhere is for CHARSXP. Here, we are using truelength on atomic vectors.
299   for (int i=0; i<ncol; ++i) {
300     const SEXP thiscol = xp[i];
301     savetl[i] = ALTREP(thiscol) ? 0 : TRUELENGTH(thiscol);
302     SET_TRUELENGTH(thiscol, 0);
303   }
304   int nShared=0;
305   for (int i=0; i<ncol; ++i) {
306     SEXP thiscol = xp[i];
307     if (ALTREP(thiscol) || TRUELENGTH(thiscol)<0) {
308       shared[i] = true;  // we mark ALTREP as 'shared' too, whereas 'tocopy' would be better word to use for ALTREP
309       nShared++;
310       // do not copyAsPlain() here yet, as its alloc might fail. Must restore tl first to all columns before attempting any copies.
311     } else {
312       shared[i] = false;              // so the first column will never be shared (unless it is an altrep) even it is shared
313                                       // 'shared' means a later column shares an earlier column
314       SET_TRUELENGTH(thiscol, -i-1);  // -i-1 so that if, for example, column 3 shares column 1, in iteration 3 we'll know not
315                                       // only that the 3rd column is shared with an earlier column, but which one too. Although
316                                       // we don't use that information currently, we could do in future.
317     }
318   }
319   // now we know nShared and which ones they are (if any), restore original tl back to the unique set of columns
320   for (int i=0; i<ncol; ++i) {
321     if (!shared[i]) SET_TRUELENGTH(xp[i], savetl[i]);
322     //  ^^^^^^^^^^ important because if there are shared columns, the dup will have savetl==0 but we want the first restore to stand
323   }
324   // now that truelength has been restored for all columns, we can finally call copyAsPlain()
325   if (nShared) {
326     for (int i=0; i<ncol; ++i) {
327       if (shared[i])
328         SET_VECTOR_ELT(x, i, copyAsPlain(xp[i]));
329     }
330     if (GetVerbose()) Rprintf(_("Found and copied %d column%s with a shared memory address\n"), nShared, nShared>1?"s":"");
331     // GetVerbose() (slightly expensive call of all options) called here only when needed
332   }
333 }
334 
335 // lock, unlock and islocked at C level :
336 // 1) for speed to reduce overhead
337 // 2) to avoid an R level wrapper which bumps MAYBE_SHARED; see the unlock after eval(jval) in data.table.R, #1341 #2245
lock(SEXP DT)338 SEXP lock(SEXP DT) {
339   setAttrib(DT, sym_datatable_locked, ScalarLogical(TRUE));
340   return DT;
341 }
unlock(SEXP DT)342 SEXP unlock(SEXP DT) {
343   setAttrib(DT, sym_datatable_locked, R_NilValue);
344   return DT;
345 }
islocked(SEXP DT)346 bool islocked(SEXP DT) {
347   SEXP att = getAttrib(DT, sym_datatable_locked);
348   return isLogical(att) && LENGTH(att)==1 && LOGICAL(att)[0]==1;
349 }
islockedR(SEXP DT)350 SEXP islockedR(SEXP DT) {
351   return ScalarLogical(islocked(DT));
352 }
353 
need2utf8(SEXP x)354 bool need2utf8(SEXP x) {
355   const int xlen = length(x);
356   SEXP *xd = STRING_PTR(x);
357   for (int i=0; i<xlen; i++) {
358     if (NEED2UTF8(xd[i]))
359       return(true);
360   }
361   return(false);
362 }
363 
coerceUtf8IfNeeded(SEXP x)364 SEXP coerceUtf8IfNeeded(SEXP x) {
365   if (!need2utf8(x))
366     return(x);
367   const int xlen = length(x);
368   SEXP ans = PROTECT(allocVector(STRSXP, xlen));
369   SEXP *xd = STRING_PTR(x);
370   for (int i=0; i<xlen; i++) {
371     SET_STRING_ELT(ans, i, ENC2UTF8(xd[i]));
372   }
373   UNPROTECT(1);
374   return(ans);
375 }
376 
377 #ifndef NOZLIB
378 #include <zlib.h>
379 #endif
dt_zlib_version()380 SEXP dt_zlib_version() {
381   char out[51];
382 #ifndef NOZLIB
383   snprintf(out, 50, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION);
384 #else
385   snprintf(out, 50, "zlib header files were not found when data.table was compiled");
386 #endif
387   return ScalarString(mkChar(out));
388 }
389 
390