1 #include "data.table.h"
2
isRealReallyInt(SEXP x)3 bool isRealReallyInt(SEXP x) {
4 if (!isReal(x)) return(false);
5 R_xlen_t n=xlength(x), i=0;
6 double *dx = REAL(x);
7 while (i<n &&
8 ( ISNA(dx[i]) ||
9 ( R_FINITE(dx[i]) && dx[i] == (int)(dx[i])))) {
10 i++;
11 }
12 return i==n;
13 }
14
isReallyReal(SEXP x)15 SEXP isReallyReal(SEXP x) {
16 SEXP ans = PROTECT(allocVector(INTSXP, 1));
17 INTEGER(ans)[0] = 0;
18 // return 0 (FALSE) when not type double, or is type double but contains integers
19 // used to error if not passed type double but this needed extra is.double() calls in calling R code
20 // which needed a repeat of the argument. Hence simpler and more robust to return 0 when not type double.
21 if (isReal(x)) {
22 int n=length(x), i=0;
23 double *dx = REAL(x);
24 while (i<n &&
25 ( ISNA(dx[i]) ||
26 ( R_FINITE(dx[i]) && dx[i] == (int)(dx[i])))) {
27 i++;
28 }
29 if (i<n) INTEGER(ans)[0] = i+1; // return the location of first element which is really real; i.e. not an integer
30 }
31 UNPROTECT(1);
32 return(ans);
33 }
34
allNA(SEXP x,bool errorForBadType)35 bool allNA(SEXP x, bool errorForBadType) {
36 // less space and time than all(is.na(x)) at R level because that creates full size is.na(x) first before all()
37 // whereas this allNA can often return early on testing the first value without reading the rest
38 const int n = length(x);
39 if (n==0) // empty vectors (including raw(), NULL, and list()) same as R's all(is.na()) true result; tests 2116.*
40 return true;
41 switch (TYPEOF(x)) {
42 case RAWSXP: // raw doesn't support NA so always false (other than length 0 case above)
43 return false;
44 case LGLSXP:
45 case INTSXP: {
46 const int *xd = INTEGER(x);
47 for (int i=0; i<n; ++i) if (xd[i]!=NA_INTEGER) {
48 return false;
49 }
50 return true;
51 }
52 case REALSXP:
53 if (Rinherits(x,char_integer64)) {
54 const int64_t *xd = (int64_t *)REAL(x);
55 for (int i=0; i<n; ++i) if (xd[i]!=NA_INTEGER64) {
56 return false;
57 }
58 } else {
59 const double *xd = REAL(x);
60 for (int i=0; i<n; ++i) if (!ISNAN(xd[i])) {
61 return false;
62 }
63 }
64 return true;
65 case CPLXSXP: {
66 const Rcomplex *xd = COMPLEX(x);
67 for (int i=0; i<n; ++i) if (!ISNAN_COMPLEX(xd[i])) {
68 return false;
69 }
70 return true;
71 }
72 case STRSXP: {
73 const SEXP *xd = STRING_PTR(x);
74 for (int i=0; i<n; ++i) if (xd[i]!=NA_STRING) {
75 return false;
76 }
77 return true;
78 }}
79 if (!errorForBadType) return false;
80 error(_("Unsupported type '%s' passed to allNA()"), type2char(TYPEOF(x))); // e.g. VECSXP; tests 2116.16-18
81 // turned off allNA list support for now to avoid accidentally using it internally where we did not intend; allNA not yet exported
82 // https://github.com/Rdatatable/data.table/pull/3909#discussion_r329065950
83 }
84
allNAR(SEXP x)85 SEXP allNAR(SEXP x) {
86 return ScalarLogical(allNA(x, /*errorForBadType=*/true));
87 }
88
89 /* colnamesInt
90 * for provided data.table (or a list-like) and a subset of its columns, it returns integer positions of those columns in DT
91 * handle columns input as: integer, double, character and NULL (handled as seq_along(x))
92 * adds validation for:
93 * correct range [1,ncol], and if type real checks whole integer
94 * existing columns for character
95 * optionally check for no duplicates
96 */
colnamesInt(SEXP x,SEXP cols,SEXP check_dups)97 SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups) {
98 if (!isNewList(x))
99 error(_("'x' argument must be data.table compatible"));
100 if (!IS_TRUE_OR_FALSE(check_dups))
101 error(_("'check_dups' argument must be TRUE or FALSE"));
102 int protecti = 0;
103 R_len_t nx = length(x);
104 R_len_t nc = length(cols);
105 SEXP ricols = R_NilValue;
106 if (isNull(cols)) { // seq_along(x)
107 ricols = PROTECT(allocVector(INTSXP, nx)); protecti++;
108 int *icols = INTEGER(ricols);
109 for (int i=0; i<nx; i++) icols[i] = i+1;
110 } else if (length(cols)==0) { // integer(0)
111 ricols = PROTECT(allocVector(INTSXP, 0)); protecti++;
112 } else if (isInteger(cols) || isReal(cols)) {
113 if (isInteger(cols)) {
114 ricols = cols;
115 } else if (isReal(cols)) {
116 if (!isRealReallyInt(cols))
117 error(_("argument specifying columns is type 'double' and one or more items in it are not whole integers"));
118 ricols = PROTECT(coerceVector(cols, INTSXP)); protecti++;
119 }
120 int *icols = INTEGER(ricols);
121 for (int i=0; i<nc; i++) {
122 if ((icols[i]>nx) || (icols[i]<1))
123 error(_("argument specifying columns specify non existing column(s): cols[%d]=%d"), i+1, icols[i]); // handles NAs also
124 }
125 } else if (isString(cols)) {
126 SEXP xnames = PROTECT(getAttrib(x, R_NamesSymbol)); protecti++;
127 if (isNull(xnames))
128 error(_("'x' argument data.table has no names"));
129 ricols = PROTECT(chmatch(cols, xnames, 0)); protecti++;
130 int *icols = INTEGER(ricols);
131 for (int i=0; i<nc; i++) {
132 if (icols[i]==0)
133 error(_("argument specifying columns specify non existing column(s): cols[%d]='%s'"), i+1, CHAR(STRING_ELT(cols, i))); // handles NAs also
134 }
135 } else {
136 error(_("argument specifying columns must be character or numeric"));
137 }
138 if (LOGICAL(check_dups)[0] && any_duplicated(ricols, FALSE))
139 error(_("argument specifying columns specify duplicated column(s)"));
140 UNPROTECT(protecti);
141 return ricols;
142 }
143
coerceFill(SEXP fill,double * dfill,int32_t * ifill,int64_t * i64fill)144 void coerceFill(SEXP fill, double *dfill, int32_t *ifill, int64_t *i64fill) {
145 if (xlength(fill) != 1) error(_("%s: fill argument must be length 1"), __func__);
146 if (isInteger(fill)) {
147 if (INTEGER(fill)[0]==NA_INTEGER) {
148 ifill[0] = NA_INTEGER; dfill[0] = NA_REAL; i64fill[0] = NA_INTEGER64;
149 } else {
150 ifill[0] = INTEGER(fill)[0];
151 dfill[0] = (double)(INTEGER(fill)[0]);
152 i64fill[0] = (int64_t)(INTEGER(fill)[0]);
153 }
154 } else if (isReal(fill)) {
155 if (Rinherits(fill,char_integer64)) { // Rinherits true for nanotime
156 int64_t rfill = ((int64_t *)REAL(fill))[0];
157 if (rfill==NA_INTEGER64) {
158 ifill[0] = NA_INTEGER; dfill[0] = NA_REAL; i64fill[0] = NA_INTEGER64;
159 } else {
160 ifill[0] = (rfill>INT32_MAX || rfill<=INT32_MIN) ? NA_INTEGER : (int32_t)rfill;
161 dfill[0] = (double)rfill;
162 i64fill[0] = rfill;
163 }
164 } else {
165 double rfill = REAL(fill)[0];
166 if (ISNAN(rfill)) {
167 // NA -> NA, NaN -> NaN
168 ifill[0] = NA_INTEGER; dfill[0] = rfill; i64fill[0] = NA_INTEGER64;
169 } else {
170 ifill[0] = (!R_FINITE(rfill) || rfill>INT32_MAX || rfill<=INT32_MIN) ? NA_INTEGER : (int32_t)rfill;
171 dfill[0] = rfill;
172 i64fill[0] = (!R_FINITE(rfill) || rfill>(double)INT64_MAX || rfill<=(double)INT64_MIN) ? NA_INTEGER64 : (int64_t)rfill;
173 }
174 }
175 } else if (isLogical(fill) && LOGICAL(fill)[0]==NA_LOGICAL) {
176 ifill[0] = NA_INTEGER; dfill[0] = NA_REAL; i64fill[0] = NA_INTEGER64;
177 } else {
178 error(_("%s: fill argument must be numeric"), __func__);
179 }
180 }
coerceFillR(SEXP fill)181 SEXP coerceFillR(SEXP fill) {
182 int protecti=0;
183 double dfill=NA_REAL;
184 int32_t ifill=NA_INTEGER;
185 int64_t i64fill=NA_INTEGER64;
186 coerceFill(fill, &dfill, &ifill, &i64fill);
187 SEXP ans = PROTECT(allocVector(VECSXP, 3)); protecti++;
188 SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 1));
189 SET_VECTOR_ELT(ans, 1, allocVector(REALSXP, 1));
190 SET_VECTOR_ELT(ans, 2, allocVector(REALSXP, 1));
191 INTEGER(VECTOR_ELT(ans, 0))[0] = ifill;
192 REAL(VECTOR_ELT(ans, 1))[0] = dfill;
193 ((int64_t *)REAL(VECTOR_ELT(ans, 2)))[0] = i64fill;
194 setAttrib(VECTOR_ELT(ans, 2), R_ClassSymbol, ScalarString(char_integer64));
195 UNPROTECT(protecti);
196 return ans;
197 }
198
INHERITS(SEXP x,SEXP char_)199 inline bool INHERITS(SEXP x, SEXP char_) {
200 // Thread safe inherits() by pre-calling install() in init.c and then
201 // passing those char_* in here for simple and fast non-API pointer compare.
202 // The thread-safety aspect here is only currently actually needed for list columns in
203 // fwrite() where the class of the cell's vector is tested; the class of the column
204 // itself is pre-stored by fwrite (for example in isInteger64[] and isITime[]).
205 // Thread safe in the limited sense of correct and intended usage :
206 // i) no API call such as install() or mkChar() must be passed in.
207 // ii) no attrib writes must be possible in other threads.
208 SEXP klass;
209 if (isString(klass = getAttrib(x, R_ClassSymbol))) {
210 for (int i=0; i<LENGTH(klass); i++) {
211 if (STRING_ELT(klass, i) == char_) return true;
212 }
213 }
214 return false;
215 }
216
Rinherits(SEXP x,SEXP char_)217 bool Rinherits(SEXP x, SEXP char_) {
218 // motivation was nanotime which is S4 and inherits from integer64 via S3 extends
219 // R's C API inherits() does not cover S4 and returns FALSE for nanotime, as does our own INHERITS above.
220 // R's R-level inherits() calls objects.c:inherits2 which calls attrib.c:R_data_class2 and
221 // then attrib.c:S4_extends which itself calls R level methods:::.extendsForS3 which then calls R level methods::extends.
222 // Since that chain of calls is so complicated and involves evaluating R level anyway, let's just reuse it.
223 // Rinherits prefix with 'R' to signify i) it may call R level and is therefore not thread safe, and ii) includes R level inherits which covers S4.
224 bool ans = INHERITS(x, char_); // try standard S3 class character vector first
225 if (!ans && char_==char_integer64) // save the eval() for known S4 classes that inherit from integer64
226 ans = INHERITS(x, char_nanotime); // comment this out to test the eval() works for nanotime
227 if (!ans && IS_S4_OBJECT(x)) { // if it's not S4 we can save the overhead of R eval()
228 SEXP vec = PROTECT(ScalarString(char_)); // TODO: cover this branch by making two new test S4 classes: one that
229 SEXP call = PROTECT(lang3(sym_inherits, x, vec)); // does inherit from integer64 and one that doesn't
230 ans = LOGICAL(eval(call, R_GlobalEnv))[0]==1;
231 UNPROTECT(2);
232 }
233 return ans;
234 }
235
copyAsPlain(SEXP x)236 SEXP copyAsPlain(SEXP x) {
237 // v1.12.2 and before used standard R duplicate() to do this. But duplicate() is not guaranteed to not return an ALTREP.
238 // e.g. ALTREP 'wrapper' on factor column (with materialized INTSXP) in package VIM under example(hotdeck)
239 // .Internal(inspect(x[[5]]))
240 // @558adf4d9508 13 INTSXP g0c0 [OBJ,NAM(7),ATT] wrapper [srt=-2147483648,no_na=0]
241 // 'AsPlain' is intended to convey unALTREP-ing; i.e. materializing and removing any ALTREP wrappers/attributes
242 // For non-ALTREP this should do the same as R's duplicate().
243 // Intended for use on columns; to either un-ALTREP them or duplicate shared memory columns; see copySharedColumns() below
244 // Not intended to be called on a DT VECSXP where a concept of 'deep' might refer to whether the columns are copied
245
246 if (isNull(x)) {
247 // deal with up front because isNewList(R_NilValue) is true
248 return R_NilValue;
249 }
250 if (!isVectorAtomic(x) && !isNewList(x)) {
251 // e.g. defer to R the CLOSXP in test 173.3 where a list column item is the function 'mean'
252 return duplicate(x);
253 }
254 const int64_t n = XLENGTH(x);
255 SEXP ans = PROTECT(allocVector(TYPEOF(x), n));
256 switch (TYPEOF(x)) {
257 case RAWSXP:
258 memcpy(RAW(ans), RAW(x), n*sizeof(Rbyte));
259 break;
260 case LGLSXP:
261 memcpy(LOGICAL(ans), LOGICAL(x), n*sizeof(Rboolean));
262 break;
263 case INTSXP:
264 memcpy(INTEGER(ans), INTEGER(x), n*sizeof(int)); // covered by 10:1 after test 178
265 break;
266 case REALSXP:
267 memcpy(REAL(ans), REAL(x), n*sizeof(double)); // covered by as.Date("2013-01-01")+seq(1,1000,by=10) after test 1075
268 break;
269 case CPLXSXP:
270 memcpy(COMPLEX(ans), COMPLEX(x), n*sizeof(Rcomplex));
271 break;
272 case STRSXP: {
273 const SEXP *xp=STRING_PTR(x); // covered by as.character(as.hexmode(1:500)) after test 642
274 for (int64_t i=0; i<n; ++i) SET_STRING_ELT(ans, i, xp[i]);
275 } break;
276 case VECSXP: {
277 const SEXP *xp=SEXPPTR_RO(x);
278 for (int64_t i=0; i<n; ++i) SET_VECTOR_ELT(ans, i, copyAsPlain(xp[i]));
279 } break;
280 default: // # nocov
281 error(_("Internal error: unsupported type '%s' passed to copyAsPlain()"), type2char(TYPEOF(x))); // # nocov
282 }
283 DUPLICATE_ATTRIB(ans, x);
284 // aside: unlike R's duplicate we do not copy truelength here; important for dogroups.c which uses negative truelenth to mark its specials
285 if (ALTREP(ans))
286 error(_("Internal error: copyAsPlain returning ALTREP for type '%s'"), type2char(TYPEOF(x))); // # nocov
287 UNPROTECT(1);
288 return ans;
289 }
290
copySharedColumns(SEXP x)291 void copySharedColumns(SEXP x) {
292 const int ncol = length(x);
293 if (!isNewList(x) || ncol==1) return;
294 bool *shared = (bool *)R_alloc(ncol, sizeof(bool)); // on R heap in case alloc fails
295 int *savetl = (int *)R_alloc(ncol, sizeof(int)); // on R heap for convenience but could be a calloc
296 const SEXP *xp = SEXPPTR_RO(x);
297 // first save the truelength, which may be negative on specials in dogroups, and set to zero; test 2157
298 // the savetl() function elsewhere is for CHARSXP. Here, we are using truelength on atomic vectors.
299 for (int i=0; i<ncol; ++i) {
300 const SEXP thiscol = xp[i];
301 savetl[i] = ALTREP(thiscol) ? 0 : TRUELENGTH(thiscol);
302 SET_TRUELENGTH(thiscol, 0);
303 }
304 int nShared=0;
305 for (int i=0; i<ncol; ++i) {
306 SEXP thiscol = xp[i];
307 if (ALTREP(thiscol) || TRUELENGTH(thiscol)<0) {
308 shared[i] = true; // we mark ALTREP as 'shared' too, whereas 'tocopy' would be better word to use for ALTREP
309 nShared++;
310 // do not copyAsPlain() here yet, as its alloc might fail. Must restore tl first to all columns before attempting any copies.
311 } else {
312 shared[i] = false; // so the first column will never be shared (unless it is an altrep) even it is shared
313 // 'shared' means a later column shares an earlier column
314 SET_TRUELENGTH(thiscol, -i-1); // -i-1 so that if, for example, column 3 shares column 1, in iteration 3 we'll know not
315 // only that the 3rd column is shared with an earlier column, but which one too. Although
316 // we don't use that information currently, we could do in future.
317 }
318 }
319 // now we know nShared and which ones they are (if any), restore original tl back to the unique set of columns
320 for (int i=0; i<ncol; ++i) {
321 if (!shared[i]) SET_TRUELENGTH(xp[i], savetl[i]);
322 // ^^^^^^^^^^ important because if there are shared columns, the dup will have savetl==0 but we want the first restore to stand
323 }
324 // now that truelength has been restored for all columns, we can finally call copyAsPlain()
325 if (nShared) {
326 for (int i=0; i<ncol; ++i) {
327 if (shared[i])
328 SET_VECTOR_ELT(x, i, copyAsPlain(xp[i]));
329 }
330 if (GetVerbose()) Rprintf(_("Found and copied %d column%s with a shared memory address\n"), nShared, nShared>1?"s":"");
331 // GetVerbose() (slightly expensive call of all options) called here only when needed
332 }
333 }
334
335 // lock, unlock and islocked at C level :
336 // 1) for speed to reduce overhead
337 // 2) to avoid an R level wrapper which bumps MAYBE_SHARED; see the unlock after eval(jval) in data.table.R, #1341 #2245
lock(SEXP DT)338 SEXP lock(SEXP DT) {
339 setAttrib(DT, sym_datatable_locked, ScalarLogical(TRUE));
340 return DT;
341 }
unlock(SEXP DT)342 SEXP unlock(SEXP DT) {
343 setAttrib(DT, sym_datatable_locked, R_NilValue);
344 return DT;
345 }
islocked(SEXP DT)346 bool islocked(SEXP DT) {
347 SEXP att = getAttrib(DT, sym_datatable_locked);
348 return isLogical(att) && LENGTH(att)==1 && LOGICAL(att)[0]==1;
349 }
islockedR(SEXP DT)350 SEXP islockedR(SEXP DT) {
351 return ScalarLogical(islocked(DT));
352 }
353
need2utf8(SEXP x)354 bool need2utf8(SEXP x) {
355 const int xlen = length(x);
356 SEXP *xd = STRING_PTR(x);
357 for (int i=0; i<xlen; i++) {
358 if (NEED2UTF8(xd[i]))
359 return(true);
360 }
361 return(false);
362 }
363
coerceUtf8IfNeeded(SEXP x)364 SEXP coerceUtf8IfNeeded(SEXP x) {
365 if (!need2utf8(x))
366 return(x);
367 const int xlen = length(x);
368 SEXP ans = PROTECT(allocVector(STRSXP, xlen));
369 SEXP *xd = STRING_PTR(x);
370 for (int i=0; i<xlen; i++) {
371 SET_STRING_ELT(ans, i, ENC2UTF8(xd[i]));
372 }
373 UNPROTECT(1);
374 return(ans);
375 }
376
377 #ifndef NOZLIB
378 #include <zlib.h>
379 #endif
dt_zlib_version()380 SEXP dt_zlib_version() {
381 char out[51];
382 #ifndef NOZLIB
383 snprintf(out, 50, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION);
384 #else
385 snprintf(out, 50, "zlib header files were not found when data.table was compiled");
386 #endif
387 return ScalarString(mkChar(out));
388 }
389
390