1 #include "data.table.h"
2 #include <Rdefines.h>
3 #include <R_ext/Rdynload.h>
4 #include <R_ext/Visibility.h>
5 
6 // global constants extern in data.table.h for gcc10 -fno-common; #4091
7 // these are written to once here on initialization, but because of that write they can't be declared const
8 SEXP char_integer64;
9 SEXP char_ITime;
10 SEXP char_IDate;
11 SEXP char_Date;
12 SEXP char_POSIXct;
13 SEXP char_POSIXt;
14 SEXP char_UTC;
15 SEXP char_nanotime;
16 SEXP char_lens;
17 SEXP char_indices;
18 SEXP char_allLen1;
19 SEXP char_allGrp1;
20 SEXP char_factor;
21 SEXP char_ordered;
22 SEXP char_datatable;
23 SEXP char_dataframe;
24 SEXP char_NULL;
25 SEXP sym_sorted;
26 SEXP sym_index;
27 SEXP sym_BY;
28 SEXP sym_starts, char_starts;
29 SEXP sym_maxgrpn;
30 SEXP sym_colClassesAs;
31 SEXP sym_verbose;
32 SEXP SelfRefSymbol;
33 SEXP sym_inherits;
34 SEXP sym_datatable_locked;
35 SEXP sym_tzone;
36 SEXP sym_old_fread_datetime_character;
37 double NA_INT64_D;
38 long long NA_INT64_LL;
39 Rcomplex NA_CPLX;
40 size_t __sizes[100];
41 size_t __typeorder[100];
42 
43 // .Calls
44 SEXP setattrib();
45 SEXP bmerge();
46 SEXP assign();
47 SEXP dogroups();
48 SEXP copy();
49 SEXP shallowwrapper();
50 SEXP alloccolwrapper();
51 SEXP selfrefokwrapper();
52 SEXP truelength();
53 SEXP setcharvec();
54 SEXP setcolorder();
55 SEXP chmatch_R();
56 SEXP chmatchdup_R();
57 SEXP chin_R();
58 SEXP fifelseR();
59 SEXP fcaseR();
60 SEXP freadR();
61 SEXP fwriteR();
62 SEXP reorder();
63 SEXP rbindlist();
64 SEXP vecseq();
65 SEXP setlistelt();
66 SEXP address();
67 SEXP expandAltRep();
68 SEXP fmelt();
69 SEXP fcast();
70 SEXP uniqlist();
71 SEXP uniqlengths();
72 SEXP forder();
73 SEXP issorted();
74 SEXP gforce();
75 SEXP gsum();
76 SEXP gmean();
77 SEXP gmin();
78 SEXP gmax();
79 SEXP isOrderedSubset();
80 SEXP setNumericRounding();
81 SEXP getNumericRounding();
82 SEXP binary();
83 SEXP subsetDT();
84 SEXP subsetVector();
85 SEXP convertNegAndZeroIdx();
86 SEXP frank();
87 SEXP dt_na();
88 SEXP lookup();
89 SEXP overlaps();
90 SEXP whichwrapper();
91 SEXP shift();
92 SEXP transpose();
93 SEXP anyNA();
94 SEXP isReallyReal();
95 SEXP setlevels();
96 SEXP rleid();
97 SEXP gmedian();
98 SEXP gtail();
99 SEXP ghead();
100 SEXP glast();
101 SEXP gfirst();
102 SEXP gnthvalue();
103 SEXP dim();
104 SEXP gvar();
105 SEXP gsd();
106 SEXP gprod();
107 SEXP nestedid();
108 SEXP setDTthreads();
109 SEXP getDTthreads_R();
110 SEXP nqRecreateIndices();
111 SEXP fsort();
112 SEXP inrange();
113 SEXP between();
114 SEXP hasOpenMP();
115 SEXP uniqueNlogical();
116 SEXP frollfunR();
117 SEXP dllVersion();
118 SEXP nafillR();
119 SEXP colnamesInt();
120 SEXP initLastUpdated();
121 SEXP cj();
122 SEXP lock();
123 SEXP unlock();
124 SEXP islockedR();
125 SEXP allNAR();
126 SEXP test_dt_win_snprintf();
127 SEXP dt_zlib_version();
128 
129 // .Externals
130 SEXP fastmean();
131 
132 static const
133 R_CallMethodDef callMethods[] = {
134 {"Csetattrib", (DL_FUNC) &setattrib, -1},
135 {"Cbmerge", (DL_FUNC) &bmerge, -1},
136 {"Cassign", (DL_FUNC) &assign, -1},
137 {"Cdogroups", (DL_FUNC) &dogroups, -1},
138 {"Ccopy", (DL_FUNC) &copy, -1},
139 {"Cshallowwrapper", (DL_FUNC) &shallowwrapper, -1},
140 {"Calloccolwrapper", (DL_FUNC) &alloccolwrapper, -1},
141 {"Cselfrefokwrapper", (DL_FUNC) &selfrefokwrapper, -1},
142 {"Ctruelength", (DL_FUNC) &truelength, -1},
143 {"Csetcharvec", (DL_FUNC) &setcharvec, -1},
144 {"Csetcolorder", (DL_FUNC) &setcolorder, -1},
145 {"Cchmatch", (DL_FUNC) &chmatch_R, -1},
146 {"Cchmatchdup", (DL_FUNC) &chmatchdup_R, -1},
147 {"Cchin", (DL_FUNC) &chin_R, -1},
148 {"CfreadR", (DL_FUNC) &freadR, -1},
149 {"CfwriteR", (DL_FUNC) &fwriteR, -1},
150 {"Creorder", (DL_FUNC) &reorder, -1},
151 {"Crbindlist", (DL_FUNC) &rbindlist, -1},
152 {"Cvecseq", (DL_FUNC) &vecseq, -1},
153 {"Csetlistelt", (DL_FUNC) &setlistelt, -1},
154 {"Caddress", (DL_FUNC) &address, -1},
155 {"CexpandAltRep", (DL_FUNC) &expandAltRep, -1},
156 {"Cfmelt", (DL_FUNC) &fmelt, -1},
157 {"Cfcast", (DL_FUNC) &fcast, -1},
158 {"Cuniqlist", (DL_FUNC) &uniqlist, -1},
159 {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1},
160 {"Cforder", (DL_FUNC) &forder, -1},
161 {"Cissorted", (DL_FUNC) &issorted, -1},
162 {"Cgforce", (DL_FUNC) &gforce, -1},
163 {"Cgsum", (DL_FUNC) &gsum, -1},
164 {"Cgmean", (DL_FUNC) &gmean, -1},
165 {"Cgmin", (DL_FUNC) &gmin, -1},
166 {"Cgmax", (DL_FUNC) &gmax, -1},
167 {"CisOrderedSubset", (DL_FUNC) &isOrderedSubset, -1},
168 {"CsetNumericRounding", (DL_FUNC) &setNumericRounding, -1},
169 {"CgetNumericRounding", (DL_FUNC) &getNumericRounding, -1},
170 {"Cbinary", (DL_FUNC) &binary, -1},
171 {"CsubsetDT", (DL_FUNC) &subsetDT, -1},
172 {"CsubsetVector", (DL_FUNC) &subsetVector, -1},
173 {"CconvertNegAndZeroIdx", (DL_FUNC) &convertNegAndZeroIdx, -1},
174 {"Cfrank", (DL_FUNC) &frank, -1},
175 {"Cdt_na", (DL_FUNC) &dt_na, -1},
176 {"Clookup", (DL_FUNC) &lookup, -1},
177 {"Coverlaps", (DL_FUNC) &overlaps, -1},
178 {"Cwhichwrapper", (DL_FUNC) &whichwrapper, -1},
179 {"Cshift", (DL_FUNC) &shift, -1},
180 {"Ctranspose", (DL_FUNC) &transpose, -1},
181 {"CanyNA", (DL_FUNC) &anyNA, -1},
182 {"CisReallyReal", (DL_FUNC) &isReallyReal, -1},
183 {"Csetlevels", (DL_FUNC) &setlevels, -1},
184 {"Crleid", (DL_FUNC) &rleid, -1},
185 {"Cgmedian", (DL_FUNC) &gmedian, -1},
186 {"Cgtail", (DL_FUNC) &gtail, -1},
187 {"Cghead", (DL_FUNC) &ghead, -1},
188 {"Cglast", (DL_FUNC) &glast, -1},
189 {"Cgfirst", (DL_FUNC) &gfirst, -1},
190 {"Cgnthvalue", (DL_FUNC) &gnthvalue, -1},
191 {"Cdim", (DL_FUNC) &dim, -1},
192 {"Cgvar", (DL_FUNC) &gvar, -1},
193 {"Cgsd", (DL_FUNC) &gsd, -1},
194 {"Cgprod", (DL_FUNC) &gprod, -1},
195 {"Cnestedid", (DL_FUNC) &nestedid, -1},
196 {"CsetDTthreads", (DL_FUNC) &setDTthreads, -1},
197 {"CgetDTthreads", (DL_FUNC) &getDTthreads_R, -1},
198 {"CnqRecreateIndices", (DL_FUNC) &nqRecreateIndices, -1},
199 {"Cfsort", (DL_FUNC) &fsort, -1},
200 {"Cinrange", (DL_FUNC) &inrange, -1},
201 {"Cbetween", (DL_FUNC) &between, -1},
202 {"ChasOpenMP", (DL_FUNC) &hasOpenMP, -1},
203 {"CuniqueNlogical", (DL_FUNC) &uniqueNlogical, -1},
204 {"CfrollfunR", (DL_FUNC) &frollfunR, -1},
205 {"CdllVersion", (DL_FUNC) &dllVersion, -1},
206 {"CnafillR", (DL_FUNC) &nafillR, -1},
207 {"CcolnamesInt", (DL_FUNC) &colnamesInt, -1},
208 {"CcoerceFillR", (DL_FUNC) &coerceFillR, -1},
209 {"CinitLastUpdated", (DL_FUNC) &initLastUpdated, -1},
210 {"Ccj", (DL_FUNC) &cj, -1},
211 {"Ccoalesce", (DL_FUNC) &coalesce, -1},
212 {"CfifelseR", (DL_FUNC) &fifelseR, -1},
213 {"CfcaseR", (DL_FUNC) &fcaseR, -1},
214 {"C_lock", (DL_FUNC) &lock, -1},  // _ for these 3 to avoid Clock as in time
215 {"C_unlock", (DL_FUNC) &unlock, -1},
216 {"C_islocked", (DL_FUNC) &islockedR, -1},
217 {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1},
218 {"CtestMsgR", (DL_FUNC) &testMsgR, -1},
219 {"C_allNAR", (DL_FUNC) &allNAR, -1},
220 {"Ctest_dt_win_snprintf", (DL_FUNC)&test_dt_win_snprintf, -1},
221 {"Cdt_zlib_version", (DL_FUNC)&dt_zlib_version, -1},
222 {NULL, NULL, 0}
223 };
224 
225 static const
226 R_ExternalMethodDef externalMethods[] = {
227 {"Cfastmean", (DL_FUNC) &fastmean, -1},
228 {NULL, NULL, 0}
229 };
230 
setSizes()231 static void setSizes() {
232   for (int i=0; i<100; ++i) { __sizes[i]=0; __typeorder[i]=0; }
233   // only these types are currently allowed as column types :
234   __sizes[LGLSXP] =  sizeof(int);       __typeorder[LGLSXP] =  0;
235   __sizes[RAWSXP] =  sizeof(Rbyte);     __typeorder[RAWSXP] =  1;
236   __sizes[INTSXP] =  sizeof(int);       __typeorder[INTSXP] =  2;   // integer and factor
237   __sizes[REALSXP] = sizeof(double);    __typeorder[REALSXP] = 3;   // numeric and integer64
238   __sizes[CPLXSXP] = sizeof(Rcomplex);  __typeorder[CPLXSXP] = 4;
239   __sizes[STRSXP] =  sizeof(SEXP *);    __typeorder[STRSXP] =  5;
240   __sizes[VECSXP] =  sizeof(SEXP *);    __typeorder[VECSXP] =  6;   // list column
241   if (sizeof(char *)>8) error(_("Pointers are %d bytes, greater than 8. We have not tested on any architecture greater than 64bit yet."), sizeof(char *));
242   // One place we need the largest sizeof is the working memory malloc in reorder.c
243 }
244 
R_init_datatable(DllInfo * info)245 void attribute_visible R_init_datatable(DllInfo *info)
246 // relies on pkg/src/Makevars to mv data.table.so to datatable.so
247 {
248   // C exported routines, see ?cdt for details
249   R_RegisterCCallable("data.table", "CsubsetDT", (DL_FUNC) &subsetDT);
250 
251   R_registerRoutines(info, NULL, callMethods, NULL, externalMethods);
252   R_useDynamicSymbols(info, FALSE);
253   setSizes();
254   const char *msg = "... failed. Please forward this message to maintainer('data.table').";
255   if ((int)NA_INTEGER != (int)INT_MIN) error(_("Checking NA_INTEGER [%d] == INT_MIN [%d] %s"), NA_INTEGER, INT_MIN, msg);
256   if ((int)NA_INTEGER != (int)NA_LOGICAL) error(_("Checking NA_INTEGER [%d] == NA_LOGICAL [%d] %s"), NA_INTEGER, NA_LOGICAL, msg);
257   if (sizeof(int) != 4) error(_("Checking sizeof(int) [%d] is 4 %s"), sizeof(int), msg);
258   if (sizeof(double) != 8) error(_("Checking sizeof(double) [%d] is 8 %s"), sizeof(double), msg);     // 8 on both 32bit and 64bit
259   // alignof not available in C99: if (alignof(double) != 8) error(_("Checking alignof(double) [%d] is 8 %s"), alignof(double), msg);  // 8 on both 32bit and 64bit
260   if (sizeof(long long) != 8) error(_("Checking sizeof(long long) [%d] is 8 %s"), sizeof(long long), msg);
261   if (sizeof(char *) != 4 && sizeof(char *) != 8) error(_("Checking sizeof(pointer) [%d] is 4 or 8 %s"), sizeof(char *), msg);
262   if (sizeof(SEXP) != sizeof(char *)) error(_("Checking sizeof(SEXP) [%d] == sizeof(pointer) [%d] %s"), sizeof(SEXP), sizeof(char *), msg);
263   if (sizeof(uint64_t) != 8) error(_("Checking sizeof(uint64_t) [%d] is 8 %s"), sizeof(uint64_t), msg);
264   if (sizeof(int64_t) != 8) error(_("Checking sizeof(int64_t) [%d] is 8 %s"), sizeof(int64_t), msg);
265   if (sizeof(signed char) != 1) error(_("Checking sizeof(signed char) [%d] is 1 %s"), sizeof(signed char), msg);
266   if (sizeof(int8_t) != 1) error(_("Checking sizeof(int8_t) [%d] is 1 %s"), sizeof(int8_t), msg);
267   if (sizeof(uint8_t) != 1) error(_("Checking sizeof(uint8_t) [%d] is 1 %s"), sizeof(uint8_t), msg);
268   if (sizeof(int16_t) != 2) error(_("Checking sizeof(int16_t) [%d] is 2 %s"), sizeof(int16_t), msg);
269   if (sizeof(uint16_t) != 2) error(_("Checking sizeof(uint16_t) [%d] is 2 %s"), sizeof(uint16_t), msg);
270 
271   SEXP tmp = PROTECT(allocVector(INTSXP,2));
272   if (LENGTH(tmp)!=2) error(_("Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s"), LENGTH(tmp), msg);
273   if (TRUELENGTH(tmp)!=0) error(_("Checking TRUELENGTH(allocVector(INTSXP,2)) [%d] is 0 %s"), TRUELENGTH(tmp), msg);
274   UNPROTECT(1);
275 
276   // According to IEEE (http://en.wikipedia.org/wiki/IEEE_754-1985#Zero) we can rely on 0.0 being all 0 bits.
277   // But check here anyway just to be sure, just in case this answer is right (http://stackoverflow.com/a/2952680/403310).
278   int i = 314;
279   memset(&i, 0, sizeof(int));
280   if (i != 0) error(_("Checking memset(&i,0,sizeof(int)); i == (int)0 %s"), msg);
281   unsigned int ui = 314;
282   memset(&ui, 0, sizeof(unsigned int));
283   if (ui != 0) error(_("Checking memset(&ui, 0, sizeof(unsigned int)); ui == (unsigned int)0 %s"), msg);
284   double d = 3.14;
285   memset(&d, 0, sizeof(double));
286   if (d != 0.0) error(_("Checking memset(&d, 0, sizeof(double)); d == (double)0.0 %s"), msg);
287   long double ld = 3.14;
288   memset(&ld, 0, sizeof(long double));
289   if (ld != 0.0) error(_("Checking memset(&ld, 0, sizeof(long double)); ld == (long double)0.0 %s"), msg);
290 
291   // Check unsigned cast used in fread.c. This isn't overflow/underflow, just cast.
292   if ((uint_fast8_t)('0'-'/') != 1) error(_("The ascii character '/' is not just before '0'"));
293   if ((uint_fast8_t)('/'-'0') < 10) error(_("The C expression (uint_fast8_t)('/'-'0')<10 is true. Should be false."));
294   if ((uint_fast8_t)(':'-'9') != 1) error(_("The ascii character ':' is not just after '9'"));
295   if ((uint_fast8_t)('9'-':') < 10) error(_("The C expression (uint_fast8_t)('9'-':')<10 is true. Should be false."));
296 
297   // Variables rather than #define for NA_INT64 to ensure correct usage; i.e. not casted
298   NA_INT64_LL = LLONG_MIN;
299   NA_INT64_D = LLtoD(NA_INT64_LL);
300   if (NA_INT64_LL != DtoLL(NA_INT64_D)) error(_("Conversion of NA_INT64 via double failed %"PRId64"!=%"PRId64), (int64_t)NA_INT64_LL, (int64_t)DtoLL(NA_INT64_D));
301   // LLONG_MIN when punned to double is the sign bit set and then all zeros in exponent and significand i.e. -0.0
302   //   That's why we must never test for NA_INT64_D using == in double type. Must always DtoLL and compare long long types.
303   //   Assigning NA_INT64_D to a REAL is ok however.
304   if (NA_INT64_D != 0.0)  error(_("NA_INT64_D (negative -0.0) is not == 0.0."));
305   if (NA_INT64_D != -0.0) error(_("NA_INT64_D (negative -0.0) is not ==-0.0."));
306   if (ISNAN(NA_INT64_D)) error(_("ISNAN(NA_INT64_D) is TRUE but should not be"));
307   if (isnan(NA_INT64_D)) error(_("isnan(NA_INT64_D) is TRUE but should not be"));
308 
309   NA_CPLX.r = NA_REAL;  // NA_REAL is defined as R_NaReal which is not a strict constant and thus initializer {NA_REAL, NA_REAL} can't be used in .h
310   NA_CPLX.i = NA_REAL;  // https://github.com/Rdatatable/data.table/pull/3689/files#r304117234
311 
312   setNumericRounding(PROTECT(ScalarInteger(0))); // #1642, #1728, #1463, #485
313   UNPROTECT(1);
314 
315   // create needed strings in advance for speed, same techique as R_*Symbol
316   // Following R-exts 5.9.4; paragraph and example starting "Using install ..."
317   // either use PRINTNAME(install()) or R_PreserveObject(mkChar()) here.
318   char_integer64 = PRINTNAME(install("integer64"));
319   char_ITime =     PRINTNAME(install("ITime"));
320   char_IDate =     PRINTNAME(install("IDate"));
321   char_Date =      PRINTNAME(install("Date"));   // used for IDate too since IDate inherits from Date
322   char_POSIXct =   PRINTNAME(install("POSIXct"));
323   char_POSIXt =    PRINTNAME(install("POSIXt"));
324   char_UTC =       PRINTNAME(install("UTC"));
325   char_nanotime =  PRINTNAME(install("nanotime"));
326   char_starts =    PRINTNAME(sym_starts = install("starts"));
327   char_lens =      PRINTNAME(install("lens"));
328   char_indices =   PRINTNAME(install("indices"));
329   char_allLen1 =   PRINTNAME(install("allLen1"));
330   char_allGrp1 =   PRINTNAME(install("allGrp1"));
331   char_factor =    PRINTNAME(install("factor"));
332   char_ordered =   PRINTNAME(install("ordered"));
333   char_datatable = PRINTNAME(install("data.table"));
334   char_dataframe = PRINTNAME(install("data.frame"));
335   char_NULL =      PRINTNAME(install("NULL"));
336 
337   if (TYPEOF(char_integer64) != CHARSXP) {
338     // checking one is enough in case of any R-devel changes
339     error(_("PRINTNAME(install(\"integer64\")) has returned %s not %s"), type2char(TYPEOF(char_integer64)), type2char(CHARSXP));  // # nocov
340   }
341 
342   // create commonly used symbols, same as R_*Symbol but internal to DT
343   // Not really for speed but to avoid leak in situations like setAttrib(DT, install(), allocVector()) where
344   // the allocVector() can happen first and then the install() could gc and free it before it is protected
345   // within setAttrib. Thanks to Bill Dunlap finding and reporting. Using these symbols instead of install()
346   // avoids the gc without needing an extra PROTECT and immediate UNPROTECT after the setAttrib which would
347   // look odd (and devs in future might be tempted to remove them). Avoiding passing install() to API calls
348   // keeps the code neat and readable. Also see grep's added to CRAN_Release.cmd to find such calls.
349   sym_sorted  = install("sorted");
350   sym_index   = install("index");
351   sym_BY      = install(".BY");
352   sym_maxgrpn = install("maxgrpn");
353   sym_colClassesAs = install("colClassesAs");
354   sym_verbose = install("datatable.verbose");
355   SelfRefSymbol = install(".internal.selfref");
356   sym_inherits = install("inherits");
357   sym_datatable_locked = install(".data.table.locked");
358   sym_tzone = install("tzone");
359   sym_old_fread_datetime_character = install("datatable.old.fread.datetime.character");
360 
361   initDTthreads();
362   avoid_openmp_hang_within_fork();
363 }
364 
DtoLL(double x)365 inline long long DtoLL(double x) {
366   // Type punning such as
367   //     *(long long *)&REAL(column)[i]
368   // is undefined by C standards. This may have been the cause of 1.10.2 failing on 31 Jan 2017
369   // under clang 3.9.1 -O3 and solaris-sparc but not solaris-x86 or gcc.
370   // There is now a grep in CRAN_Release.cmd; use this union method instead.
371   // int64_t may help rather than 'long long' (TODO: replace all long long with int64_t)
372   // The two types must be the same size. That is checked in R_init_datatable (above)
373   // where sizeof(int64_t)==sizeof(double)==8 is checked.
374   // Endianness should not matter because whether big or little, endianness is the same
375   // inside this process, and the two types are the same size.
376   union {double d; int64_t i64;} u;  // not static, inline instead
377   u.d = x;
378   return (long long)u.i64;
379 }
380 
LLtoD(long long x)381 inline double LLtoD(long long x) {
382   union {double d; int64_t i64;} u;
383   u.i64 = (int64_t)x;
384   return u.d;
385 }
386 
GetVerbose()387 bool GetVerbose() {
388   // don't call repetitively; save first in that case
389   SEXP opt = GetOption(sym_verbose, R_NilValue);
390   return isLogical(opt) && LENGTH(opt)==1 && LOGICAL(opt)[0]==1;
391 }
392 
393 // # nocov start
hasOpenMP()394 SEXP hasOpenMP() {
395   // Just for use by onAttach (hence nocov) to avoid an RPRINTF from C level which isn't suppressable by CRAN
396   // There is now a 'grep' in CRAN_Release.cmd to detect any use of RPRINTF in init.c, which is
397   // why RPRINTF is capitalized in this comment to avoid that grep.
398   // .Platform or .Machine in R itself does not contain whether OpenMP is available because compiler and flags are per-package.
399   #ifdef _OPENMP
400   return ScalarInteger(_OPENMP); // return the version; e.g. 201511 (i.e. 4.5)
401   #else
402   return ScalarInteger(0);       // 0 rather than NA so that if() can be used on the result
403   #endif
404 }
405 // # nocov end
406 
407 extern int *_Last_updated;  // assign.c
408 
initLastUpdated(SEXP var)409 SEXP initLastUpdated(SEXP var) {
410   if (!isInteger(var) || LENGTH(var)!=1) error(_(".Last.value in namespace is not a length 1 integer"));
411   _Last_updated = INTEGER(var);
412   return R_NilValue;
413 }
414 
dllVersion()415 SEXP dllVersion() {
416   // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056
417   return(ScalarString(mkChar("1.14.2")));
418 }
419 
420