#include "data.table.h"
/*
  Inspired by :
    icount in do_radixsort in src/main/sort.c @ rev 51389.
    base::sort.list(method="radix") turns out not to be a radix sort, but a counting sort :
      http://r.789695.n4.nabble.com/method-radix-in-sort-list-isn-t-actually-a-radix-sort-tp3309470p3309470.html
  Wish granted for R to allow negatives (simple change) : https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=15644
  Terdiman and Herf (LSB radix for floating point and other efficiency tricks) :
    http://codercorner.com/RadixSortRevisited.htm
    http://stereopsis.com/radix.html

  Previous version of this file was promoted into base R, see ?base::sort.
  Denmark useR! presentation <link>
  Stanford DSC presentation <link>
  JSM presentation <link>
  Techniques used :
    skewed groups are split in parallel
    finds the unique bytes present to save sweeping all 256 counts
    skips already-grouped yet unsorted data
    recursive group gathering for cache efficiency
    reuses R's global character cache (truelength on CHARSXP)
    a compressed column can cross byte boundaries to use spare bits (e.g. 2 16-level columns in one byte)
    just the remaining part of the key is reordered as the radix progresses
    columnar byte-key for within-radix MT cache efficiency

  Only the forder() and dtwiddle() functions are meant for use by other C code in data.table; hence all other functions here are static.
  The coding techniques deployed here are for efficiency. The static functions are recursive or called repetitively and we wish to minimise
  overhead. They reach outside themselves to place results directly in the end result rather than returning many small pieces of memory.
*/

// #define TIMING_ON

static int nth = 1;           // number of threads to use, throttled by default; used by cleanup() to ensure no mismatch in getDTthreads() calls
static bool retgrp = true;    // return group sizes as well as the ordering vector? If so then use gs, gs_alloc and gs_n :
static int nrow = 0;          // used as the group size stack allocation limit (when all groups are 1 row)
static int *gs = NULL;        // gs = final group sizes, e.g. 23,12,87,2,1,34,...
static int gs_alloc = 0;      // allocated size of gs
static int gs_n = 0;          // the number of groups found so far (how much of the allocated gs is used)
static int **gs_thread=NULL;  // each thread has a private buffer which gets flushed to the final gs appropriately
static int *gs_thread_alloc=NULL;
static int *gs_thread_n=NULL;
static int *TMP=NULL;         // UINT16_MAX*sizeof(int) for each thread; used by counting sort in radix_r()
static uint8_t *UGRP=NULL;    // 256 bytes for each thread; used by counting sort in radix_r() when sortType==0 (byte appearance order)

static int  *cradix_counts = NULL;
static SEXP *cradix_xtmp = NULL;
static SEXP *ustr = NULL;
static int ustr_alloc = 0;
static int ustr_n = 0;
static int ustr_maxlen = 0;
static int sortType = 0;      // 0 just group; -1 descending, +1 ascending
static int nalast = 0;        // 1 (true i.e. last), 0 (false i.e. first), -1 (na i.e. remove)
static int nradix = 0;
static uint8_t **key = NULL;
static int *anso = NULL;
static bool notFirst=false;

static char msg[1001];
#define STOP(...) do {snprintf(msg, 1000, __VA_ARGS__); cleanup(); error(msg);} while(0)  // http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html#Swallowing-the-Semicolon
// use STOP in this file (not error()) to ensure cleanup() is called first
// snprintf to msg first in case nrow (just as an example) is provided in the message, because cleanup() sets nrow to 0
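// Illustration of why STOP is wrapped in do{...}while(0) (a non-compiled sketch; proceed() is a hypothetical callee):
#if 0
if (n < 0) STOP(_("n must be non-negative but is %d"), n);  // expands to a single statement, so the trailing ';' is consumed cleanly
else proceed(n);                                            // without do{...}while(0), this 'else' would be orphaned and fail to compile
#endif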
#undef warning
#define warning(...) Do not use warning in this file  // since it can be turned to error via warn=2
/* Using OS realloc() in this file to benefit from (often) in-place realloc() to save copy
 * We have to trap on exit anyway to call savetl_end().
 * NB: R_alloc() would be more convenient (it errors internally on failure) and robust (auto free) but there is no R_realloc(). Implementing R_realloc() would be an alloc and copy, iiuc.
 * Calloc/Realloc needs to be Free'd, even before error() [R-exts$6.1.2]. An oom within Calloc causes a previous Calloc to leak, so Calloc would still need to be trapped anyway.
 * Therefore, using the <<if (!malloc()) STOP(_("helpful context msg"))>> approach to cleanup() on error.
 */

static void free_ustr() {
  for(int i=0; i<ustr_n; i++)
    SET_TRUELENGTH(ustr[i],0);
  free(ustr); ustr=NULL;
  ustr_alloc=0; ustr_n=0; ustr_maxlen=0;
}

static void cleanup() {
  free(gs); gs=NULL;
  gs_alloc = 0;
  gs_n = 0;

  if (gs_thread!=NULL) for (int i=0; i<nth; i++) free(gs_thread[i]);
  free(gs_thread);       gs_thread=NULL;
  free(gs_thread_alloc); gs_thread_alloc=NULL;
  free(gs_thread_n);     gs_thread_n=NULL;

  free(TMP); TMP=NULL;
  free(UGRP); UGRP=NULL;

  nrow = 0;
  free(cradix_counts); cradix_counts=NULL;
  free(cradix_xtmp);   cradix_xtmp=NULL;
  free_ustr();
  if (key!=NULL) { int i=0; while (key[i]!=NULL) free(key[i++]); }  // ==nradix, other than rare cases e.g. tests 1844.5-6 (#3940), and if a calloc fails
  free(key); key=NULL; nradix=0;
  savetl_end();  // Restore R's own usage of tl. Must run after the for loop in free_ustr() since only CHARSXP which had tl>0 (R's usage) are stored there.
}

static void push(const int *x, const int n) {
  if (!retgrp) return;  // clearer to have the switch here rather than before each call
  int me = omp_get_thread_num();
  int newn = gs_thread_n[me] + n;
  if (gs_thread_alloc[me] < newn) {
    gs_thread_alloc[me] = (newn < nrow/3) ? (1+(newn*2)/4096)*4096 : nrow;  // grow in 4096 multiples; newn*2 can't overflow since newn < nrow/3, and /3 not /2 to avoid allocating close to nrow (nrow groups occur when all groups are size 1)
    gs_thread[me] = realloc(gs_thread[me], gs_thread_alloc[me]*sizeof(int));
    if (gs_thread[me]==NULL) STOP(_("Failed to realloc thread private group size buffer to %d*4bytes"), (int)gs_thread_alloc[me]);
  }
  memcpy(gs_thread[me]+gs_thread_n[me], x, n*sizeof(int));
  gs_thread_n[me] += n;
}

static void flush() {
  if (!retgrp) return;
  int me = omp_get_thread_num();
  int n = gs_thread_n[me];
  int newn = gs_n + n;
  if (gs_alloc < newn) {
    gs_alloc = (newn < nrow/3) ? (1+(newn*2)/4096)*4096 : nrow;
    gs = realloc(gs, gs_alloc*sizeof(int));
    if (gs==NULL) STOP(_("Failed to realloc group size result to %d*4bytes"), (int)gs_alloc);
  }
  memcpy(gs+gs_n, gs_thread[me], n*sizeof(int));
  gs_n += n;
  gs_thread_n[me] = 0;
}
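
// A sketch of how push() and flush() cooperate (illustrative values; not compiled). Each thread
// accumulates the sizes of the groups it resolves into its private gs_thread[me] buffer without
// locking; flush() then appends that buffer to the shared gs. In the code below, flush() is only
// called where group order is guaranteed (e.g. the ordered loop over groups near the end of radix_r).
#if 0
int sizes[3] = {23, 12, 87};  // hypothetical group sizes found by this thread
push(sizes, 3);               // buffered thread-privately; cheap and contention-free
flush();                      // append this thread's buffer to the final gs, in group order
#endif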

#ifdef TIMING_ON
  #define NBLOCK 64
  #define MAX_NTH 256
  static double tblock[MAX_NTH*NBLOCK];
  static int nblock[MAX_NTH*NBLOCK];
  static uint64_t stat[257];
  #define TBEG() double tstart = wallclock();  // tstart declared locally for thread safety
  #define TEND(i) {                          \
    double now = wallclock();                \
    int w = omp_get_thread_num()*NBLOCK + i; \
    tblock[w] += now-tstart;                 \
    nblock[w]++;                             \
    tstart = now;                            \
  }
#else
  #define TBEG()
  #define TEND(i)
#endif

// range_* functions return [min,max] of the non-NAs as common uint64_t type
// TODO: parallelize these; not a priority according to TIMING_ON though (contiguous read with prefetch)
static void range_i32(const int32_t *x, const int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
{
  int32_t min = NA_INTEGER;
  int32_t max = NA_INTEGER;
  int i=0;
  while(i<n && x[i]==NA_INTEGER) i++;
  int na_count = i;
  if (i<n) max = min = x[i++];
  for(; i<n; i++) {
    int tmp = x[i];
    if (tmp>max) max=tmp;
    else if (tmp<min) {
      if (tmp==NA_INTEGER) na_count++;
      else min=tmp;
    }
  }
  *out_na_count = na_count;
  *out_min = min ^ 0x80000000u;  // map [-2147483648(INT32_MIN), 2147483647(INT32_MAX)] => [0, 4294967295(UINT32_MAX)]
  *out_max = max ^ 0x80000000u;
}
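
// Worked example of the sign-bit flip above (values illustrative):
//   INT32_MIN (-2147483648) is 0x80000000; ^ 0x80000000 => 0x00000000 (smallest unsigned)
//   -1                      is 0xFFFFFFFF; ^ 0x80000000 => 0x7FFFFFFF
//    0                      is 0x00000000; ^ 0x80000000 => 0x80000000
//   INT32_MAX ( 2147483647) is 0x7FFFFFFF; ^ 0x80000000 => 0xFFFFFFFF (largest unsigned)
// so signed order is preserved when the results are compared as unsigned.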

static void range_i64(int64_t *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
{
  int64_t min = INT64_MIN;
  int64_t max = INT64_MIN;
  int i=0;
  while(i<n && x[i]==INT64_MIN) i++;
  int na_count = i;
  if (i<n) max = min = x[i++];
  for(; i<n; i++) {
    int64_t tmp = x[i];
    if (tmp>max) max=tmp;
    else if (tmp<min) {
      if (tmp==INT64_MIN) na_count++;
      else min=tmp;
    }
  }
  *out_na_count = na_count;
  // map [INT64_MIN, INT64_MAX] => [0, UINT64_MAX]
  *out_min = min ^ 0x8000000000000000u;
  *out_max = max ^ 0x8000000000000000u;
}

static void range_d(double *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, int *out_infnan_count)
// return range of finite numbers (excluding NA, NaN, -Inf, +Inf), a count of NA and a count of Inf|-Inf|NaN
{
  uint64_t min=0, max=0;
  int na_count=0, infnan_count=0;
  int i=0;
  while(i<n && !R_FINITE(x[i])) { ISNA(x[i++]) ? na_count++ : infnan_count++; }
  if (i<n) { max = min = dtwiddle(x[i++]); }
  for(; i<n; i++) {
    if (!R_FINITE(x[i])) { ISNA(x[i]) ? na_count++ : infnan_count++; continue; }
    uint64_t tmp = dtwiddle(x[i]);
    if (tmp>max) max=tmp;
    else if (tmp<min) min=tmp;
  }
  *out_na_count = na_count;
  *out_infnan_count = infnan_count;
  *out_min = min;
  *out_max = max;
}

// used by bmerge only; chmatch uses coerceUtf8IfNeeded
int StrCmp(SEXP x, SEXP y)
{
  if (x == y) return 0;             // same cached pointer (including NA_STRING==NA_STRING)
  if (x == NA_STRING) return -1;    // x<y
  if (y == NA_STRING) return 1;     // x>y
  return strcmp(CHAR(x), CHAR(y));  // bmerge calls ENC2UTF8 on x and y before passing here
}

static void cradix_r(SEXP *xsub, int n, int radix)
// xsub is a unique set of CHARSXP, to be ordered by reference
// First time, radix==0, and xsub==x. Then recursively moves SEXP together for cache efficiency.
// Quite different to iradix because
//   1) x is known to be unique so fits in cache (wide random access not an issue)
//   2) they're variable length character strings
//   3) no need to maintain o. Just simply reorder x. No grps or push.
{
  if (n<=1) return;
  int *thiscounts = cradix_counts + radix*256;
  uint8_t lastx = 0;  // the last x is used to test its bin
  for (int i=0; i<n; i++) {
    lastx = radix<LENGTH(xsub[i]) ? (uint8_t)(CHAR(xsub[i])[radix]) : 1;  // no NA_STRING present; 1 for "" (could use 0 too maybe since NA_STRING not present)
    thiscounts[ lastx ]++;
  }
  if (thiscounts[lastx]==n && radix<ustr_maxlen-1) {
    cradix_r(xsub, n, radix+1);
    thiscounts[lastx] = 0;  // all x have the same value; the rest must be 0 already, save the memset
    return;
  }
  int itmp = thiscounts[0];
  for (int i=1; i<256; i++) {
    if (thiscounts[i]) thiscounts[i] = (itmp += thiscounts[i]);  // don't cumulate through 0s, important below
  }
  for (int i=n-1; i>=0; i--) {
    uint8_t thisx = radix<LENGTH(xsub[i]) ? (uint8_t)(CHAR(xsub[i])[radix]) : 1;
    cradix_xtmp[--thiscounts[thisx]] = xsub[i];
  }
  memcpy(xsub, cradix_xtmp, n*sizeof(SEXP));
  if (radix == ustr_maxlen-1) {
    memset(thiscounts, 0, 256*sizeof(int));
    return;
  }
  if (thiscounts[0] != 0) STOP(_("Logical error. counts[0]=%d in cradix but should have been decremented to 0. radix=%d"), thiscounts[0], radix);
  itmp = 0;
  for (int i=1; i<256; i++) {
    if (thiscounts[i] == 0) continue;
    int thisgrpn = thiscounts[i] - itmp;  // undo cumulate; i.e. diff
    cradix_r(xsub+itmp, thisgrpn, radix+1);
    itmp = thiscounts[i];
    thiscounts[i] = 0;  // set to 0 now since we're here, saves a memset afterwards. Important to do this ready for this memory's reuse
  }
  if (itmp<n-1) cradix_r(xsub+itmp, n-itmp, radix+1);  // final group
}
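
// Worked example of one cradix_r pass (illustrative): xsub = {"ba","ab","aa"}, radix==0.
// counts: 'a'->2, 'b'->1; cumulated through non-zeros: 'a'->2, 'b'->3. Scattering from the
// right gives {"ab","aa","ba"} (stable within each bucket); the recursion on the 'a' bucket
// at radix==1 then yields the final order {"aa","ab","ba"}.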

static void cradix(SEXP *x, int n)
{
  cradix_counts = (int *)calloc(ustr_maxlen*256, sizeof(int));  // counts for the letters of left-aligned strings
  if (!cradix_counts) STOP(_("Failed to alloc cradix_counts"));
  cradix_xtmp = (SEXP *)malloc(ustr_n*sizeof(SEXP));
  if (!cradix_xtmp) STOP(_("Failed to alloc cradix_xtmp"));
  cradix_r(x, n, 0);
  free(cradix_counts); cradix_counts=NULL;
  free(cradix_xtmp); cradix_xtmp=NULL;
}

static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
// group numbers are left in truelength to be fetched by WRITE_KEY
{
  int na_count=0;
  bool anyneedutf8=false;
  if (ustr_n!=0) STOP(_("Internal error: ustr isn't empty when starting range_str: ustr_n=%d, ustr_alloc=%d"), ustr_n, ustr_alloc);  // # nocov
  if (ustr_maxlen!=0) STOP(_("Internal error: ustr_maxlen isn't 0 when starting range_str"));  // # nocov
  // savetl_init() has already been called at the start of forder
  #pragma omp parallel for num_threads(getDTthreads(n, true))
  for(int i=0; i<n; i++) {
    SEXP s = x[i];
    if (s==NA_STRING) {
      #pragma omp atomic update
      na_count++;
      continue;
    }
    if (TRUELENGTH(s)<0) continue;  // seen this group before
    #pragma omp critical
    if (TRUELENGTH(s)>=0) {  // another thread may have set it while I was waiting, so check it again
      if (TRUELENGTH(s)>0)   // save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore
        savetl(s);           // afterwards. From R 2.14.0, tl is initialized to 0; prior to that it was random so this step saved too much.
      // now save the unique SEXP in ustr so i) we can loop through them afterwards and reset TRUELENGTH to 0, and ii) sort the uniques when sorting too
      if (ustr_alloc<=ustr_n) {
        ustr_alloc = (ustr_alloc==0) ? 16384 : ustr_alloc*2;  // small initial guess; negligible time to alloc 128KB (32 pages)
        if (ustr_alloc>n) ustr_alloc = n;  // clamp at n. Reaches n when fully unique (no dups)
        ustr = realloc(ustr, ustr_alloc * sizeof(SEXP));
        if (ustr==NULL) STOP(_("Unable to realloc %d * %d bytes in range_str"), ustr_alloc, (int)sizeof(SEXP));  // # nocov
      }
      ustr[ustr_n++] = s;
      SET_TRUELENGTH(s, -ustr_n);  // unique in any order is fine. first-appearance order is achieved later in count_group
      if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
      if (!anyneedutf8 && NEED2UTF8(s)) anyneedutf8=true;
    }
  }
  *out_na_count = na_count;
  if (ustr_n==0) {  // all na
    *out_min = 0;
    *out_max = 0;
    return;
  }
  if (anyneedutf8) {
    SEXP ustr2 = PROTECT(allocVector(STRSXP, ustr_n));
    for (int i=0; i<ustr_n; i++) SET_STRING_ELT(ustr2, i, ENC2UTF8(ustr[i]));
    SEXP *ustr3 = (SEXP *)malloc(ustr_n * sizeof(SEXP));
    if (!ustr3) STOP(_("Failed to alloc ustr3 when converting strings to UTF8"));  // # nocov
    memcpy(ustr3, STRING_PTR(ustr2), ustr_n*sizeof(SEXP));
    // need to reset ustr_maxlen because we need ustr_maxlen for the utf8 strings
    ustr_maxlen = 0;
    for (int i=0; i<ustr_n; i++) {
      SEXP s = ustr3[i];
      if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
      if (TRUELENGTH(s)>0) savetl(s);
    }
    cradix(ustr3, ustr_n);  // sort to detect possible duplicates after converting; e.g. two different non-utf8 map to the same utf8
    SET_TRUELENGTH(ustr3[0], -1);
    int o = -1;
    for (int i=1; i<ustr_n; i++) {
      if (ustr3[i] == ustr3[i-1]) continue;  // use the same o for duplicates
      SET_TRUELENGTH(ustr3[i], --o);
    }
    // now use the 1-1 mapping from ustr to ustr2 to get the ordering back into the original ustr, being careful to reset tl to 0
    int *tl = (int *)malloc(ustr_n * sizeof(int));
    if (!tl) STOP(_("Failed to alloc tl when converting strings to UTF8"));  // # nocov
    SEXP *tt = STRING_PTR(ustr2);
    for (int i=0; i<ustr_n; i++) tl[i] = TRUELENGTH(tt[i]);       // fetches the o in ustr3 into tl, which is ordered by ustr
    for (int i=0; i<ustr_n; i++) SET_TRUELENGTH(ustr3[i], 0);     // reset to 0 the tl of the UTF8 (and possibly of the non-UTF8 in ustr too)
    for (int i=0; i<ustr_n; i++) SET_TRUELENGTH(ustr[i], tl[i]);  // put back the o into ustr's tl
    free(tl);
    free(ustr3);
    UNPROTECT(1);
    *out_min = 1;
    *out_max = -o;  // could be less than ustr_n if there are duplicates in the utf8s
  } else {
    *out_min = 1;
    *out_max = ustr_n;
    if (sortType) {
      // note that this is always ascending; descending is done in WRITE_KEY using max-this
      cradix(ustr, ustr_n);  // sorts ustr in-place by reference. assumes NA_STRING not present.
      for(int i=0; i<ustr_n; i++)  // save the ordering in the CHARSXP. negative so as to distinguish from R's own usage.
        SET_TRUELENGTH(ustr[i], -i-1);
    }
    // else group appearance order was already saved to tl in the first pass
  }
}

static int dround=0;  // No rounding by default, for now. Handles #1642, #1728, #1463, #485
static uint64_t dmask=0;

SEXP setNumericRounding(SEXP droundArg)
// init.c has initial call with default of 2
{
  if (!isInteger(droundArg) || LENGTH(droundArg)!=1) error(_("Must be an integer or numeric vector of length 1"));
  if (INTEGER(droundArg)[0] < 0 || INTEGER(droundArg)[0] > 2) error(_("Must be 2, 1 or 0"));
  dround = INTEGER(droundArg)[0];
  dmask = dround ? 1 << (8*dround-1) : 0;
  return R_NilValue;
}

SEXP getNumericRounding()
{
  return ScalarInteger(dround);
}

int getNumericRounding_C()
// for use in uniqlist.c
{
  return dround;
}

// for signed integers it's easy: flip the sign bit to swap positives and negatives; the resulting unsigned is in the right order with INT_MIN ending up as 0
// for finite floating point you have to flip the other bits too if it was negative: http://stereopsis.com/radix.html
uint64_t dtwiddle(double x)
{
  union {
    double d;
    uint64_t u64;
  } u;  // local for thread safety
  u.d = x;
  if (R_FINITE(u.d)) {
    if (u.d==0) u.d=0;  // changes -0.0 to 0.0, issue #743
    u.u64 ^= (u.u64 & 0x8000000000000000) ? 0xffffffffffffffff : 0x8000000000000000;  // always flip the sign bit, and if negative (sign bit was set) flip the other bits too
    u.u64 += (u.u64 & dmask) << 1;  // when dround==1|2: if the top bit of the byte(s) about to be chopped is set, the <<1 adds 1<<(8*dround) to round up before chopping the last 1|2 bytes
    return u.u64 >> (dround*8);
  }
  if (ISNAN(u.d)) return ISNA(u.d) ? 0 /*NA*/ : 1 /*NaN*/;  // also normalises a difference between NA on 32bit R (bit 13 set) and 64bit R (bit 13 not set)
  if (isinf(u.d)) return signbit(u.d) ? 2 /*-Inf*/ : (0xffffffffffffffff>>(dround*8)) /*+Inf*/;
  STOP(_("Unknown non-finite value; not NA, NaN, -Inf or +Inf"));  // # nocov
}
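
// Worked example of dtwiddle (illustrative), with dround==0 so dmask==0 :
//    1.0 is 0x3FF0000000000000; sign bit clear, so only the sign bit flips => 0xBFF0000000000000
//   -1.0 is 0xBFF0000000000000; sign bit set, so all bits flip             => 0x400FFFFFFFFFFFFF
// and -1.0 < 1.0 is preserved since 0x400F... < 0xBFF0... as unsigned.
// With dround==2, dmask==1<<15: if bit 15 is set after the flip, the += adds bit 16,
// so the value rounds up before >>16 chops the last two bytes.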

void radix_r(const int from, const int to, const int radix);

SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg)
// sortGroups TRUE from setkey and regular forder, FALSE from by= for efficiency so strings don't have to be sorted and can be left in appearance order
// when sortGroups is TRUE, ascArg contains +1/-1 for ascending/descending of each by column; when FALSE, ascArg is ignored
{

#ifdef TIMING_ON
  memset(tblock, 0, MAX_NTH*NBLOCK*sizeof(double));
  memset(nblock, 0, MAX_NTH*NBLOCK*sizeof(int));
  memset(stat,   0, 257*sizeof(uint64_t));
  TBEG()
#endif

  int n_protect = 0;
  const bool verbose = GetVerbose();

  if (!isNewList(DT)) {
    if (!isVectorAtomic(DT))
      STOP(_("Internal error: input is not either a list of columns, or an atomic vector."));  // # nocov; caught by colnamesInt at R level, test 1962.0472
    if (!isNull(by))
      STOP(_("Internal error: input is an atomic vector (not a list of columns) but by= is not NULL"));  // # nocov; caught at R level, test 1962.043
    if (!isInteger(ascArg) || LENGTH(ascArg)!=1)
      STOP(_("Input is an atomic vector (not a list of columns) but order= is not a length 1 integer"));
    if (verbose)
      Rprintf(_("forder.c received a vector type '%s' length %d\n"), type2char(TYPEOF(DT)), length(DT));
    SEXP tt = PROTECT(allocVector(VECSXP, 1)); n_protect++;
    SET_VECTOR_ELT(tt, 0, DT);
    DT = tt;
    by = PROTECT(allocVector(INTSXP, 1)); n_protect++;
    INTEGER(by)[0] = 1;
  } else {
    if (verbose)
      Rprintf(_("forder.c received %d rows and %d columns\n"), length(VECTOR_ELT(DT,0)), length(DT));
  }
  if (!length(DT))
    STOP(_("Internal error: DT is an empty list() of 0 columns"));  // # nocov; should have been caught by colnamesInt, test 2099.1
  if (!isInteger(by) || !LENGTH(by))
    STOP(_("Internal error: DT has %d columns but 'by' is either not integer or is length 0"), length(DT));  // # nocov; colnamesInt catches, 2099.2
  if (!isInteger(ascArg) || LENGTH(ascArg)!=LENGTH(by))
    STOP(_("Either order= is not integer or its length (%d) is different to by='s length (%d)"), LENGTH(ascArg), LENGTH(by));
  nrow = length(VECTOR_ELT(DT,0));
  int n_cplx = 0;
  for (int i=0; i<LENGTH(by); i++) {
    int by_i = INTEGER(by)[i];
    if (by_i < 1 || by_i > length(DT))
      STOP(_("internal error: 'by' value %d out of range [1,%d]"), by_i, length(DT));  // # nocov; R forderv already catches this using C colnamesInt
    if ( nrow != length(VECTOR_ELT(DT, by_i-1)) )
      STOP(_("Column %d is length %d which differs from length of column 1 (%d)\n"), INTEGER(by)[i], length(VECTOR_ELT(DT, INTEGER(by)[i]-1)), nrow);
    if (TYPEOF(VECTOR_ELT(DT, by_i-1)) == CPLXSXP) n_cplx++;
  }
  if (!isLogical(retGrpArg) || LENGTH(retGrpArg)!=1 || INTEGER(retGrpArg)[0]==NA_LOGICAL)
    STOP(_("retGrp must be TRUE or FALSE"));
  retgrp = LOGICAL(retGrpArg)[0]==TRUE;
  if (!isLogical(sortGroupsArg) || LENGTH(sortGroupsArg)!=1 || INTEGER(sortGroupsArg)[0]==NA_LOGICAL )
    STOP(_("sort must be TRUE or FALSE"));
  sortType = LOGICAL(sortGroupsArg)[0]==TRUE;  // if sortType is 1, it is later flipped between +1/-1 according to ascArg. Otherwise ascArg is ignored when sortType==0
  if (!retgrp && !sortType)
    STOP(_("At least one of retGrp= or sort= must be TRUE"));
  if (!isLogical(naArg) || LENGTH(naArg) != 1)
    STOP(_("na.last must be logical TRUE, FALSE or NA of length 1"));
  nalast = (LOGICAL(naArg)[0] == NA_LOGICAL) ? -1 : LOGICAL(naArg)[0];  // 1=na last, 0=na first (default), -1=remove na

  if (nrow==0) {
    // empty vector or 0-row DT is always sorted
    SEXP ans = PROTECT(allocVector(INTSXP, 0)); n_protect++;
    if (retgrp) {
      setAttrib(ans, sym_starts, allocVector(INTSXP, 0));
      setAttrib(ans, sym_maxgrpn, ScalarInteger(0));
    }
    UNPROTECT(n_protect);
    return ans;
  }
  // if n==1, the code is left to proceed below in case one or more of the 1-row by= columns are NA and na.last=NA. Otherwise it would be easy to return now.
  notFirst = false;

  SEXP ans = PROTECT(allocVector(INTSXP, nrow)); n_protect++;
  anso = INTEGER(ans);
  TEND(0)
  #pragma omp parallel for num_threads(getDTthreads(nrow, true))
  for (int i=0; i<nrow; i++) anso[i]=i+1;  // gdb 8.1.0.20180409-git very slow here, oddly
  TEND(1)
  savetl_init();  // from now on use Error not error

  int ncol=length(by);
  int keyAlloc = (ncol+n_cplx)*8 + 1;  // +1 for NULL to mark the end; calloc to initialize with NULLs
  key = calloc(keyAlloc, sizeof(uint8_t *));  // needs to be before the loop because part II relies on part I, column-by-column
  if (!key)
    STOP(_("Unable to allocate %"PRIu64" bytes of working memory"), (uint64_t)keyAlloc*sizeof(uint8_t *));  // # nocov
  nradix=0;     // the current byte we're writing this column to; might be squashing into it (spare>0)
  int spare=0;  // the number of bits remaining on the right of the current nradix byte
  bool isReal=false;
  bool complexRerun = false;  // see comments below in the CPLXSXP case
  SEXP CplxPart = R_NilValue;
  if (n_cplx) { CplxPart=PROTECT(allocVector(REALSXP, nrow)); n_protect++; }  // one alloc is reused for each part
  TEND(2);
  for (int col=0; col<ncol; col++) {
    // Rprintf(_("Finding range of column %d ...\n"), col);
    SEXP x = VECTOR_ELT(DT,INTEGER(by)[col]-1);
    uint64_t min=0, max=0;  // min and max of the non-NA finite values
    int na_count=0, infnan_count=0;
    if (sortType) {
      sortType=INTEGER(ascArg)[col];  // if sortType!=0 (not first-appearance) then +1/-1 comes from ascArg
      if (sortType!=1 && sortType!=-1)
        STOP(_("Item %d of order (ascending/descending) is %d. Must be +1 or -1."), col+1, sortType);
    }
    // Rprintf(_("sortType = %d\n"), sortType);
    switch(TYPEOF(x)) {
    case INTSXP : case LGLSXP :  // TODO: skip LGL and assume range [0,1]
      range_i32(INTEGER(x), nrow, &min, &max, &na_count);
      break;
    case CPLXSXP : {
      // treat as if two separate columns of double
      const Rcomplex *xd = COMPLEX(x);
      double *tmp = REAL(CplxPart);
      if (!complexRerun) {
        for (int i=0; i<nrow; ++i) tmp[i] = xd[i].r;  // extract the real part on the first pass
        complexRerun = true;
        col--;  // cause this loop iteration to rerun; decrement now in case of early continue below
      } else {
        for (int i=0; i<nrow; ++i) tmp[i] = xd[i].i;  // extract the imaginary part on the rerun
        complexRerun = false;
      }
      x = CplxPart;
    }  // !no break! so as to fall through to the REAL case
    case REALSXP :
      if (INHERITS(x, char_integer64)) {
        range_i64((int64_t *)REAL(x), nrow, &min, &max, &na_count);
      } else {
        if (verbose && INHERITS(x, char_Date) && INTEGER(isReallyReal(x))[0]==0) {
          Rprintf(_("\n*** Column %d passed to forder is a date stored as an 8 byte double but no fractions are present. Please consider a 4 byte integer date such as IDate to save space and time.\n"), col+1);
          // Note the (slightly expensive) isReallyReal will only run when verbose is true. The '***' prefix is just to make it stand out in verbose output.
          // In future this could be upgraded to an optional warning. But I figured that's what we use verbose for (to trace problems and look for efficiencies).
          // If an automatic coerce is desired (see discussion in #1738) then this is the point to do that in this file. Move the INTSXP case above to be
          // next, do the coerce of Date to integer now to a tmp, and then let this case fall through to INTSXP in the same way as CPLXSXP falls through to REALSXP.
        }
        range_d(REAL(x), nrow, &min, &max, &na_count, &infnan_count);
        if (min==0 && na_count<nrow) { min=3; max=4; }  // column contains no finite numbers and is not all-NA; create dummies to yield positive min-2 later
        isReal = true;
      }
      break;
    case STRSXP :
      // need2utf8 now happens inside range_str on the uniques
      range_str(STRING_PTR(x), nrow, &min, &max, &na_count);
      break;
    default:
      STOP(_("Column %d passed to [f]order is type '%s', not yet supported."), col+1, type2char(TYPEOF(x)));
    }
    TEND(3);
    if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) {
      // all same value; skip column as nothing to do. [min,max] is of finite values only (excludes +Inf, -Inf, NaN and NA)
      if (na_count==nrow && nalast==-1) { for (int i=0; i<nrow; i++) anso[i]=0; }
      if (TYPEOF(x)==STRSXP) free_ustr();
      continue;
    }

    uint64_t range = max-min+1 +1/*NA*/ +isReal*3/*NaN, -Inf, +Inf*/;
    // Rprintf(_("range=%"PRIu64"  min=%"PRIu64"  max=%"PRIu64"  na_count==%d\n"), range, min, max, na_count);

    int maxBit=0;
    while (range) { maxBit++; range>>=1; }
    int nbyte = 1+(maxBit-1)/8;  // the number of bytes spanned by the value
    int firstBits = maxBit - (nbyte-1)*8;  // how many bits are used in the most significant byte
    if (spare==0) {
      spare = 8-firstBits;  // left align to byte boundary to get a better first split
    } else {
      if (sortType==0) {
        // can't squash into the previous spare as that would change first-appearance order within that radix-byte
        // found thanks to test 1246.55 with seed 1540402216L, and added new direct test 1954
        spare = 8-firstBits;
        nradix++;
      }
      else if (spare >= firstBits) {
        // easiest case: just left shift a few bits within these nbyte to fill the spare gap
        spare -= firstBits;  // the new spare is also how many bits to now left shift
      }
      else {
        if (nbyte<8) {
          spare += 8-firstBits;
          nbyte++;  // after the shift, will need an extra byte
        } else {
          // range takes 8 bytes so can't shift into a 9th to use the spare of the last byte of the previous column; start with a new left-aligned byte
          nradix++;
          spare = 8-firstBits;
        }
      }
    }
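
    // Worked example of the byte/bit accounting above (illustrative): suppose range==1000
    // (say min=1, max=998, plus 1 for the NA slot). Then maxBit==10, nbyte==2, firstBits==2.
    // Starting on a fresh byte (spare==0), the 10 bits are left-aligned and spare becomes
    // 8-2==6, so the next column may squash its most significant bits into those 6 spare bits.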

    for (int b=0; b<nbyte; b++) {
      if (key[nradix+b]==NULL) {
        uint8_t *tt = calloc(nrow, sizeof(uint8_t));  // 0 initialize so that NAs can just skip (NA is always the 0 offset)
        if (!tt)
          STOP(_("Unable to allocate %"PRIu64" bytes of working memory"), (uint64_t)nrow*sizeof(uint8_t));  // # nocov
        key[nradix+b] = tt;
      }
    }

    const bool asc = (sortType>=0);
    uint64_t min2=min, max2=max;
    if (nalast<1 && asc) {
      min2--;  // so that x-min results in 1 for the minimum; NA is coded by 0. Always provide for the NA spot even if NAs aren't present, for code brevity and robustness
      if (isReal) min2--;  // NaN first
    } else {
      max2++;  // NA is max+1, so max2-elem should result in 0
      if (isReal) max2++;  // NaN last
    }
    if (isReal) { min2--; max2++; }  // -Inf and +Inf in the min-1 and max+1 spots respectively

    const uint64_t naval  = ((nalast==1) == asc) ? max+1+isReal*2 : min-1-isReal*2;
    const uint64_t nanval = ((nalast==1) == asc) ? max+2 : min-2;  // only used when isReal
    // Rprintf(_("asc=%d  min2=%"PRIu64"  max2=%"PRIu64"  naval==%"PRIu64"  nanval==%"PRIu64"\n"), asc, min2, max2, naval, nanval);

    // several columns could squash into 1 byte. This bit squashing is why we deal with asc|desc here;
    // otherwise it could be done in the ugrp sorting by reversing the ugrp insert sort
    #define WRITE_KEY                              \
    elem = asc ? elem-min2 : max2-elem;            \
    elem <<= spare;                                \
    for (int b=nbyte-1; b>0; b--) {                \
      key[nradix+b][i] = (uint8_t)(elem & 0xff);   \
      elem >>= 8;                                  \
    }                                              \
    key[nradix][i] |= (uint8_t)(elem & 0xff);
    // this last |= squashes the most significant bits of this column into the spare of the previous
    // NB: retaining group order does not retain the appearance-order in the sense that DT[,,by=] does. Because we go column-by-column, the first column values
    // are grouped together. Whether we do byte-column-within-column doesn't alter this, and neither does bit packing across column boundaries.
    // if input data is grouped-by-column (not grouped-by-row) then this grouping strategy may help a lot.
    // if input data is in random order, then a consequence of grouping-by-column (whether sorted or not) is that the output ordering will not have
    // row-group-appearance-order built in; the result will still need to be resorted by row number of the first group item.
    // this was the case with the previous forder as well, which went by whole-column
    // So, we may as well bit-pack as done above.
    // Retaining appearance-order within key-byte-column is still advantageous for when we do encounter column-grouped data (to avoid data movement); and
    // the ordering will be appearance-order preserving in that case (just at the byte level).
    // TODO: in future we could provide an option to return 'any' group order for efficiency, which would be the byte-appearance order. But for now,
    // the forder afterwards on o__[f__] (in [.data.table) is not significant.
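    // Worked example of the bit squashing WRITE_KEY performs (illustrative): two columns whose
    // computed range each fits in 4 bits (maxBit==4, e.g. 14 distinct values plus the NA slot)
    // can share one key byte. The first column left-aligns its 4 bits (spare becomes 4); the
    // second column's firstBits==4 fits that spare exactly, so its bits are |='d into the same
    // byte with spare==0 afterwards: two 16-level columns in one byte, as the header comment says.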

    switch(TYPEOF(x)) {
    case INTSXP : case LGLSXP : {
      int32_t *xd = INTEGER(x);
      #pragma omp parallel for num_threads(getDTthreads(nrow, true))
      for (int i=0; i<nrow; i++) {
        uint64_t elem=0;
        if (xd[i]==NA_INTEGER) {  // TODO: go branchless if na_count==0
          if (nalast==-1) anso[i]=0;
          elem = naval;
        } else {
          elem = xd[i] ^ 0x80000000u;
        }
        WRITE_KEY
      }}
      break;
    case REALSXP :
      if (inherits(x, "integer64")) {
        int64_t *xd = (int64_t *)REAL(x);
        #pragma omp parallel for num_threads(getDTthreads(nrow, true))
        for (int i=0; i<nrow; i++) {
          uint64_t elem=0;
          if (xd[i]==INT64_MIN) {
            if (nalast==-1) anso[i]=0;
            elem = naval;
          } else {
            elem = xd[i] ^ 0x8000000000000000u;
          }
          WRITE_KEY
        }
      } else {
        double *xd = REAL(x);  // TODO: revisit double compression (skip bytes/mult by 10,100 etc) as currently it's often 6-8 bytes even for 3.14,3.15
        #pragma omp parallel for num_threads(getDTthreads(nrow, true))
        for (int i=0; i<nrow; i++) {
          uint64_t elem=0;
          if (!R_FINITE(xd[i])) {
            if (isinf(xd[i])) elem = signbit(xd[i]) ? min-1 : max+1;
            else {
              if (nalast==-1) anso[i]=0;  // for both NA and NaN
              elem = ISNA(xd[i]) ? naval : nanval;
            }
          } else {
            elem = dtwiddle(xd[i]);  // TODO: could avoid twiddle() if all positive finite, which could be known from range_d.
                                     // also R_FINITE is repeated within dtwiddle() currently, wastefully given the if() above
          }
          WRITE_KEY
        }
      }
      break;
    case STRSXP : {
      SEXP *xd = STRING_PTR(x);
      #pragma omp parallel for num_threads(getDTthreads(nrow, true))
      for (int i=0; i<nrow; i++) {
        uint64_t elem=0;
        if (xd[i]==NA_STRING) {
          if (nalast==-1) anso[i]=0;
          elem = naval;
        } else {
          elem = -TRUELENGTH(xd[i]);
        }
        WRITE_KEY
      }}
      free_ustr();  // ustr could be left allocated and reused, but free now in case it's large and we're tight on ram
      break;
    default:
      STOP(_("Internal error: column not supported, not caught earlier"));  // # nocov
    }
    nradix += nbyte-1+(spare==0);
    TEND(4)
    // Rprintf(_("Written key for column %d\n"), col);
  }
  if (key[nradix]!=NULL) nradix++;  // nradix is now the number of bytes in the key
#ifdef TIMING_ON
  Rprintf(_("nradix=%d\n"), nradix);
#endif

  nth = getDTthreads(nrow, true);  // this nth is relied on in cleanup()
  TMP  = (int *)malloc(nth*UINT16_MAX*sizeof(int));  // used by counting sort (my_n<=65535) in radix_r()
  UGRP = (uint8_t *)malloc(nth*256);  // TODO: align TMP and UGRP to cache lines (and do the same for stack allocations too)
  if (!TMP || !UGRP /*|| TMP%64 || UGRP%64*/) STOP(_("Failed to allocate TMP or UGRP or they weren't cache line aligned: nth=%d"), nth);
  if (retgrp) {
    gs_thread = calloc(nth, sizeof(int *));  // thread private group size buffers
    gs_thread_alloc = calloc(nth, sizeof(int));
    gs_thread_n = calloc(nth, sizeof(int));
    if (!gs_thread || !gs_thread_alloc || !gs_thread_n) STOP(_("Could not allocate (very tiny) group size thread buffers"));
  }
  if (nradix) {
    radix_r(0, nrow-1, 0);  // top level recursive call: (from, to, radix)
  } else {
    push(&nrow, 1);
  }

  TEND(30)

  if (anso[0]==1 && anso[nrow-1]==nrow && (nrow<3 || anso[nrow/2]==nrow/2+1)) {
    // There used to be an all_skipped shared bool. But even though it was safe to update that bool to false naked (without atomic protection) :
    //   i) there were a lot of updates to it from deeply iterated insert, so there were a lot of writes and that bool likely sat on a shared cache line
    //   ii) there were a lot of places in the code which needed to remember to set all_skipped properly. It's simpler code to just test now, almost instantly.
    // Alternatively, we could try and avoid creating anso[] until it's needed, but that has similar complexity issues as (ii)
    // Note that if nalast==-1 (remove NA) anso will contain 0's for the NAs and will be considered not-sorted.
    bool stop = false;
    #pragma omp parallel for num_threads(getDTthreads(nrow, true))
    for (int i=0; i<nrow; i++) {
      if (stop) continue;
      if (anso[i]!=i+1) stop=true;
    }
    if (!stop) {
      // data is already grouped or sorted; integer() returned with group sizes attached
      ans = PROTECT(allocVector(INTSXP, 0));  // can't attach attributes to NULL, hence an empty integer()
      n_protect++;
    }
  }
  TEND(31)

  if (retgrp) {
    SEXP tt;
    int final_gs_n = (gs_n==0) ? gs_thread_n[0] : gs_n;  // TODO: find a neater way to do this
    int *final_gs  = (gs_n==0) ? gs_thread[0] : gs;
    setAttrib(ans, sym_starts, tt = allocVector(INTSXP, final_gs_n));
    int *ss = INTEGER(tt);
    int maxgrpn = 0;
    for (int i=0, tmp=1; i<final_gs_n; i++) {
      int elem = final_gs[i];
      if (elem>maxgrpn) maxgrpn=elem;
      ss[i]=tmp;
      tmp+=elem;
    }
    setAttrib(ans, sym_maxgrpn, ScalarInteger(maxgrpn));
  }

  cleanup();
  UNPROTECT(n_protect);
  TEND(32);
#ifdef TIMING_ON
  {
    // first sum across threads
    for (int i=0; i<NBLOCK; i++) {
      for (int j=1; j<MAX_NTH; j++) {
        tblock[i] += tblock[j*NBLOCK + i];
        nblock[i] += nblock[j*NBLOCK + i];
      }
    }
    int last=NBLOCK-1;
    while (last>=0 && nblock[last]==0) last--;  // remove unused timing slots
    for (int i=0; i<=last; i++) {
      Rprintf(_("Timing block %2d%s = %8.3f   %8d\n"), i, (i>=17&&i<=19)?"(*)":"   ", tblock[i], nblock[i]);
    }
    for (int i=0; i<=256; i++) {
      if (stat[i]) Rprintf(_("stat[%03d]==%20"PRIu64"\n"), i, (uint64_t)stat[i]);
    }
  }
#endif
  return ans;
}

static bool sort_ugrp(uint8_t *x, const int n)
// x contains n unique bytes; sort them in-place using insert sort
// always ascending. desc and nalast are done in WRITE_KEY because columns may cross byte boundaries
// the maximum value for n is 256, which is one too big for uint8_t, hence int n
{
  bool skip = true;  // was x already sorted? if so, the caller can skip reordering
  for (int i=1; i<n; i++) {
    uint8_t tmp = x[i];
    if (tmp>x[i-1]) continue;  // x[i-1]==x[i] doesn't happen because x is unique
    skip = false;
    int j = i-1;
    do {
      x[j+1] = x[j];
    } while (--j>=0 && tmp<x[j]);
    x[j+1] = tmp;
  }
  return skip;
}
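
// Example of sort_ugrp (illustrative): on x=={5,1,3}, n==3 it reorders x to {1,3,5} and returns
// false; on already-ordered input like {1,3,5} it returns true (skip) without writing, letting
// the caller avoid reordering osub/ksub when the unique bytes were already in ascending order.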

void radix_r(const int from, const int to, const int radix) {
  TBEG();
  const int my_n = to-from+1;
  if (my_n==1) {  // minor TODO: batch up the 1's instead in the caller (and that's only needed when retgrp anyway)
    push(&my_n, 1);
    TEND(5);
    return;
  }
  else if (my_n<=256) {
    // if (nth==1) Rprintf(_("insert clause: radix=%d, my_n=%d, from=%d, to=%d\n"), radix, my_n, from, to);
    // insert sort with some twists:
    //   i) detects if grouped; if sortType==0 can then skip
    //   ii) keeps group appearance order at byte level to minimize movement
    #ifdef TIMING_ON
    // #pragma omp atomic   // turn on manually as the atomic affects timings
    // stat[my_n]++;
    #endif

    uint8_t *restrict my_key = key[radix]+from;  // safe to write to as we don't use this radix again
    uint8_t o[my_n];
    // if this is the last key (i.e. radix+1==nradix) there are no more keys to reorder, so we could reorder osub by reference directly and save allocating
    // and populating o just to use it once. However, o's type is uint8_t so many moves within this max-256 vector should be faster than many moves in osub
    // (4 byte or 8 byte ints) [a 1 byte type is always aligned]
    bool skip = true;
    if (sortType!=0) {
      // always ascending as desc (sortType==-1) was dealt with in WRITE_KEY
      int start = 1;
      while (start<my_n && my_key[start]>=my_key[start-1]) start++;
      if (start<my_n) {
        skip = false;  // finding start is really just to take skip out of the loop below
        for (int i=0; i<start; i++) o[i]=i;  // always at least sets o[0]=0
        for (int i=start; i<my_n; i++) {
          uint8_t ktmp = my_key[i];
          int j=i-1;
          while (j>=0 && ktmp<my_key[j]) {
            my_key[j+1] = my_key[j];
            o[j+1] = o[j];
            j--;
          }
          my_key[j+1] = ktmp;  // redundant write when while() did nothing, but that's unlikely given the common case of pre-ordered is handled by skip==true
          o[j+1] = i;          // important to initialize o[] even when while() did nothing
        }
      }
      TEND(6)
    } else {
      // sortType==0; retain group appearance order to hopefully benefit from skip, but there is still a sort afterwards to get back to appearance order
      // because columns are split into multiple bytes: retaining the byte appearance order within a key-byte is not the same as retaining the order of
      // unique values within each by= column. In other words, retaining group order at byte level here does not affect correctness, just efficiency.
      int second = 1;
      while (second<my_n && my_key[second]==my_key[0]) second++;  // look for the second different byte value
      int third = second+1;
      while (third<my_n && my_key[third]==my_key[second]) third++;  // look for the last of the second value (which might be a repeat of the first)
      if (third<my_n) {
        // it's not either i) all one value (xxxxx), or ii) two values (xxyyy). It could be xxyyyx.. or xxyyyz.. It's likely worth allocating seen[] now.
        bool seen[256]={false};
        seen[my_key[0]] = seen[my_key[second]] = true;  // the first two different bytes have been seen
        for(; third<my_n; third++) {
          uint8_t ktmp = my_key[third];
          if (ktmp==my_key[third-1]) continue;
          if (!seen[ktmp]) { seen[ktmp]=true; continue; }
          // else a different byte but we've seen it before, so this my_n is not grouped (e.g. xxyyyx)
          skip = false;
          break;
        }
        if (!skip) {
          // and now it's worth populating o[]
          for (int i=0; i<third; i++) o[i] = i;
          for (int i=third; i<my_n; i++) {
            uint8_t ktmp = my_key[i];
            if (!seen[ktmp]) { seen[ktmp]=true; o[i]=i; continue; }
            // move this byte that we've seen before back to just after the last one, to group them
            int j = i-1;
            while (j>=0 && ktmp!=my_key[j]) {
              my_key[j+1] = my_key[j];
              o[j+1] = o[j];
              j--;
            }
            my_key[j+1] = ktmp;  // redundant write if my_key[i]==my_key[i-1] and while() did nothing (but that's ok as that's at least some writing since skip==false)
            o[j+1] = i;          // important to initialize o[] even if while() did nothing
          }
        }
      }
      TEND(7)
    }
    if (!skip) {
      // reorder osub and each remaining ksub
      int TMP[my_n];  // on stack is fine since my_n is very small (<=256)
      const int *restrict osub = anso+from;
      for (int i=0; i<my_n; i++) TMP[i] = osub[o[i]];
      memcpy((int *restrict)(anso+from), TMP, my_n*sizeof(int));
      for (int r=radix+1; r<nradix; r++) {
        const uint8_t *restrict ksub = key[r]+from;
        for (int i=0; i<my_n; i++) ((uint8_t *)TMP)[i] = ksub[o[i]];
        memcpy((uint8_t *restrict)(key[r]+from), (uint8_t *)TMP, my_n);
      }
      TEND(8)
    }
    // my_key is now grouped (and sorted by group too if sort!=0)
    // all we have left to do is find the group sizes and either recurse or push
    if (radix+1==nradix && !retgrp) {
      return;
    }
    int ngrp=0, my_gs[my_n];  // minor TODO: could know the number of groups with certainty up above
    my_gs[ngrp]=1;
    for (int i=1; i<my_n; i++) {
      if (my_key[i]!=my_key[i-1]) my_gs[++ngrp] = 1;
      else my_gs[ngrp]++;
    }
    ngrp++;
    TEND(9)
    if (radix+1==nradix || ngrp==my_n) {  // ngrp==my_n => unique groups all size 1 and we can stop recursing now
      push(my_gs, ngrp);
    } else {
      for (int i=0, f=from; i<ngrp; i++) {
        radix_r(f, f+my_gs[i]-1, radix+1);
        f+=my_gs[i];
      }
    }
    return;
  }
  else if (my_n<=UINT16_MAX) {  // UINT16_MAX==65535 (important not 65536)
    // if (nth==1) Rprintf(_("counting clause: radix=%d, my_n=%d\n"), radix, my_n);
    uint16_t my_counts[256] = {0};  // Needs to be all-0 on entry. This ={0} initialization should be fast as it's on stack. Otherwise, we would have to manage
                                    // a stack of counts anyway since this is called recursively and these counts are needed to make the recursive calls.
                                    // This thread-private stack alloc has no chance of false sharing and gives omp and the compiler the best chance.
    uint8_t *restrict my_ugrp = UGRP + omp_get_thread_num()*256;  // uninitialized is fine; will use the first ngrp items. Only used if sortType==0
    // TODO: ensure my_counts, my_grp and my_tmp below are cache line aligned on both Linux and Windows.
    const uint8_t *restrict my_key = key[radix]+from;
    int ngrp = 0;      // number of groups (items in ugrp[]). Max value 256 but could be uint8_t later perhaps if 0 is understood as 1.
    bool skip = true;  // i) if already _grouped_ and sortType==0 then the caller can skip; ii) if already _grouped and sorted_ when sort!=0 then the caller can skip too
    TEND(10)
    if (sortType!=0) {
      // always an ascending sort here (even when sortType==-1) because descending was prepared in WRITE_KEY
      for (int i=0; i<my_n; ++i) my_counts[my_key[i]]++;  // minimal branch-free loop first, #3647
      for (int i=1; i<my_n; ++i) {
        if (my_key[i]<my_key[i-1]) { skip=false; break; }
        // stop early as soon as not-ordered is detected; likely quickly when it isn't sorted
        // otherwise, it's worth checking if it is ordered because skip saves time later
      }
      TEND(11)
    } else {
      for (int i=0; i<my_n; i++) {
        uint8_t elem = my_key[i];
        if (++my_counts[elem]==1) {
          // first time we've seen this value. i==0 always takes this branch
          my_ugrp[ngrp++]=elem;
        } else if (skip && elem!=my_key[i-1]) {  // does not happen for i==0
          // seen this value before and it isn't the previous value, so the data is not grouped
          // including "skip &&" first is to avoid the != comparison
          skip=false;
        }
      }
      TEND(12)
    }
    if (!skip) {
      // reorder anso and the remaining radix keys

      // avoid allocating and populating an order vector (my_n long); we just use the counts several times to push rather than pull
      // with contiguous-read from osub and ksub, 256 live write cache-lines is the worst case. However, often there are many fewer ugrp and only that number of
      // write cache lines will be active. These write-cache lines will be constrained within the UINT16_MAX width, so should be close by in cache, too.
      // If there is a good degree of grouping, then contiguous read/write both ways happens automatically in this approach.

      // cumulate; forwards-assign to give the cpu prefetch the best chance (the cpu may not support prefetch backwards)
      uint16_t my_starts[256], my_starts_copy[256];
      // TODO: could be allocated up front (like my_TMP below), or are they better on stack like this? TODO: allocating up front would allow cache-aligning them.
      if (sortType!=0) {
        for (int i=0, sum=0; i<256; i++) { int tmp=my_counts[i]; my_starts[i]=my_starts_copy[i]=sum; sum+=tmp; ngrp+=(tmp>0); }  // cumulate through 0's too (won't be used)
      } else {
        for (int i=0, sum=0; i<ngrp; i++) { uint8_t w=my_ugrp[i]; int tmp=my_counts[w]; my_starts[w]=my_starts_copy[w]=sum; sum+=tmp; }  // cumulate in ugrp appearance order
      }

      int *restrict my_TMP = TMP + omp_get_thread_num()*UINT16_MAX;  // Allocated up front to save malloc calls which i) block internally and ii) could fail
      if (radix==0 && nalast!=-1) {
        // anso contains 1:n so skip reading and copying it. Only happens when nrow<=65535. Saving worth the branch (untested) when the user repeatedly calls a small-n small-cardinality order.
        for (int i=0; i<my_n; i++) anso[my_starts[my_key[i]]++] = i+1;  // +1 as R is 1-based
        // The loop counter could be uint_fast16_t since the max i here will be UINT16_MAX-1 (65534), hence ++ after the last iteration won't overflow 16 bits. However, signed
        // integer counters have been chosen for now, as signed is probably very slightly faster than unsigned on most platforms from what I can gather.
      } else {
        const int *restrict osub = anso+from;
        for (int i=0; i<my_n; i++) my_TMP[my_starts[my_key[i]]++] = osub[i];
        memcpy(anso+from, my_TMP, my_n*sizeof(int));
      }
      TEND(13)

      // reorder the remaining key columns (radix+1 onwards). This could be done in one step too (a single pass through x[], with a larger TMP);
      // that's how it's done in the batched approach below. Which is better? The way here is multiple (but contiguous) passes through the (one-byte) my_key
      if (radix+1<nradix) {
        for (int r=radix+1; r<nradix; r++) {
          memcpy(my_starts, my_starts_copy, 256*sizeof(uint16_t));  // restore starting offsets
          //for (int i=0,last=0; i<256; i++) { int tmp=my_counts[i]; if (tmp==0) continue; my_counts[i]=last; last=tmp; }  // rewind ++'s to offsets
          const uint8_t *restrict ksub = key[r]+from;
          for (int i=0; i<my_n; i++) ((uint8_t *)my_TMP)[my_starts[my_key[i]]++] = ksub[i];
          memcpy(key[r]+from, my_TMP, my_n);
        }
        TEND(14)
      }
    }

    if (!retgrp && radix+1==nradix) {
      return;  // we're done. avoid allocating and populating the very last group sizes for the last key
    }
    int my_gs[ngrp==0 ? 256 : ngrp];  // ngrp==0 when sorting and skip==true; we didn't count the non-zeros in my_counts yet in that case
    if (sortType!=0) {
      ngrp=0;
      for (int i=0; i<256; i++) if (my_counts[i]) my_gs[ngrp++]=my_counts[i];  // this casts from uint16_t to int32, too
    } else {
      for (int i=0; i<ngrp; i++) my_gs[i]=my_counts[my_ugrp[i]];
    }
    TEND(15)
    if (radix+1==nradix) {
      // aside: cannot be all size 1 groups (a saving used in the my_n<=256 case above) because my_n>256 and ngrp<=256
      push(my_gs, ngrp);
    } else {
      // this single thread will now descend and resolve all groups, now that the groups are close in cache
      for (int i=0, my_from=from; i<ngrp; i++) {
        radix_r(my_from, my_from+my_gs[i]-1, radix+1);
        my_from+=my_gs[i];
      }
    }
    return;
  }
  // else parallel batches. This is called recursively but only once or maybe twice before resolving to the UINT16_MAX branch above

  int batchSize = MIN(UINT16_MAX, 1+my_n/getDTthreads(my_n, true));  // (my_n-1)/nBatch + 1;  //UINT16_MAX == 65535
  int nBatch = (my_n-1)/batchSize + 1;  // TODO: make nBatch a multiple of nThreads?
  int lastBatchSize = my_n - (nBatch-1)*batchSize;
  uint16_t *counts = calloc(nBatch*256, sizeof(uint16_t));
  uint8_t  *ugrps  = malloc(nBatch*256*sizeof(uint8_t));
  int      *ngrps  = calloc(nBatch,     sizeof(int));
  if (!counts || !ugrps || !ngrps) STOP(_("Failed to allocate parallel counts. my_n=%d, nBatch=%d"), my_n, nBatch);

  bool skip=true;
  const int n_rem = nradix-radix-1;  // how many radix bytes remain after this one
  TEND(16)
  #pragma omp parallel num_threads(getDTthreads(nBatch, false))
  {
    int *my_otmp = malloc(batchSize * sizeof(int));  // thread-private write
    uint8_t *my_ktmp = malloc(batchSize * sizeof(uint8_t) * n_rem);
    // TODO: move these up above and point restrict[me] to them. Easier to Error that way if they failed to alloc.
    #pragma omp for
    for (int batch=0; batch<nBatch; batch++) {
      const int my_n = (batch==nBatch-1) ? lastBatchSize : batchSize;  // lastBatchSize == batchSize when my_n is a multiple of batchSize
      const int my_from = from + batch*batchSize;
      uint16_t *restrict my_counts = counts + batch*256;
      uint8_t  *restrict my_ugrp   = ugrps  + batch*256;
      int  my_ngrp = 0;
      bool my_skip = true;
      const uint8_t *restrict my_key = key[radix] + my_from;
      const uint8_t *restrict byte = my_key;
      for (int i=0; i<my_n; i++, byte++) {
        if (++my_counts[*byte]==1) {  // always true the first time, when i==0
          my_ugrp[my_ngrp++] = *byte;
        } else if (my_skip && byte[0]!=byte[-1]) {  // include 'my_skip &&' to save the != comparison after it's realized this batch is not grouped
          my_skip=false;
        }
      }
      ngrps[batch] = my_ngrp;  // write once to this shared cache line
      if (!my_skip) {
        skip = false;  // naked write to this shared byte is ok because false is the only value written
        // gather this batch's anso and remaining keys. If we're sorting too, ugrp is sorted later for that. Here we want to benefit from skip within the batch
        // as much as possible, which is a good chance since batchSize is relatively small (65535)
        for (int i=0, sum=0; i<my_ngrp; i++) { int tmp = my_counts[my_ugrp[i]]; my_counts[my_ugrp[i]]=sum; sum+=tmp; }  // cumulate the counts of this batch
        const int *restrict osub = anso+my_from;
        byte = my_key;
        for (int i=0; i<my_n; i++, byte++) {
          int dest = my_counts[*byte]++;
          my_otmp[dest] = *osub++;  // wastefully copies out 1:n when radix==0, but do not optimize as unlikely worth the code complexity. my_otmp is not large, for example. Use the first TEND() to decide.
          for (int r=0; r<n_rem; r++) my_ktmp[r*my_n + dest] = key[radix+1+r][my_from+i];  // reorder the remaining keys
        }
        // or could do multiple passes through my_key like in the my_n<=65535 approach above. Test which is better depending on where TEND() points.

        // we haven't completed all batches, so we don't know where these groups should be placed yet
        // so for now we write the thread-private small now-grouped buffers back in place. The counts and groups across all batches will be used below to move these blocks.
        memcpy(anso+my_from, my_otmp, my_n*sizeof(int));
        for (int r=0; r<n_rem; r++) memcpy(key[radix+1+r]+my_from, my_ktmp+r*my_n, my_n*sizeof(uint8_t));

        // revert the cumulate back to counts, ready for the vertical cumulate below
        for (int i=0, last=0; i<my_ngrp; i++) { int tmp = my_counts[my_ugrp[i]]; my_counts[my_ugrp[i]]-=last; last=tmp; }
      }
    }
    free(my_otmp);
    free(my_ktmp);
  }
  TEND(17 + notFirst*3)  // 3 timings in this section: 17,18,19 for the first main split; 20,21,22 thereafter

  // If the my_n input is grouped and ugrp is sorted too (to illustrate), the status now would be :
  // counts:                 ugrps:   ngrps:
  // 1: 20 18  2  0  0  0    0 1 2    3
  // 2:  0  0 17 21  5  0    2 3 4    3
  // 3:  0  0  0  0 15 19    4 5      2
  // If the keys within each and every batch were grouped, skip will be true.
  // Now we test whether the groups occurred in order across batches (like the illustration above), and if not set skip=false.

  uint8_t ugrp[256];    // head(ugrp,ngrp) will contain the unique values in appearance order
  bool    seen[256];    // is the value present in ugrp already
  int     ngrp=0;       // max value 256 so not uint8_t
  uint8_t last_seen=0;  // the last grp seen in the previous batch. Initialized to 0 but that 0 is never used.
  for (int i=0; i<256; i++) seen[i]=false;
  for (int batch=0; batch<nBatch; batch++) {
    const uint8_t *restrict my_ugrp = ugrps + batch*256;
    if (ngrp==256 && !skip) break;  // no need to carry on
    for (int i=0; i<ngrps[batch]; i++) {
      if (!seen[my_ugrp[i]]) {
        seen[my_ugrp[i]] = true;
        ugrp[ngrp++] = last_seen = my_ugrp[i];
      } else if (skip && my_ugrp[i]!=last_seen) {  // ==last_seen would occur across batch boundaries, like 2=>17 and 5=>15 in the illustration above
        skip=false;
      }
    }
  }

  // If skip==true (my_n was pre-grouped) and
  //   i) sortType==0 (not sorting groups) then osub and ksub don't need reordering. ugrp may happen to be sorted too but nothing special needs doing in that case.
  //   ii) sortType==1|-1 and ugrp is already sorted then osub and ksub don't need reordering either.

  if (sortType!=0 && !sort_ugrp(ugrp, ngrp))
    skip=false;

  // now cumulate the counts vertically to see where the blocks in the batches should be placed in the result across all batches
  // the counts are uint16_t but the cumulate needs to be int32_t (or int64_t in future) to hold the offsets
  // If skip==true and we're already done, we still need the first row of this cumulate (diff'd to get the total group sizes) to push() or recurse below

  int *starts = calloc(nBatch*256, sizeof(int));  // keep starts the same shape and ugrp order as counts
  for (int j=0, sum=0; j<ngrp; j++) {  // iterate through the ngrp used byte values, i.e. the columns of counts
    uint16_t *tmp1 = counts+ugrp[j];
    int      *tmp2 = starts+ugrp[j];
    for (int batch=0; batch<nBatch; batch++) {
      *tmp2 = sum;
      tmp2 += 256;
      sum += *tmp1;
      tmp1 += 256;
    }
  }
  // the first row of starts (when diff'd) now contains the size of each group across all batches
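  // Continuing the illustration above: the first row of starts would be 0 20 38 57 78 98, whose
  // diffs 20 18 19 21 20 19 are the total group sizes across all three batches (e.g. the third
  // group totals 2+17==19 rows, contributed by batches 1 and 2).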
1150
  TEND(18 + notFirst*3)
  if (!skip) {
    int *TMP = malloc(my_n * sizeof(int));
    if (!TMP) STOP(_("Unable to allocate TMP for my_n=%d items in parallel batch counting"), my_n);
    #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
    for (int batch=0; batch<nBatch; batch++) {
      const int      *restrict my_starts = starts + batch*256;
      const uint16_t *restrict my_counts = counts + batch*256;
      const int      *restrict osub = anso + from + batch*batchSize;  // the groups sit here contiguously
      const uint8_t  *restrict byte = ugrps + batch*256;              // in appearance order, always logged here in ugrps
      const int my_ngrp = ngrps[batch];
      for (int i=0; i<my_ngrp; i++, byte++) {
        const uint16_t len = my_counts[*byte];
        memcpy(TMP+my_starts[*byte], osub, len*sizeof(int));
        osub += len;
      }
    }
    memcpy(anso+from, TMP, my_n*sizeof(int));

    for (int r=0; r<n_rem; r++) {  // TODO: groups of sizeof(anso) 4-byte int currently (in future 8), to save team startup cost (but unlikely significant anyway)
      #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
      for (int batch=0; batch<nBatch; batch++) {
        const int      *restrict my_starts = starts + batch*256;
        const uint16_t *restrict my_counts = counts + batch*256;
        const uint8_t  *restrict ksub = key[radix+1+r] + from + batch*batchSize;  // the groups sit here contiguously
        const uint8_t  *restrict byte = ugrps + batch*256;                        // in appearance order, always logged here in ugrps
        const int my_ngrp = ngrps[batch];
        for (int i=0; i<my_ngrp; i++, byte++) {
          const uint16_t len = my_counts[*byte];
          memcpy((uint8_t *)TMP + my_starts[*byte], ksub, len);
          ksub += len;
        }
      }
      memcpy(key[radix+1+r]+from, (uint8_t *)TMP, my_n);
    }
    free(TMP);
  }
  TEND(19 + notFirst*3)
  notFirst = true;

  int my_gs[ngrp];
  for (int i=1; i<ngrp; i++) my_gs[i-1] = starts[ugrp[i]] - starts[ugrp[i-1]];  // diff the first row of starts to get the group totals
  my_gs[ngrp-1] = my_n - starts[ugrp[ngrp-1]];

  if (radix+1==nradix) {
    // aside: unlike the my_n>256 case above, ngrp==my_n (all size-1 groups) isn't a possible short-circuit here, because my_n>65535 while ngrp<=256
    push(my_gs, ngrp);
    TEND(23)
  }
  else {
    // TODO: explicitly repeat the parallel batch for any skew bins
    bool anyBig = false;
    for (int i=0; i<ngrp; i++) if (my_gs[i]>UINT16_MAX) { anyBig=true; break; }
    if (anyBig) {
      // Likely they're all big. If so, each will go one-by-one and each will be parallel (the my_n>65535 case).
      // If only one or a few are big, then we have skew. The not-big ones will be <65535 and will be very
      // fast to run anyway relative to the big one(s), and the big one(s) will go in parallel. Hence skew
      // is dealt with.
      // We could try to separate out the not-big ones and do those in parallel first. However, it's critical
      // that the groups are flushed in order, otherwise the final group-size result won't be correct (i.e. it
      // won't be aligned with anso). This consideration is independent of maintaining group appearance order.
      // When we have skew, by design it's a special type of limited skew in that the small ones are fixed
      // to be small at <65535 (and hence fast anyway). In contrast, when all groups are 'big' (>65535) but one
      // or a few are significantly bigger than the others, they'll still go one-by-one here, each processed
      // in parallel internally. There is no nestedness here.
      for (int i=0; i<ngrp; i++) {
        int start = from + starts[ugrp[i]];
        radix_r(start, start+my_gs[i]-1, radix+1);
        flush();
      }
      TEND(24)
    } else {
      // all groups are <=65535 and radix_r() will handle each one single-threaded. Therefore, this time
      // it does make sense to start a parallel team, and there will be no nestedness here either.
      if (retgrp) {
        #pragma omp parallel for ordered schedule(dynamic) num_threads(getDTthreads(ngrp, false))
        for (int i=0; i<ngrp; i++) {
          int start = from + starts[ugrp[i]];
          radix_r(start, start+my_gs[i]-1, radix+1);
          #pragma omp ordered
          flush();
        }
      } else {
        // flush() is only relevant when retgrp==true, so save the redundant ordered clause
        #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(ngrp, false))
        for (int i=0; i<ngrp; i++) {
          int start = from + starts[ugrp[i]];
          radix_r(start, start+my_gs[i]-1, radix+1);
        }
      }
      TEND(25)
    }
  }
  free(counts);
  free(starts);
  free(ugrps);
  free(ngrps);
  TEND(26)
}


SEXP issorted(SEXP x, SEXP by)
{
  // Just checks whether x is ordered and returns FALSE early as soon as it isn't. Does not return the ordering if it is, unlike forder.
  // Always increasing order with NAs first.
  // Similar to base::is.unsorted but accepts NA at the beginning (standard in data.table and considered sorted) rather than
  // returning NA when any NA is present, and is multi-column.
  // TODO: test in big steps first to return faster if the unsortedness is at the end (a common case when rbind'ing data to the end)
  // These are all sequential accesses to x, so quick and cache efficient. Could be parallelized by checking continuity at batch boundaries.
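  /*
  For example, under these rules the integer vectors c(NA,1,2,2,3) and c(1,1,2) count as
  sorted while c(1,NA,2) does not (NA is only accepted at the beginning). A minimal sketch
  of the single-column integer case (int_sorted is a hypothetical helper, not part of the
  package); note NA_INTEGER==INT_MIN, so NAs at the front pass the comparison automatically :

    static bool int_sorted(const int *x, int n) {
      for (int i=1; i<n; i++)
        if (x[i] < x[i-1]) return false;  // early exit at the first descent
      return true;
    }
  */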

  if (!isNull(by) && !isInteger(by)) STOP(_("Internal error: issorted 'by' must be NULL or an integer vector"));
  if (isVectorAtomic(x) || length(by)==1) {
    // the one-column case is very common so specialize it by avoiding the column-type switch inside the row loop later
    if (length(by)==1) {
      if (INTEGER(by)[0]<1 || INTEGER(by)[0]>length(x)) STOP(_("issorted 'by' [%d] out of range [1,%d]"), INTEGER(by)[0], length(x));
      x = VECTOR_ELT(x, INTEGER(by)[0]-1);
    }
    const int n = length(x);
    if (n <= 1) return(ScalarLogical(TRUE));
    if (!isVectorAtomic(x)) STOP(_("is.sorted does not work on list columns"));
    int i=1;
    switch(TYPEOF(x)) {
    case INTSXP : case LGLSXP : {
      int *xd = INTEGER(x);
      while (i<n && xd[i]>=xd[i-1]) i++;
    } break;
    case REALSXP :
      if (inherits(x,"integer64")) {
        int64_t *xd = (int64_t *)REAL(x);
        while (i<n && xd[i]>=xd[i-1]) i++;
      } else {
        double *xd = REAL(x);
        while (i<n && dtwiddle(xd[i])>=dtwiddle(xd[i-1])) i++;  // TODO: loop over any NA or -Inf at the beginning and then proceed without dtwiddle() (but with rounding)
      }
      break;
    case STRSXP : {
      SEXP *xd = STRING_PTR(x);
      i = 0;
      while (i<n && xd[i]==NA_STRING) i++;
      if (i==n) return ScalarLogical(TRUE);  // all NA is considered sorted; also avoids reading xd[n] on the next line
      bool need = NEED2UTF8(xd[i]);
      i++;  // pass over the first non-NA_STRING
      while (i<n) {
        if (xd[i]==xd[i-1]) { i++; continue; }
        if (xd[i]==NA_STRING) break;
        if (!need) need = NEED2UTF8(xd[i]);
        if ((need ? strcmp(CHAR(ENC2UTF8(xd[i])), CHAR(ENC2UTF8(xd[i-1]))) :
                    strcmp(CHAR(xd[i]), CHAR(xd[i-1]))) < 0) break;
        i++;
      }
    } break;
    default :
      STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(x)));
    }
    return ScalarLogical(i==n);
  }
  const int ncol = length(by);
  const R_xlen_t nrow = xlength(VECTOR_ELT(x,0));
  // ncol>1
  // pre-save lookups to avoid the deep type switch later for each column on each row
  size_t *sizes = (size_t *)R_alloc(ncol, sizeof(size_t));
  const char **ptrs = (const char **)R_alloc(ncol, sizeof(char *));
  int *types = (int *)R_alloc(ncol, sizeof(int));
  for (int j=0; j<ncol; ++j) {
    int c = INTEGER(by)[j];
    if (c<1 || c>length(x)) STOP(_("issorted 'by' [%d] out of range [1,%d]"), c, length(x));
    SEXP col = VECTOR_ELT(x, c-1);
    sizes[j] = SIZEOF(col);
    switch(TYPEOF(col)) {
    case INTSXP: case LGLSXP:
      types[j] = 0;
      ptrs[j] = (const char *)INTEGER(col);
      break;
    case REALSXP:
      types[j] = inherits(col, "integer64") ? 2 : 1;
      ptrs[j] = (const char *)REAL(col);
      break;
    case STRSXP:
      types[j] = 3;
      ptrs[j] = (const char *)STRING_PTR(col);
      break;
    default:
      STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(col)));  // # nocov
    }
  }
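  /*
  The row loop below compares each row with the previous one column by column. The memcmp
  fast-path is valid because bitwise equality implies value equality for all four pre-saved
  types: for doubles, bitwise-equal covers identical NA/NaN payloads, and for CHARSXP
  pointers, pointer equality means string equality thanks to R's global character cache.
  So the type switch only runs on the first column where adjacent rows actually differ.
  */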
  for (R_xlen_t i=1; i<nrow; ++i) {
    int j = -1;
    while (++j<ncol) {
      size_t size = sizes[j];
      const char *colp = ptrs[j] + size*i;
      if (memcmp(colp, colp-size, size)==0) continue;  // in all but the last key column we see many repeats, so this saves the switch for those
      bool ok = false;
      switch (types[j]) {
      case 0 : {  // INTSXP, LGLSXP
        const int *p = (const int *)colp;
        ok = p[0]>p[-1];
      } break;
      case 1: {   // regular double in REALSXP
        const double *p = (const double *)colp;
        ok = dtwiddle(p[0])>dtwiddle(p[-1]);  // TODO: avoid dtwiddle by looping over any NA at the beginning, and remove NumericRounding
      } break;
      case 2: {   // integer64 in REALSXP
        const int64_t *p = (const int64_t *)colp;
        ok = p[0]>p[-1];
      } break;
      case 3 : {  // STRSXP
        const SEXP *p = (const SEXP *)colp;
        if (*p==NA_STRING) {
          ok = false;  // the previous value can't be NA (otherwise memcmp above would have been equal), so NA here can't be ordered
        } else {
          ok = (NEED2UTF8(p[0]) || NEED2UTF8(p[-1]) ?  // TODO: provide a user option to choose an ascii-only mode
                strcmp(CHAR(ENC2UTF8(p[0])), CHAR(ENC2UTF8(p[-1]))) :
                strcmp(CHAR(p[0]), CHAR(p[-1]))) >= 0;
        }
      } break;
      default :
        STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(x)));  // # nocov
      }
      if (!ok) return ScalarLogical(FALSE);  // not sorted, so return early
      break;  // this item is greater than the previous in this column, so ignore any remaining columns on this row
    }
  }
  return ScalarLogical(TRUE);
}

SEXP isOrderedSubset(SEXP x, SEXP nrowArg)
// specialized for use in [.data.table only
// Ignores 0s but heeds NAs and any out-of-range values (which result in NA)
{
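  /*
  Worked examples, derived from the rules above (with nrow=10) : c(1,0,0,3,5) is TRUE
  (0s are ignored), c(2,1) is FALSE (descends), and c(1,NA,3) is FALSE because
  NA_INTEGER==INT_MIN fails the elem<0 test below. A single NA (length(x)==1) is TRUE.
  */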
  if (!isNull(x) && !isInteger(x)) error(_("x must be either NULL or an integer vector"));
  if (length(x)<=1) return(ScalarLogical(TRUE));  // a single NA when length(x)==1 is ordered (e.g. tests 128 & 130) otherwise anyNA => FALSE
  if (!isInteger(nrowArg) || LENGTH(nrowArg)!=1) error(_("nrow must be integer vector length 1"));
  const int nrow = INTEGER(nrowArg)[0];
  if (nrow<0) error(_("nrow==%d but must be >=0"), nrow);
  const int *xd = INTEGER(x), xlen=LENGTH(x);
  for (int i=0, last=INT_MIN; i<xlen; ++i) {
    int elem = xd[i];
    if (elem==0) continue;
    if (elem<last || elem<0/*includes NA==INT_MIN*/ || elem>nrow)
      return(ScalarLogical(FALSE));
    last = elem;
  }
  return(ScalarLogical(TRUE));
}

SEXP binary(SEXP x)
// base::intToBits is close, but it prints the result as raw bytes ("00 00 00 00") rather than as bits ("0000")
{
  char buffer[69];
  int j;
  if (!isReal(x)) error(_("x must be type 'double'"));
  SEXP ans = PROTECT(allocVector(STRSXP, LENGTH(x)));
  uint64_t *xd = (uint64_t *)REAL(x);
  for (int i=0; i<LENGTH(x); i++) {
    uint64_t i64 = xd[i];
    j = 0;
    for (int bit=64; bit>=1; bit--) {
      buffer[j++] = '0' + (i64 >> (bit-1) & 1);
      if (bit==64 || bit==53 || bit==17 || bit==9) buffer[j++]=' ';
      // ^sign       ^exponent             ^last 2 bytes, to aid rounding
    }
    SET_STRING_ELT(ans, i, mkCharLen(buffer, 68));  // 64 bits + 4 spaces
  }
  UNPROTECT(1);
  return ans;
}
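
/*
Example output (illustrative): 1.0 is 0x3FF0000000000000, so binary(1) returns
"0 01111111111 000000000000000000000000000000000000 00000000 00000000"
i.e. the sign bit, the 11-bit exponent, then the 52-bit mantissa with its last two bytes
split off (the bytes that numeric rounding may zero).
*/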