1 #include "data.table.h"
2 /*
3   Inspired by :
4   icount in do_radixsort in src/main/sort.c @ rev 51389.
5   base::sort.list(method="radix") turns out not to be a radix sort, but a counting sort :
6     http://r.789695.n4.nabble.com/method-radix-in-sort-list-isn-t-actually-a-radix-sort-tp3309470p3309470.html
7   Wish granted for R to allow negatives (simple change) : https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=15644
8   Terdiman and Herf (LSB radix for floating point and other efficiency tricks) :
9   http://codercorner.com/RadixSortRevisited.htm
10   http://stereopsis.com/radix.html
11 
12   Previous version of this file was promoted into base R, see ?base::sort.
13   Denmark useR! presentation <link>
14   Stanford DSC presentation <link>
15   JSM presentation <link>
16   Techniques used :
17     skewed groups are split in parallel
18     finds unique bytes to save sweeping all 256 buckets
19     skips already-grouped yet unsorted
20     recursive group gathering for cache efficiency
21     reuses R's global character cache (truelength on CHARSXP)
22     compressed column can cross byte boundaries to use spare bits (e.g. 2 16-level columns in one byte)
23     just the remaining part of key is reordered as the radix progresses
24     columnar byte-key for within-radix MT cache efficiency
25 
26   Only forder() and dtwiddle() functions are meant for use by other C code in data.table, hence all other functions here are static.
27   The coding techniques deployed here are for efficiency. The static functions are recursive or called repetitively and we wish to minimise
28   overhead. They reach outside themselves to place results in the end result directly rather than returning many small pieces of memory.
29 */
30 
31 // #define TIMING_ON
32 
33 static int nth = 1;                 // number of threads to use, throttled by default; used by cleanup() to ensure no mismatch in getDTthreads() calls
34 static bool retgrp = true;          // return group sizes as well as the ordering vector? If so then use gs, gs_alloc and gs_n :
35 static int nrow = 0;                // used as group size stack allocation limit (when all groups are 1 row)
36 static int *gs = NULL;              // gs = final groupsizes e.g. 23,12,87,2,1,34,...
37 static int gs_alloc = 0;            // allocated size of gs
38 static int gs_n = 0;                // the number of groups found so far (how much of the allocated gs is used)
39 static int **gs_thread=NULL;        // each thread has a private buffer which gets flushed to the final gs appropriately
40 static int *gs_thread_alloc=NULL;
41 static int *gs_thread_n=NULL;
42 static int *TMP=NULL;               // UINT16_MAX*sizeof(int) for each thread; used by counting sort in radix_r()
43 static uint8_t *UGRP=NULL;          // 256 bytes for each thread; used by counting sort in radix_r() when sortType==0 (byte appearance order)
44 
45 static int  *cradix_counts = NULL;
46 static SEXP *cradix_xtmp   = NULL;
47 static SEXP *ustr = NULL;
48 static int ustr_alloc = 0;
49 static int ustr_n = 0;
50 static int ustr_maxlen = 0;
51 static int sortType = 0;             // 0 just group; -1 descending, +1 ascending
52 static int nalast = 0;               // 1 (true i.e. last), 0 (false i.e. first), -1 (na i.e. remove)
53 static int nradix = 0;
54 static uint8_t **key = NULL;
55 static int *anso = NULL;
56 static bool notFirst=false;
57 
58 static char msg[1001];
59 #define STOP(...) do {snprintf(msg, 1000, __VA_ARGS__); cleanup(); error(msg);} while(0)      // http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html#Swallowing-the-Semicolon
60 // use STOP in this file (not error()) to ensure cleanup() is called first
61 // snprintf to msg first in case nrow (just as an example) is provided in the message because cleanup() sets nrow to 0
62 #undef warning
63 #define warning(...) Do not use warning in this file                // since it can be turned to error via warn=2
64 /* Using OS realloc() in this file to benefit from (often) in-place realloc() to save copy
65  * We have to trap on exit anyway to call savetl_end().
66  * NB: R_alloc() would be more convenient (it errors internally on failure) and robust (auto free) but there is no R_realloc(). Implementing R_realloc() would be an alloc and copy, iiuc.
67  *     Calloc/Realloc needs to be Free'd, even before error() [R-exts$6.1.2]. An oom within Calloc causes a previous Calloc to leak, so Calloc would still need to be trapped anyway.
68  * Therefore, using <<if (!malloc()) STOP(_("helpful context msg"))>> approach to cleanup() on error.
69  */
70 
71 static void free_ustr() {
72   for(int i=0; i<ustr_n; i++)
73     SET_TRUELENGTH(ustr[i],0);
74   free(ustr); ustr=NULL;
75   ustr_alloc=0; ustr_n=0; ustr_maxlen=0;
76 }
77 
78 static void cleanup() {
79   free(gs); gs=NULL;
80   gs_alloc = 0;
81   gs_n = 0;
82 
83   if (gs_thread!=NULL) for (int i=0; i<nth; i++) free(gs_thread[i]);
84   free(gs_thread);       gs_thread=NULL;
85   free(gs_thread_alloc); gs_thread_alloc=NULL;
86   free(gs_thread_n);     gs_thread_n=NULL;
87 
88   free(TMP); TMP=NULL;
89   free(UGRP); UGRP=NULL;
90 
91   nrow = 0;
92   free(cradix_counts); cradix_counts=NULL;
93   free(cradix_xtmp);   cradix_xtmp=NULL;
94   free_ustr();
95   if (key!=NULL) { int i=0; while (key[i]!=NULL) free(key[i++]); }  // ==nradix, other than rare cases e.g. tests 1844.5-6 (#3940), and if a calloc fails
96   free(key); key=NULL; nradix=0;
97   savetl_end();  // Restore R's own usage of tl. Must run after the for loop in free_ustr() since only CHARSXP which had tl>0 (R's usage) are stored there.
98 }
99 
100 static void push(const int *x, const int n) {
101   if (!retgrp) return;  // clearer to have the switch here rather than before each call
102   int me = omp_get_thread_num();
103   int newn = gs_thread_n[me] + n;
104   if (gs_thread_alloc[me] < newn) {
105     gs_thread_alloc[me] = (newn < nrow/3) ? (1+(newn*2)/4096)*4096 : nrow;  // *2 not *3 to not overflow, and /3 not /2 to avoid allocating close to nrow (nrow groups occur when all groups are size 1)
106     gs_thread[me] = realloc(gs_thread[me], gs_thread_alloc[me]*sizeof(int));
107     if (gs_thread[me]==NULL) STOP(_("Failed to realloc thread private group size buffer to %d*4bytes"), (int)gs_thread_alloc[me]);
108   }
109   memcpy(gs_thread[me]+gs_thread_n[me], x, n*sizeof(int));
110   gs_thread_n[me] += n;
111 }
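// An illustrative sketch of the growth formula above, with assumed numbers (not from the original comments):
//   newn=100,  nrow=1e6 : 100  < nrow/3 so alloc = (1 + 200/4096)*4096  = 4096 ints
//   newn=3000, nrow=1e6 : 3000 < nrow/3 so alloc = (1 + 6000/4096)*4096 = 8192 ints
//   newn>=nrow/3        : alloc jumps straight to nrow, the hard upper bound on the number of groups
// i.e. the buffer grows in 4096-int pages to roughly twice the current requirement, then clamps at nrow.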
112 
113 static void flush() {
114   if (!retgrp) return;
115   int me = omp_get_thread_num();
116   int n = gs_thread_n[me];
117   int newn = gs_n + n;
118   if (gs_alloc < newn) {
119     gs_alloc = (newn < nrow/3) ? (1+(newn*2)/4096)*4096 : nrow;
120     gs = realloc(gs, gs_alloc*sizeof(int));
121     if (gs==NULL) STOP(_("Failed to realloc group size result to %d*4bytes"), (int)gs_alloc);
122   }
123   memcpy(gs+gs_n, gs_thread[me], n*sizeof(int));
124   gs_n += n;
125   gs_thread_n[me] = 0;
126 }
127 
128 #ifdef TIMING_ON
129   #define NBLOCK 64
130   #define MAX_NTH 256
131   static double tblock[MAX_NTH*NBLOCK];
132   static int nblock[MAX_NTH*NBLOCK];
133   static uint64_t stat[257];
134   #define TBEG() double tstart = wallclock();   // tstart declared locally for thread safety
135   #define TEND(i) {  \
136     double now = wallclock(); \
137     int w = omp_get_thread_num()*NBLOCK + i; \
138     tblock[w] += now-tstart; \
139     nblock[w]++; \
140     tstart = now; \
141   }
142 #else
143   #define TBEG()
144   #define TEND(i)
145 #endif
146 
147 // range_* functions return [min,max] of the non-NAs as common uint64_t type
148 // TODO parallelize these; not a priority according to TIMING_ON though (contiguous read with prefetch)
149 
150 static void range_i32(const int32_t *x, const int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
151 {
152   int32_t min = NA_INTEGER;
153   int32_t max = NA_INTEGER;
154   int i=0;
155   while(i<n && x[i]==NA_INTEGER) i++;
156   int na_count = i;
157   if (i<n) max = min = x[i++];
158   for(; i<n; i++) {
159     int tmp = x[i];
160     if (tmp>max) max=tmp;
161     else if (tmp<min) {
162       if (tmp==NA_INTEGER) na_count++;
163       else min=tmp;
164     }
165   }
166   *out_na_count = na_count;
167   *out_min = min ^ 0x80000000u;  // map [-2147483648(INT32_MIN), 2147483647(INT32_MAX)] => [0, 4294967295(UINT32_MAX)]
168   *out_max = max ^ 0x80000000u;
169 }
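// A worked sketch of the sign-bit XOR above (illustration only); flipping the sign bit maps two's-complement
// int32 onto unsigned so that numeric order is preserved and NA_INTEGER (==INT32_MIN) lands at 0 :
//   INT32_MIN (-2147483648) : 0x80000000 ^ 0x80000000 = 0x00000000 (0)
//   -1                      : 0xFFFFFFFF ^ 0x80000000 = 0x7FFFFFFF (2147483647)
//    0                      : 0x00000000 ^ 0x80000000 = 0x80000000 (2147483648)
//   INT32_MAX ( 2147483647) : 0x7FFFFFFF ^ 0x80000000 = 0xFFFFFFFF (4294967295)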
170 
171 static void range_i64(int64_t *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
172 {
173   int64_t min = INT64_MIN;
174   int64_t max = INT64_MIN;
175   int i=0;
176   while(i<n && x[i]==INT64_MIN) i++;
177   int na_count = i;
178   if (i<n) max = min = x[i++];
179   for(; i<n; i++) {
180     int64_t tmp = x[i];
181     if (tmp>max) max=tmp;
182     else if (tmp<min) {
183       if (tmp==INT64_MIN) na_count++;
184       else min=tmp;
185     }
186   }
187   *out_na_count = na_count;
188   // map [INT64_MIN, INT64_MAX] => [0, UINT64_MAX]
189   *out_min = min ^ 0x8000000000000000u;
190   *out_max = max ^ 0x8000000000000000u;
191 }
192 
193 static void range_d(double *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, int *out_infnan_count)
194 // return range of finite numbers (excluding NA, NaN, -Inf, +Inf), a count of NA and a count of Inf|-Inf|NaN
195 {
196   uint64_t min=0, max=0;
197   int na_count=0, infnan_count=0;
198   int i=0;
199   while(i<n && !R_FINITE(x[i])) { ISNA(x[i++]) ? na_count++ : infnan_count++; }
200   if (i<n) { max = min = dtwiddle(x[i++]); }
201   for(; i<n; i++) {
202     if (!R_FINITE(x[i])) { ISNA(x[i]) ? na_count++ : infnan_count++; continue; }
203     uint64_t tmp = dtwiddle(x[i]);
204     if (tmp>max) max=tmp;
205     else if (tmp<min) min=tmp;
206   }
207   *out_na_count = na_count;
208   *out_infnan_count = infnan_count;
209   *out_min = min;
210   *out_max = max;
211 }
212 
213 // used by bmerge only; chmatch uses coerceUtf8IfNeeded
214 int StrCmp(SEXP x, SEXP y)
215 {
216   if (x == y) return 0;             // same cached pointer (including NA_STRING==NA_STRING)
217   if (x == NA_STRING) return -1;    // x<y
218   if (y == NA_STRING) return 1;     // x>y
219   return strcmp(CHAR(x), CHAR(y));  // bmerge calls ENC2UTF8 on x and y before passing here
220 }
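// Illustrative behaviour (assumed example values, not additional tests): StrCmp(NA_STRING, s) is -1 for any
// non-NA s, so NA sorts first; two equal strings in the same encoding share one CHARSXP in R's global cache
// so the x==y fast path returns 0 without calling strcmp; otherwise e.g. "apple" vs "banana" falls through
// to strcmp and returns a negative value.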
221 
222 static void cradix_r(SEXP *xsub, int n, int radix)
223 // xsub is a unique set of CHARSXP, to be ordered by reference
224 // First time, radix==0, and xsub==x. Then recursively moves SEXP together for cache efficiency.
225 // Quite different to iradix because
226 //   1) x is known to be unique so fits in cache (wide random access not an issue)
227 //   2) they're variable length character strings
228 //   3) no need to maintain o.  Just simply reorder x. No grps or push.
229 {
230   if (n<=1) return;
231   int *thiscounts = cradix_counts + radix*256;
232   uint8_t lastx = 0;  // the last x is used to test its bin
233   for (int i=0; i<n; i++) {
234     lastx = radix<LENGTH(xsub[i]) ? (uint8_t)(CHAR(xsub[i])[radix]) : 1;  // no NA_STRING present,  1 for "" (could use 0 too maybe since NA_STRING not present)
235     thiscounts[ lastx ]++;
236   }
237   if (thiscounts[lastx]==n && radix<ustr_maxlen-1) {
238     cradix_r(xsub, n, radix+1);
239     thiscounts[lastx] = 0;  // all x same value, the rest must be 0 already, save the memset
240     return;
241   }
242   int itmp = thiscounts[0];
243   for (int i=1; i<256; i++) {
244     if (thiscounts[i]) thiscounts[i] = (itmp += thiscounts[i]);  // don't cumulate through 0s, important below
245   }
246   for (int i=n-1; i>=0; i--) {
247     uint8_t thisx = radix<LENGTH(xsub[i]) ? (uint8_t)(CHAR(xsub[i])[radix]) : 1;
248     cradix_xtmp[--thiscounts[thisx]] = xsub[i];
249   }
250   memcpy(xsub, cradix_xtmp, n*sizeof(SEXP));
251   if (radix == ustr_maxlen-1) {
252     memset(thiscounts, 0, 256*sizeof(int));
253     return;
254   }
255   if (thiscounts[0] != 0) STOP(_("Logical error. counts[0]=%d in cradix but should have been decremented to 0. radix=%d"), thiscounts[0], radix);
256   itmp = 0;
257   for (int i=1; i<256; i++) {
258     if (thiscounts[i] == 0) continue;
259     int thisgrpn = thiscounts[i] - itmp;  // undo cumulate; i.e. diff
260     cradix_r(xsub+itmp, thisgrpn, radix+1);
261     itmp = thiscounts[i];
262     thiscounts[i] = 0;  // set to 0 now since we're here, saves memset afterwards. Important to do this ready for this memory's reuse
263   }
264   if (itmp<n-1) cradix_r(xsub+itmp, n-itmp, radix+1);  // final group
265 }
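// A small worked pass (assumed example): xsub={"ba","ab","aa"}, radix==0 gives thiscounts['a']=2, ['b']=1;
// after cumulating (zeros skipped) thiscounts['a']=2, ['b']=3. The backwards loop then fills cradix_xtmp as
// {"ab","aa","ba"} and the recursion at radix==1 orders the leading 'a' group, giving {"aa","ab","ba"}.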
266 
267 static void cradix(SEXP *x, int n)
268 {
269   cradix_counts = (int *)calloc(ustr_maxlen*256, sizeof(int));  // counts for the letters of left-aligned strings
270   if (!cradix_counts) STOP(_("Failed to alloc cradix_counts"));
271   cradix_xtmp = (SEXP *)malloc(ustr_n*sizeof(SEXP));
272   if (!cradix_xtmp) STOP(_("Failed to alloc cradix_tmp"));
273   cradix_r(x, n, 0);
274   free(cradix_counts); cradix_counts=NULL;
275   free(cradix_xtmp);   cradix_xtmp=NULL;
276 }
277 
278 static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count)
279 // group numbers are left in truelength to be fetched by WRITE_KEY
280 {
281   int na_count=0;
282   bool anyneedutf8=false;
283   if (ustr_n!=0) STOP(_("Internal error: ustr isn't empty when starting range_str: ustr_n=%d, ustr_alloc=%d"), ustr_n, ustr_alloc);  // # nocov
284   if (ustr_maxlen!=0) STOP(_("Internal error: ustr_maxlen isn't 0 when starting range_str"));  // # nocov
285   // savetl_init() has already been called at the start of forder
286   #pragma omp parallel for num_threads(getDTthreads(n, true))
287   for(int i=0; i<n; i++) {
288     SEXP s = x[i];
289     if (s==NA_STRING) {
290       #pragma omp atomic update
291       na_count++;
292       continue;
293     }
294     if (TRUELENGTH(s)<0) continue;  // seen this group before
295     #pragma omp critical
296     if (TRUELENGTH(s)>=0) {  // another thread may have set it while I was waiting, so check it again
297       if (TRUELENGTH(s)>0)   // save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore
298         savetl(s);           // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much.
299       // now save unique SEXP in ustr so i) we can loop through them afterwards and reset TRUELENGTH to 0 and ii) sort uniques when sorting too
300       if (ustr_alloc<=ustr_n) {
301         ustr_alloc = (ustr_alloc==0) ? 16384 : ustr_alloc*2;  // small initial guess, negligible time to alloc 128KB (32 pages)
302         if (ustr_alloc>n) ustr_alloc = n;  // clamp at n. Reaches n when fully unique (no dups)
303         ustr = realloc(ustr, ustr_alloc * sizeof(SEXP));
304         if (ustr==NULL) STOP(_("Unable to realloc %d * %d bytes in range_str"), ustr_alloc, (int)sizeof(SEXP));  // # nocov
305       }
306       ustr[ustr_n++] = s;
307       SET_TRUELENGTH(s, -ustr_n);  // unique in any order is fine. first-appearance order is achieved later in count_group
308       if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
309       if (!anyneedutf8 && NEED2UTF8(s)) anyneedutf8=true;
310     }
311   }
312   *out_na_count = na_count;
313   if (ustr_n==0) {  // all na
314     *out_min = 0;
315     *out_max = 0;
316     return;
317   }
318   if (anyneedutf8) {
319     SEXP ustr2 = PROTECT(allocVector(STRSXP, ustr_n));
320     for (int i=0; i<ustr_n; i++) SET_STRING_ELT(ustr2, i, ENC2UTF8(ustr[i]));
321     SEXP *ustr3 = (SEXP *)malloc(ustr_n * sizeof(SEXP));
322     if (!ustr3) STOP(_("Failed to alloc ustr3 when converting strings to UTF8"));  // # nocov
323     memcpy(ustr3, STRING_PTR(ustr2), ustr_n*sizeof(SEXP));
324     // reset ustr_maxlen because we now need the max length of the UTF-8 strings
325     ustr_maxlen = 0;
326     for (int i=0; i<ustr_n; i++) {
327       SEXP s = ustr3[i];
328       if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
329       if (TRUELENGTH(s)>0) savetl(s);
330     }
331     cradix(ustr3, ustr_n);  // sort to detect possible duplicates after converting; e.g. two different non-utf8 map to the same utf8
332     SET_TRUELENGTH(ustr3[0], -1);
333     int o = -1;
334     for (int i=1; i<ustr_n; i++) {
335       if (ustr3[i] == ustr3[i-1]) continue;  // use the same o for duplicates
336       SET_TRUELENGTH(ustr3[i], --o);
337     }
338     // now use the 1-1 mapping from ustr to ustr2 to get the ordering back into original ustr, being careful to reset tl to 0
339     int *tl = (int *)malloc(ustr_n * sizeof(int));
340     if (!tl) STOP(_("Failed to alloc tl when converting strings to UTF8"));  // # nocov
341     SEXP *tt = STRING_PTR(ustr2);
342     for (int i=0; i<ustr_n; i++) tl[i] = TRUELENGTH(tt[i]);   // fetches the o in ustr3 into tl which is ordered by ustr
343     for (int i=0; i<ustr_n; i++) SET_TRUELENGTH(ustr3[i], 0);    // reset to 0 tl of the UTF8 (and possibly non-UTF in ustr too)
344     for (int i=0; i<ustr_n; i++) SET_TRUELENGTH(ustr[i], tl[i]); // put back the o into ustr's tl
345     free(tl);
346     free(ustr3);
347     UNPROTECT(1);
348     *out_min = 1;
349     *out_max = -o;  // could be less than ustr_n if there are duplicates in the utf8s
350   } else {
351     *out_min = 1;
352     *out_max = ustr_n;
353     if (sortType) {
354       // note that this is always ascending; descending is done in WRITE_KEY using max-this
355       cradix(ustr, ustr_n);  // sorts ustr in-place by reference. assumes NA_STRING not present.
356       for(int i=0; i<ustr_n; i++)     // save ordering in the CHARSXP. negative so as to distinguish with R's own usage.
357         SET_TRUELENGTH(ustr[i], -i-1);
358     }
359     // else group appearance order was already saved to tl in the first pass
360   }
361 }
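// An assumed illustrative run for the sortType!=0, all-ASCII branch: x={"b","a","b","c"} stores uniques in
// first-seen order, ustr={"b","a","c"} with TRUELENGTH -1,-2,-3. cradix() then sorts ustr to {"a","b","c"}
// and the loop re-marks TRUELENGTH("a")=-1, ("b")=-2, ("c")=-3, so WRITE_KEY later reads -TRUELENGTH as the
// 1-based rank; *out_min=1 and *out_max=ustr_n=3.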
362 
363 static int dround=0;      // No rounding by default, for now. Handles #1642, #1728, #1463, #485
364 static uint64_t dmask=0;
365 
366 SEXP setNumericRounding(SEXP droundArg)
367 // init.c has initial call with default of 2
368 {
369   if (!isInteger(droundArg) || LENGTH(droundArg)!=1) error(_("Must be an integer or numeric vector length 1"));
370   if (INTEGER(droundArg)[0] < 0 || INTEGER(droundArg)[0] > 2) error(_("Must be 2, 1 or 0"));
371   dround = INTEGER(droundArg)[0];
372   dmask = dround ? 1 << (8*dround-1) : 0;
373   return R_NilValue;
374 }
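// Worked values for the mask above (illustration): dround==2 gives dmask = 1<<15 = 0x8000 (the 16th bit),
// dround==1 gives dmask = 1<<7 = 0x80 (the 8th bit), and dround==0 gives dmask = 0 (no rounding).
// dtwiddle() below uses dmask to round up before chopping the last dround bytes off the 8-byte double.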
375 
376 SEXP getNumericRounding()
377 {
378   return ScalarInteger(dround);
379 }
380 
381 int getNumericRounding_C()
382 // for use in uniqlist.c
383 {
384   return dround;
385 }
386 
387 // for signed integers it's easy: flip sign bit to swap positives and negatives; the resulting unsigned is in the right order with INT_MIN ending up as 0
388 // for floating point finite you have to flip the other bits too if it was signed: http://stereopsis.com/radix.html
389 uint64_t dtwiddle(double x) //const void *p, int i)
390 {
391   union {
392     double d;
393     uint64_t u64;
394   } u;  // local for thread safety
395   u.d = x; //((double *)p)[i];
396   if (R_FINITE(u.d)) {
397     if (u.d==0) u.d=0; // changes -0.0 to 0.0,  issue #743
398     u.u64 ^= (u.u64 & 0x8000000000000000) ? 0xffffffffffffffff : 0x8000000000000000; // always flip sign bit and if negative (sign bit was set) flip other bits too
399     u.u64 += (u.u64 & dmask) << 1/*is this shift really correct. No need to shift*/  ;   // when dround==1|2, if 8th|16th bit is set, round up before chopping last 1|2 bytes
400     return u.u64 >> (dround*8);
401   }
402   if (ISNAN(u.d)) return ISNA(u.d)    ? 0 /*NA*/   : 1 /*NaN*/;  // also normalises a difference between NA on 32bit R (bit 13 set) and 64bit R (bit 13 not set)
403   if (isinf(u.d)) return signbit(u.d) ? 2 /*-Inf*/ : (0xffffffffffffffff>>(dround*8)) /*+Inf*/;
404   STOP(_("Unknown non-finite value; not NA, NaN, -Inf or +Inf"));  // # nocov
405 }
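// A worked example of the twiddle with dround==0 (illustrative IEEE-754 bit patterns):
//    1.0 = 0x3FF0000000000000 : sign bit clear, only the sign bit flips -> 0xBFF0000000000000
//    0.5 = 0x3FE0000000000000 : sign bit clear                          -> 0xBFE0000000000000
//   -1.0 = 0xBFF0000000000000 : sign bit set, all bits flip             -> 0x400FFFFFFFFFFFFF
// giving the unsigned order -1.0 (0x400F..) < 0.5 (0xBFE0..) < 1.0 (0xBFF0..) as required.
// With dround==2 the (rounded) value is also shifted right by 16 bits so the last two bytes never
// influence the ordering.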
406 
407 void radix_r(const int from, const int to, const int radix);
408 
409 SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg)
410 // sortGroups TRUE from setkey and regular forder, FALSE from by= for efficiency so strings don't have to be sorted and can be left in appearance order
411 // when sortGroups is TRUE, ascArg contains +1/-1 for ascending/descending of each by column; when FALSE ascArg is ignored
412 {
413 
414 #ifdef TIMING_ON
415   memset(tblock, 0, MAX_NTH*NBLOCK*sizeof(double));
416   memset(nblock, 0, MAX_NTH*NBLOCK*sizeof(int));
417   memset(stat,   0, 257*sizeof(uint64_t));
418   TBEG()
419 #endif
420 
421   int n_protect = 0;
422   const bool verbose = GetVerbose();
423 
424   if (!isNewList(DT)) {
425     if (!isVectorAtomic(DT))
426       STOP(_("Internal error: input is not either a list of columns, or an atomic vector."));  // # nocov; caught by colnamesInt at R level, test 1962.0472
427     if (!isNull(by))
428       STOP(_("Internal error: input is an atomic vector (not a list of columns) but by= is not NULL"));  // # nocov; caught at R level, test 1962.043
429     if (!isInteger(ascArg) || LENGTH(ascArg)!=1)
430       STOP(_("Input is an atomic vector (not a list of columns) but order= is not a length 1 integer"));
431     if (verbose)
432       Rprintf(_("forder.c received a vector type '%s' length %d\n"), type2char(TYPEOF(DT)), length(DT));
433     SEXP tt = PROTECT(allocVector(VECSXP, 1)); n_protect++;
434     SET_VECTOR_ELT(tt, 0, DT);
435     DT = tt;
436     by = PROTECT(allocVector(INTSXP, 1)); n_protect++;
437     INTEGER(by)[0] = 1;
438   } else {
439     if (verbose)
440       Rprintf(_("forder.c received %d rows and %d columns\n"), length(VECTOR_ELT(DT,0)), length(DT));
441   }
442   if (!length(DT))
443     STOP(_("Internal error: DT is an empty list() of 0 columns"));  // # nocov  should have been caught be colnamesInt, test 2099.1
444   if (!isInteger(by) || !LENGTH(by))
445     STOP(_("Internal error: DT has %d columns but 'by' is either not integer or is length 0"), length(DT));  // # nocov  colnamesInt catches, 2099.2
446   if (!isInteger(ascArg) || LENGTH(ascArg)!=LENGTH(by))
447     STOP(_("Either order= is not integer or its length (%d) is different to by='s length (%d)"), LENGTH(ascArg), LENGTH(by));
448   nrow = length(VECTOR_ELT(DT,0));
449   int n_cplx = 0;
450   for (int i=0; i<LENGTH(by); i++) {
451     int by_i = INTEGER(by)[i];
452     if (by_i < 1 || by_i > length(DT))
453       STOP(_("internal error: 'by' value %d out of range [1,%d]"), by_i, length(DT)); // # nocov # R forderv already catch that using C colnamesInt
454     if ( nrow != length(VECTOR_ELT(DT, by_i-1)) )
455       STOP(_("Column %d is length %d which differs from length of column 1 (%d)\n"), INTEGER(by)[i], length(VECTOR_ELT(DT, INTEGER(by)[i]-1)), nrow);
456     if (TYPEOF(VECTOR_ELT(DT, by_i-1)) == CPLXSXP) n_cplx++;
457   }
458   if (!isLogical(retGrpArg) || LENGTH(retGrpArg)!=1 || INTEGER(retGrpArg)[0]==NA_LOGICAL)
459     STOP(_("retGrp must be TRUE or FALSE"));
460   retgrp = LOGICAL(retGrpArg)[0]==TRUE;
461   if (!isLogical(sortGroupsArg) || LENGTH(sortGroupsArg)!=1 || INTEGER(sortGroupsArg)[0]==NA_LOGICAL )
462     STOP(_("sort must be TRUE or FALSE"));
463   sortType = LOGICAL(sortGroupsArg)[0]==TRUE;   // if sortType is 1, it is later flipped between +1/-1 according to ascArg. Otherwise ascArg is ignored when sortType==0
464   if (!retgrp && !sortType)
465     STOP(_("At least one of retGrp= or sort= must be TRUE"));
466   if (!isLogical(naArg) || LENGTH(naArg) != 1)
467     STOP(_("na.last must be logical TRUE, FALSE or NA of length 1"));
468   nalast = (LOGICAL(naArg)[0] == NA_LOGICAL) ? -1 : LOGICAL(naArg)[0]; // 1=na last, 0=na first (default), -1=remove na
469 
470   if (nrow==0) {
471     // empty vector or 0-row DT is always sorted
472     SEXP ans = PROTECT(allocVector(INTSXP, 0)); n_protect++;
473     if (retgrp) {
474       setAttrib(ans, sym_starts, allocVector(INTSXP, 0));
475       setAttrib(ans, sym_maxgrpn, ScalarInteger(0));
476     }
477     UNPROTECT(n_protect);
478     return ans;
479   }
480   // if n==1, the code is left to proceed below in case one or more of the 1-row by= columns are NA and na.last=NA. Otherwise it would be easy to return now.
481   notFirst = false;
482 
483   SEXP ans = PROTECT(allocVector(INTSXP, nrow)); n_protect++;
484   anso = INTEGER(ans);
485   TEND(0)
486   #pragma omp parallel for num_threads(getDTthreads(nrow, true))
487   for (int i=0; i<nrow; i++) anso[i]=i+1;   // gdb 8.1.0.20180409-git very slow here, oddly
488   TEND(1)
489   savetl_init();   // from now on use Error not error
490 
491   int ncol=length(by);
492   int keyAlloc = (ncol+n_cplx)*8 + 1;         // +1 for NULL to mark end; calloc to initialize with NULLs
493   key = calloc(keyAlloc, sizeof(uint8_t *));  // needs to be before loop because part II relies on part I, column-by-column.
494   if (!key)
495     STOP(_("Unable to allocate %"PRIu64" bytes of working memory"), (uint64_t)keyAlloc*sizeof(uint8_t *));  // # nocov
496   nradix=0; // the current byte we're writing this column to; might be squashing into it (spare>0)
497   int spare=0;  // the amount of bits remaining on the right of the current nradix byte
498   bool isReal=false;
499   bool complexRerun = false;   // see comments below in CPLXSXP case
500   SEXP CplxPart = R_NilValue;
501   if (n_cplx) { CplxPart=PROTECT(allocVector(REALSXP, nrow)); n_protect++; } // one alloc is reused for each part
502   TEND(2);
503   for (int col=0; col<ncol; col++) {
504     // Rprintf(_("Finding range of column %d ...\n"), col);
505     SEXP x = VECTOR_ELT(DT,INTEGER(by)[col]-1);
506     uint64_t min=0, max=0;     // min and max of non-NA finite values
507     int na_count=0, infnan_count=0;
508     if (sortType) {
509       sortType=INTEGER(ascArg)[col];  // if sortType!=0 (not first-appearance) then +1/-1 comes from ascArg.
510       if (sortType!=1 && sortType!=-1)
511         STOP(_("Item %d of order (ascending/descending) is %d. Must be +1 or -1."), col+1, sortType);
512     }
513     //Rprintf(_("sortType = %d\n"), sortType);
514     switch(TYPEOF(x)) {
515     case INTSXP : case LGLSXP :  // TODO skip LGL and assume range [0,1]
516       range_i32(INTEGER(x), nrow, &min, &max, &na_count);
517       break;
518     case CPLXSXP : {
519       // treat as if two separate columns of double
520       const Rcomplex *xd = COMPLEX(x);
521       double *tmp = REAL(CplxPart);
522       if (!complexRerun) {
523         for (int i=0; i<nrow; ++i) tmp[i] = xd[i].r;  // extract the real part on the first time
524         complexRerun = true;
525         col--;  // cause this loop iteration to rerun; decrement now in case of early continue below
526       } else {
527         for (int i=0; i<nrow; ++i) tmp[i] = xd[i].i;
528         complexRerun = false;
529       }
530       x = CplxPart;
531     } // !no break! so as to fall through to REAL case
532     case REALSXP :
533       if (INHERITS(x, char_integer64)) {
534         range_i64((int64_t *)REAL(x), nrow, &min, &max, &na_count);
535       } else {
536         if (verbose && INHERITS(x, char_Date) && INTEGER(isReallyReal(x))[0]==0) {
537           Rprintf(_("\n*** Column %d passed to forder is a date stored as an 8 byte double but no fractions are present. Please consider a 4 byte integer date such as IDate to save space and time.\n"), col+1);
538           // Note the (slightly expensive) isReallyReal will only run when verbose is true. Prefix '***' just to make it stand out in verbose output
539           // In future this could be upgraded to option warning. But I figured that's what we use verbose to do (to trace problems and look for efficiencies).
540           // If an automatic coerce is desired (see discussion in #1738) then this is the point to do that in this file. Move the INTSXP case above to be
541           // next, do the coerce of Date to integer now to a tmp, and then let this case fall through to INTSXP in the same way as CPLXSXP falls through to REALSXP.
542         }
543         range_d(REAL(x), nrow, &min, &max, &na_count, &infnan_count);
544         if (min==0 && na_count<nrow) { min=3; max=4; } // column contains no finite numbers and is not-all NA; create dummies to yield positive min-2 later
545         isReal = true;
546       }
547       break;
548     case STRSXP :
549       // need2utf8 now happens inside range_str on the uniques
550       range_str(STRING_PTR(x), nrow, &min, &max, &na_count);
551       break;
552     default:
553       STOP(_("Column %d passed to [f]order is type '%s', not yet supported."), col+1, type2char(TYPEOF(x)));
554     }
555     TEND(3);
556     if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) {
557       // all same value; skip column as nothing to do;  [min,max] is just of finite values (excludes +Inf,-Inf,NaN and NA)
558       if (na_count==nrow && nalast==-1) { for (int i=0; i<nrow; i++) anso[i]=0; }
559       if (TYPEOF(x)==STRSXP) free_ustr();
560       continue;
561     }
562 
563     uint64_t range = max-min+1 +1/*NA*/ +isReal*3/*NaN, -Inf, +Inf*/;
564     // Rprintf(_("range=%"PRIu64"  min=%"PRIu64"  max=%"PRIu64"  na_count==%d\n"), range, min, max, na_count);
565 
566     int maxBit=0;
567     while (range) { maxBit++; range>>=1; }
568     int nbyte = 1+(maxBit-1)/8; // the number of bytes spanned by the value
569     int firstBits = maxBit - (nbyte-1)*8;  // how many bits used in most significant byte
570     if (spare==0) {
571       spare = 8-firstBits; // left align to byte boundary to get better first split.
572     } else {
573       if (sortType==0) {
574         // can't squash into previous spare as that will change first-appearance order within that radix-byte
575         //   found thanks to test 1246.55 with seed 1540402216L and added new direct test 1954
576         spare = 8-firstBits;
577         nradix++;
578       }
579       else if (spare >= firstBits) {
580         // easiest case. just left shift a few bits within these nbyte to fill the spare gap
581         spare -= firstBits;      // new spare is also how many bits to now left shift
582       }
583       else {
584         if (nbyte<8) {
585           spare += 8-firstBits;
586           nbyte++;               // after shift, will need an extra byte
587         } else {
588           // range takes 8 bytes so can't shift into 9th to use spare of last byte of previous column; start with a new left-aligned byte
589           nradix++;
590           spare = 8-firstBits;
591         }
592       }
593     }
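    // Worked illustration of the packing arithmetic above, with two assumed columns and sort= on:
    //   column 1: range=100 -> maxBit=7, nbyte=1, firstBits=7; spare was 0 so spare becomes 8-7=1
    //   column 2: range=10  -> maxBit=4, nbyte=1, firstBits=4; spare(1) < firstBits(4) and nbyte<8,
    //             so spare becomes 1+(8-4)=5 and nbyte becomes 2: its top bit lands in column 1's spare
    //             bit (tail of key byte 0) and its remaining 3 bits in the head of key byte 1.
    // Net effect: 11 of 16 key bits are used and 5 spare bits remain in byte 1 for the next column.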
594 
595     for (int b=0; b<nbyte; b++) {
596       if (key[nradix+b]==NULL) {
597         uint8_t *tt = calloc(nrow, sizeof(uint8_t));  // 0 initialize so that NA's can just skip (NA is always the 0 offset)
598         if (!tt)
599           STOP(_("Unable to allocate %"PRIu64" bytes of working memory"), (uint64_t)nrow*sizeof(uint8_t)); // # nocov
600         key[nradix+b] = tt;
601       }
602     }
603 
604     const bool asc = (sortType>=0);
605     uint64_t min2=min, max2=max;
606     if (nalast<1 && asc) {
607       min2--; // so that x-min results in 1 for the minimum; NA coded by 0. Always provide for the NA spot even if NAs aren't present for code brevity and robustness
608       if (isReal) min2--;  // NaN first
609     } else {
610       max2++;            // NA is max+1 so max2-elem should result in 0
611       if (isReal) max2++;  // NaN last
612     }
613     if (isReal) { min2--; max2++; }  // -Inf and +Inf  in min-1 and max+1 spots respectively
614 
615     const uint64_t naval = ((nalast==1) == asc) ? max+1+isReal*2 : min-1-isReal*2;
616     const uint64_t nanval = ((nalast==1) == asc) ? max+2 : min-2;  // only used when isReal
617     // Rprintf(_("asc=%d  min2=%"PRIu64"  max2=%"PRIu64"  naval==%"PRIu64"  nanval==%"PRIu64"\n"), asc, min2, max2, naval, nanval);
618 
619     // several columns could squash into 1 byte. This bit squashing across column boundaries is why we deal
620     // with asc|desc here; otherwise it could be done in the ugrp sorting by reversing the ugrp insert sort
621     #define WRITE_KEY                                   \
622     elem = asc ? elem-min2 : max2-elem;                 \
623     elem <<= spare;                                     \
624     for (int b=nbyte-1; b>0; b--) {                     \
625       key[nradix+b][i] = (uint8_t)(elem & 0xff);        \
626       elem >>= 8;                                       \
627     }                                                   \
628     key[nradix][i] |= (uint8_t)(elem & 0xff);
629     // this last |= squashes the most significant bits of this column into the spare of the previous
630     // NB: retaining group order does not retain the appearance-order in the sense that DT[,,by=] does. Because we go column-by-column, the first column values
631     //     are grouped together.   Whether we do byte-column-within-column doesn't alter this and neither does bit packing across column boundaries.
632     //     if input data is grouped-by-column (not grouped-by-row) then this grouping strategy may help a lot.
633     //     if input data is in random order, then a consequence of grouping-by-column (whether sorted or not) is that the output ordering will not have
634     //       row-group-appearance-order built in; the result will still need to be resorted by row number of the first group item.
635     //     this was the case with the previous forder as well, which went by whole-column
636     //     So, we may as well bit-pack as done above.
637     //     Retaining appearance-order within key-byte-column is still advantageous for when we do encounter column-grouped data (to avoid data movement); and
638     //       the ordering will be appearance-order preserving in that case (just at the byte level).
639     // TODO: in future we could provide an option to return 'any' group order for efficiency, which would be the byte-appearance order. But for now,
640     //     the forder afterwards on o__[f__] (in [.data.table) is not significant.
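    // Illustration of the first line of WRITE_KEY with assumed numbers: twiddled range [1005,1009],
    // nalast=0 (NA first), integer column (isReal=false).
    //   ascending : min2=1004 (min-1 reserves the NA slot), so 1005->1 ... 1009->5 and naval=min-1=1004->0,
    //               i.e. NAs stay in the zero slot that the calloc'd key bytes already hold.
    //   descending: max2=1010, elem=max2-elem so 1009->1 ... 1005->5 and naval=max+1=1010->0, NA first again.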
641 
642     switch(TYPEOF(x)) {
643     case INTSXP : case LGLSXP : {
644       int32_t *xd = INTEGER(x);
645       #pragma omp parallel for num_threads(getDTthreads(nrow, true))
646       for (int i=0; i<nrow; i++) {
647         uint64_t elem=0;
648         if (xd[i]==NA_INTEGER) {  // TODO: go branchless if na_count==0
649           if (nalast==-1) anso[i]=0;
650           elem = naval;
651         } else {
652           elem = xd[i] ^ 0x80000000u;
653         }
654         WRITE_KEY
655       }}
656       break;
657     case REALSXP :
658       if (inherits(x, "integer64")) {
659         int64_t *xd = (int64_t *)REAL(x);
660         #pragma omp parallel for num_threads(getDTthreads(nrow, true))
661         for (int i=0; i<nrow; i++) {
662           uint64_t elem=0;
663           if (xd[i]==INT64_MIN) {
664             if (nalast==-1) anso[i]=0;
665             elem = naval;
666           } else {
667             elem = xd[i] ^ 0x8000000000000000u;
668           }
669           WRITE_KEY
670         }
671       } else {
672         double *xd = REAL(x);     // TODO: revisit double compression (skip bytes/mult by 10,100 etc) as currently it's often 6-8 bytes even for 3.14,3.15
673         #pragma omp parallel for num_threads(getDTthreads(nrow, true))
674         for (int i=0; i<nrow; i++) {
675           uint64_t elem=0;
676           if (!R_FINITE(xd[i])) {
677             if (isinf(xd[i])) elem = signbit(xd[i]) ? min-1 : max+1;
678             else {
679               if (nalast==-1) anso[i]=0;  // for both NA and NaN
680               elem = ISNA(xd[i]) ? naval : nanval;
681             }
682           } else {
683             elem = dtwiddle(xd[i]);  // TODO: could avoid twiddle() if all positive finite which could be known from range_d.
684                                      //       also R_FINITE is repeated within dtwiddle() currently, wastefully given the if() above
685           }
686           WRITE_KEY
687         }
688       }
689       break;
690     case STRSXP : {
691       SEXP *xd = STRING_PTR(x);
692       #pragma omp parallel for num_threads(getDTthreads(nrow, true))
693       for (int i=0; i<nrow; i++) {
694         uint64_t elem=0;
695         if (xd[i]==NA_STRING) {
696           if (nalast==-1) anso[i]=0;
697           elem = naval;
698         } else {
699           elem = -TRUELENGTH(xd[i]);
700         }
701         WRITE_KEY
702       }}
703       free_ustr();  // ustr could be left allocated and reused, but free now in case large and we're tight on ram
704       break;
705     default:
706        STOP(_("Internal error: column not supported, not caught earlier"));  // # nocov
707     }
708     nradix += nbyte-1+(spare==0);
709     TEND(4)
710     // Rprintf(_("Written key for column %d\n"), col);
711   }
712   if (key[nradix]!=NULL) nradix++;  // nradix now number of bytes in key
713   #ifdef TIMING_ON
714   Rprintf(_("nradix=%d\n"), nradix);
715   #endif
716 
717   nth = getDTthreads(nrow, true);  // this nth is relied on in cleanup()
718   TMP =  (int *)malloc(nth*UINT16_MAX*sizeof(int)); // used by counting sort (my_n<=65536) in radix_r()
719   UGRP = (uint8_t *)malloc(nth*256);                // TODO: align TMP and UGRP to cache lines (and do the same for stack allocations too)
720   if (!TMP || !UGRP /*|| TMP%64 || UGRP%64*/) STOP(_("Failed to allocate TMP or UGRP or they weren't cache line aligned: nth=%d"), nth);
721   if (retgrp) {
722     gs_thread = calloc(nth, sizeof(int *));     // thread private group size buffers
723     gs_thread_alloc = calloc(nth, sizeof(int));
724     gs_thread_n = calloc(nth, sizeof(int));
725     if (!gs_thread || !gs_thread_alloc || !gs_thread_n) STOP(_("Could not allocate (very tiny) group size thread buffers"));
726   }
727   if (nradix) {
728     radix_r(0, nrow-1, 0);  // top level recursive call: (from, to, radix)
729   } else {
730     push(&nrow, 1);
731   }
732 
733   TEND(30)
734 
735   if (anso[0]==1 && anso[nrow-1]==nrow && (nrow<3 || anso[nrow/2]==nrow/2+1)) {
736     // There used to be all_skipped shared bool. But even though it was safe to update this bool to false naked (without atomic protection) :
737     // i) there were a lot of updates from deeply iterated insert, so there were a lot of writes to it and that bool likely sat on a shared cache line
738     // ii) there were a lot of places in the code which needed to remember to set all_skipped properly. It's simpler code just to test now almost instantly.
739     // Alternatively, we could try and avoid creating anso[] until it's needed, but that has similar complexity issues as (ii)
740     // Note that if nalast==-1 (remove NA) anso will contain 0's for the NAs and will be considered not-sorted.
741     bool stop = false;
742     #pragma omp parallel for num_threads(getDTthreads(nrow, true))
743     for (int i=0; i<nrow; i++) {
744       if (stop) continue;
745       if (anso[i]!=i+1) stop=true;
746     }
747     if (!stop) {
748       // data is already grouped or sorted, integer() returned with group sizes attached
749       ans = PROTECT(allocVector(INTSXP, 0));  // can't attach attributes to NULL, hence an empty integer()
750       n_protect++;
751     }
752   }
753   TEND(31)
754 
755   if (retgrp) {
756     SEXP tt;
757     int final_gs_n = (gs_n==0) ? gs_thread_n[0] : gs_n;   // TODO: find a neater way to do this
758     int *final_gs  = (gs_n==0) ? gs_thread[0] : gs;
759     setAttrib(ans, sym_starts, tt = allocVector(INTSXP, final_gs_n));
760     int *ss = INTEGER(tt);
761     int maxgrpn = 0;
762     for (int i=0, tmp=1; i<final_gs_n; i++) {
763       int elem = final_gs[i];
764       if (elem>maxgrpn) maxgrpn=elem;
765       ss[i]=tmp;
766       tmp+=elem;
767     }
768     setAttrib(ans, sym_maxgrpn, ScalarInteger(maxgrpn));
769   }
770 
771   cleanup();
772   UNPROTECT(n_protect);
773   TEND(32);
774   #ifdef TIMING_ON
775   {
776     // first sum across threads
777     for (int i=0; i<NBLOCK; i++) {
778       for (int j=1; j<MAX_NTH; j++) {
779         tblock[i] += tblock[j*NBLOCK + i];
780         nblock[i] += nblock[j*NBLOCK + i];
781       }
782     }
783     int last=NBLOCK-1;
784     while (last>=0 && nblock[last]==0) last--; // remove unused timing slots
785     for (int i=0; i<=last; i++) {
786       Rprintf(_("Timing block %2d%s = %8.3f   %8d\n"), i, (i>=17&&i<=19)?"(*)":"   ", tblock[i], nblock[i]);
787     }
788     for (int i=0; i<=256; i++) {
789       if (stat[i]) Rprintf(_("stat[%03d]==%20"PRIu64"\n"), i, (uint64_t)stat[i]);
790     }
791   }
792   #endif
793   return ans;
794 }
795 
796 static bool sort_ugrp(uint8_t *x, const int n)
797 // x contains n unique bytes; sort them in-place using insert sort
798 // always ascending. desc and nalast are done in WRITE_KEY because columns may cross byte boundaries
799 // maximum value for n is 256 which is one too big for uint8_t, hence int n
800 {
801   bool skip = true;            // was x already sorted? if so, the caller can skip reordering
802   for (int i=1; i<n; i++) {
803     uint8_t tmp = x[i];
804     if (tmp>x[i-1]) continue;  // x[i-1]==x[i] doesn't happen because x is unique
805     skip = false;
806     int j = i-1;
807     do {
808       x[j+1] = x[j];
809     } while (--j>=0 && tmp<x[j]);
810     x[j+1] = tmp;
811   }
812   return skip;
813 }
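// Illustrative behaviour (assumed values): x={4,1,3} is sorted in place to {1,3,4} and false is returned
// (the caller must reorder); x={1,3,4} is left untouched and true is returned so the caller can take the
// skip fast path. n<=256 unique bytes, so a simple insert sort is sufficient here.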
814 
815 void radix_r(const int from, const int to, const int radix) {
816   TBEG();
817   const int my_n = to-from+1;
818   if (my_n==1) {  // minor TODO: batch up the 1's instead in caller (and that's only needed when retgrp anyway)
819     push(&my_n, 1);
820     TEND(5);
821     return;
822   }
823   else if (my_n<=256) {
824     // if nth==1
825     // Rprintf(_("insert clause: radix=%d, my_n=%d, from=%d, to=%d\n"), radix, my_n, from, to);
826     // insert sort with some twists:
827     // i) detects if grouped; if sortType==0 can then skip
828     // ii) keeps group appearance order at byte level to minimize movement
829     #ifdef TIMING_ON
830       // #pragma omp atomic   // turn on manually as the atomic affects timings
831       // stat[my_n]++;
832     #endif
833 
834     uint8_t *restrict my_key = key[radix]+from;  // safe to write as we don't use this radix again
835     uint8_t o[my_n];
836     // if last key (i.e. radix+1==nradix) there are no more keys to reorder so we could reorder osub by reference directly and save allocating and populating o just
837     // to use it once. However, o's type is uint8_t so many moves within this max-256 vector should be faster than many moves in osub (4 byte or 8 byte ints) [1 byte
838     // type is always aligned]
839     bool skip = true;
840     if (sortType!=0) {
841       // always ascending as desc (sortType==-1) was dealt with in WRITE_KEY
842       int start = 1;
843       while (start<my_n && my_key[start]>=my_key[start-1]) start++;
844       if (start<my_n) {
845         skip = false;  // finding start is really just to take skip out of the loop below
846         for (int i=0; i<start; i++) o[i]=i;  // always at least sets o[0]=0
847         for (int i=start; i<my_n; i++) {
848           uint8_t ktmp = my_key[i];
849           int j=i-1;
850           while (j>=0 && ktmp<my_key[j]) {
851             my_key[j+1] = my_key[j];
852             o[j+1] = o[j];
853             j--;
854           }
855           my_key[j+1] = ktmp;  // redundant write when while() did nothing, but that's unlikely given the common case of pre-ordered is handled by skip==true
856           o[j+1] = i;          // important to initialize o[] even when while() did nothing.
857         }
858       }
859       TEND(6)
860     } else {
861       // sortType==0; retain group appearance order to hopefully benefit from skip, but there is a still a sort afterwards to get back to appearance order
862       // because columns are split into multiple bytes: retaining the byte appearance order within key-byte is not the same as retaining the order of
863       // unique values within each by= column.  In other words, retaining group order at byte level here does not affect correctness, just efficiency.
864       int second = 1;
865       while (second<my_n && my_key[second]==my_key[0]) second++;  // look for the second different byte value
866       int third = second+1;
867       while (third<my_n && my_key[third]==my_key[second]) third++;  // look for the last of the second value (which might be a repeat of the first)
868       if (third<my_n) {
869         // it's not either i) all one value (xxxxx), or ii) two values (xxyyy). It could be xxyyyx.. or xxyyyz..  It's likely worth allocating seen[] now.
870         bool seen[256]={false};
871         seen[my_key[0]] = seen[my_key[second]] = true;  // first two different bytes have been seen
872         for(; third<my_n; third++) {
873           uint8_t ktmp = my_key[third];
874           if (ktmp==my_key[third-1]) continue;
875           if (!seen[ktmp]) { seen[ktmp]=true; continue; }
876           // else different byte but we've seen it before, so this my_n is not grouped (e.g. xxyyyx)
877           skip = false;
878           break;
879         }
880         if (!skip) {
881           // and now it's worth populating o[]
882           for (int i=0; i<third; i++) o[i] = i;
883           for (int i=third; i<my_n; i++) {
884             uint8_t ktmp = my_key[i];
885             if (!seen[ktmp]) { seen[ktmp]=true; o[i]=i; continue; }
886             // move this byte that we've seen before back to just after the last one to group them
887             int j = i-1;
888             while (j>=0 && ktmp!=my_key[j]) {
889               my_key[j+1] = my_key[j];
890               o[j+1] = o[j];
891               j--;
892             }
893             my_key[j+1] = ktmp;   // redundant write if my_key[i]==my_key[i-1] and while() did nothing (but that's ok as that's at least some writing since skip==false)
894             o[j+1] = i;           // important to initialize o[] even if while() did nothing
895           }
896         }
897       }
898       TEND(7)
899     }
900     if (!skip) {
901       // reorder osub and each remaining ksub
902       int TMP[my_n];  // on stack fine since my_n is very small (<=256)
903       const int *restrict osub = anso+from;
904       for (int i=0; i<my_n; i++) TMP[i] = osub[o[i]];
905       memcpy((int *restrict)(anso+from), TMP, my_n*sizeof(int));
906       for (int r=radix+1; r<nradix; r++) {
907         const uint8_t *restrict ksub = key[r]+from;
908         for (int i=0; i<my_n; i++) ((uint8_t *)TMP)[i] = ksub[o[i]];
909         memcpy((uint8_t *restrict)(key[r]+from), (uint8_t *)TMP, my_n);
910       }
911       TEND(8)
912     }
913     // my_key is now grouped (and sorted by group too if sort!=0)
914     // all we have left to do is find the group sizes and either recurse or push
915     if (radix+1==nradix && !retgrp) {
916       return;
917     }
918     int ngrp=0, my_gs[my_n];  //minor TODO: could know number of groups with certainty up above
919     my_gs[ngrp]=1;
920     for (int i=1; i<my_n; i++) {
921       if (my_key[i]!=my_key[i-1]) my_gs[++ngrp] = 1;
922       else my_gs[ngrp]++;
923     }
924     ngrp++;
925     TEND(9)
926     if (radix+1==nradix || ngrp==my_n) {  // ngrp==my_n => unique groups all size 1 and we can stop recursing now
927       push(my_gs, ngrp);
928     } else {
929       for (int i=0, f=from; i<ngrp; i++) {
930         radix_r(f, f+my_gs[i]-1, radix+1);
931         f+=my_gs[i];
932       }
933     }
934     return;
935   }
936   else if (my_n<=UINT16_MAX) {    // UINT16_MAX==65535 (important not 65536)
937     // if (nth==1) Rprintf(_("counting clause: radix=%d, my_n=%d\n"), radix, my_n);
938     uint16_t my_counts[256] = {0};  // Needs to be all-0 on entry. This ={0} initialization should be fast as it's on stack. Otherwise, we have to manage
939                                     // a stack of counts anyway since this is called recursively and these counts are needed to make the recursive calls.
940                                     // This thread-private stack alloc has no chance of false sharing and gives omp and compiler best chance.
941     uint8_t *restrict my_ugrp = UGRP + omp_get_thread_num()*256;  // uninitialized is fine; will use the first ngrp items. Only used if sortType==0
942     // TODO: ensure my_counts, my_grp and my_tmp below are cache line aligned on both Linux and Windows.
943     const uint8_t *restrict my_key = key[radix]+from;
944     int ngrp = 0;          // number of groups (items in ugrp[]). Max value 256 but could be uint8_t later perhaps if 0 is understood as 1.
945     bool skip = true;      // i) if already _grouped_ and sortType==0 then caller can skip, ii) if already _grouped and sorted__ when sort!=0 then caller can skip too
946     TEND(10)
947     if (sortType!=0) {
948       // Always ascending sort here (even when sortType==-1) because descending was prepared in write_key
949       for (int i=0; i<my_n; ++i) my_counts[my_key[i]]++;  // minimal branch-free loop first, #3647
950       for (int i=1; i<my_n; ++i) {
951         if (my_key[i]<my_key[i-1]) { skip=false; break; }
952         // stop early as soon as not-ordered is detected; likely quickly when it isn't sorted
953         // otherwise, it's worth checking if it is ordered because skip saves time later
954       }
955       TEND(11)
956     } else {
957       for (int i=0; i<my_n; i++) {
958         uint8_t elem = my_key[i];
959         if (++my_counts[elem]==1) {
960           // first time seen this value.  i==0 always does this branch
961           my_ugrp[ngrp++]=elem;
962         } else if (skip && elem!=my_key[i-1]) {   // does not happen for i==0
963           // seen this value before and it isn't the previous value, so data is not grouped
964           // including "skip &&" first is to avoid the != comparison
965           skip=false;
966         }
967       }
968       TEND(12)
969     }
970     if (!skip) {
971       // reorder anso and remaining radix keys
972 
973       // avoid allocating and populating order vector (my_n long); we just use counts several times to push rather than pull
974       // with contiguous-read from osub and ksub, 256 live write cache-lines is the worst case. However, often there are many fewer ugrp and only that number of
975       // write cache lines will be active. These write-cache lines will be constrained within the UINT16_MAX width, so should be close by in cache, too.
976       // If there is a good degree of grouping, then contiguous read/write in both directions happens automatically in this approach.
977 
978       // cumulate; for forwards-assign to give cpu prefetch best chance (cpu may not support prefetch backwards).
979       uint16_t my_starts[256], my_starts_copy[256];
980       // TODO: could be allocated up front (like my_TMP below), or are they better on stack like this? TODO: allocating up front would also allow cache-aligning them.
981       if (sortType!=0) {
982         for (int i=0, sum=0; i<256; i++) { int tmp=my_counts[i]; my_starts[i]=my_starts_copy[i]=sum; sum+=tmp; ngrp+=(tmp>0);}  // cumulate through 0's too (won't be used)
983       } else {
984         for (int i=0, sum=0; i<ngrp; i++) { uint8_t w=my_ugrp[i]; int tmp=my_counts[w]; my_starts[w]=my_starts_copy[w]=sum; sum+=tmp; }  // cumulate in ugrp appearance order
985       }
986 
987       int *restrict my_TMP = TMP + omp_get_thread_num()*UINT16_MAX; // Allocated up front to save malloc calls which i) block internally and ii) could fail
988       if (radix==0 && nalast!=-1) {
989         // anso contains 1:n so skip reading and copying it. Only happens when nrow<65535. Saving worth the branch (untested) when user repeatedly calls a small-n small-cardinality order.
990         for (int i=0; i<my_n; i++) anso[my_starts[my_key[i]]++] = i+1;  // +1 as R is 1-based.
991         // The loop counter could be uint_fast16_t since max i here will be UINT16_MAX-1 (65534), hence ++ after last iteration won't overflow 16bits. However, have chosen signed
992         // integer for counters for now, as signed probably very slightly faster than unsigned on most platforms from what I can gather.
993       } else {
994         const int *restrict osub = anso+from;
995         for (int i=0; i<my_n; i++) my_TMP[my_starts[my_key[i]]++] = osub[i];
996         memcpy(anso+from, my_TMP, my_n*sizeof(int));
997       }
998       TEND(13)
999 
1000       // reorder remaining key columns (radix+1 onwards). This could be done in one step too (a single pass through x[] with a larger TMP);
1001       //    that's how it's done in the batched approach below. Which is better? The way here is multiple (but contiguous) passes through (one-byte) my_key
1002       if (radix+1<nradix) {
1003         for (int r=radix+1; r<nradix; r++) {
1004           memcpy(my_starts, my_starts_copy, 256*sizeof(uint16_t));  // restore starting offsets
1005           //for (int i=0,last=0; i<256; i++) { int tmp=my_counts[i]; if (tmp==0) continue; my_counts[i]=last; last=tmp; }  // rewind ++'s to offsets
1006           const uint8_t *restrict ksub = key[r]+from;
1007           for (int i=0; i<my_n; i++) ((uint8_t *)my_TMP)[my_starts[my_key[i]]++] = ksub[i];
1008           memcpy(key[r]+from, my_TMP, my_n);
1009         }
1010         TEND(14)
1011       }
1012     }
1013 
1014     if (!retgrp && radix+1==nradix) {
1015       return;  // we're done. avoid allocating and populating very last group sizes for last key
1016     }
1017     int my_gs[ngrp==0 ? 256 : ngrp];  // ngrp==0 when sort and skip==true; we didn't count the non-zeros in my_counts yet in that case
1018     if (sortType!=0) {
1019       ngrp=0;
1020       for (int i=0; i<256; i++) if (my_counts[i]) my_gs[ngrp++]=my_counts[i];  // this casts from uint16_t to int32, too
1021     } else {
1022       for (int i=0; i<ngrp; i++) my_gs[i]=my_counts[my_ugrp[i]];
1023     }
1024     TEND(15)
1025     if (radix+1==nradix) {
1026       // aside: cannot be all size 1 (a saving used in my_n<=256 case above) because my_n>256 and ngrp<=256
1027       push(my_gs, ngrp);
1028     } else {
1029       // this single thread will now descend and resolve all groups, now that the groups are close in cache
1030       for (int i=0, my_from=from; i<ngrp; i++) {
1031         radix_r(my_from, my_from+my_gs[i]-1, radix+1);
1032         my_from+=my_gs[i];
1033       }
1034     }
1035     return;
1036   }
1037   // else parallel batches. This is called recursively but only once or maybe twice before resolving to UINT16_MAX branch above
1038 
1039   int batchSize = MIN(UINT16_MAX, 1+my_n/getDTthreads(my_n, true));  // (my_n-1)/nBatch + 1;   //UINT16_MAX == 65535
1040   int nBatch = (my_n-1)/batchSize + 1;   // TODO: make nBatch a multiple of nThreads?
1041   int lastBatchSize = my_n - (nBatch-1)*batchSize;
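  // Worked sizing example with assumed numbers: my_n=200000 and 4 threads gives
  //   batchSize     = MIN(65535, 1+200000/4) = 50001
  //   nBatch        = (200000-1)/50001 + 1   = 4
  //   lastBatchSize = 200000 - 3*50001       = 49997
  // Batches are capped at 65535 rows so each batch's per-byte counts fit in the uint16_t counts[] below.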
  uint16_t *counts = calloc(nBatch*256,sizeof(uint16_t));
  uint8_t  *ugrps =  malloc(nBatch*256*sizeof(uint8_t));
  int      *ngrps =  calloc(nBatch    ,sizeof(int));
  if (!counts || !ugrps || !ngrps) STOP(_("Failed to allocate parallel counts. my_n=%d, nBatch=%d"), my_n, nBatch);

  bool skip=true;
  const int n_rem = nradix-radix-1;   // how many radix are remaining after this one
  TEND(16)
  #pragma omp parallel num_threads(getDTthreads(nBatch, false))
  {
    int     *my_otmp = malloc(batchSize * sizeof(int)); // thread-private write
    uint8_t *my_ktmp = malloc(batchSize * sizeof(uint8_t) * n_rem);
    // TODO: move these up above and point restrict[me] to them. Easier to error that way if the alloc fails.
    #pragma omp for
    for (int batch=0; batch<nBatch; batch++) {
      const int my_n = (batch==nBatch-1) ? lastBatchSize : batchSize;  // lastBatchSize == batchSize when my_n is a multiple of batchSize
      const int my_from = from + batch*batchSize;
      uint16_t *restrict      my_counts = counts + batch*256;
      uint8_t  *restrict      my_ugrp   = ugrps  + batch*256;
      int                     my_ngrp   = 0;
      bool                    my_skip   = true;
      const uint8_t *restrict my_key    = key[radix] + my_from;
      const uint8_t *restrict byte = my_key;
      for (int i=0; i<my_n; i++, byte++) {
        if (++my_counts[*byte]==1) {   // always true first time when i==0
          my_ugrp[my_ngrp++] = *byte;
        } else if (my_skip && byte[0]!=byte[-1]) {   // include 'my_skip &&' to save the != comparison once it's realized this batch is not grouped
          my_skip=false;
        }
      }
      ngrps[batch] = my_ngrp;  // write once to this shared cache line
      if (!my_skip) {
        skip = false;          // naked write to this shared byte is ok because false is the only value written
        // gather this batch's anso and remaining keys. If we're sorting too, ugrp is sorted later for that. Here we want to benefit from skip within the batch
        // as much as possible, which is a good chance since batchSize is relatively small (65535)
        for (int i=0, sum=0; i<my_ngrp; i++) { int tmp = my_counts[my_ugrp[i]]; my_counts[my_ugrp[i]]=sum; sum+=tmp; } // cumulate counts of this batch
        const int *restrict osub = anso+my_from;
        byte = my_key;
        for (int i=0; i<my_n; i++, byte++) {
          int dest = my_counts[*byte]++;
          my_otmp[dest] = *osub++;  // wastefully copies out 1:n when radix==0, but don't optimize as it's unlikely worth the code complexity; my_otmp is not large, for example. Use the first TEND() to decide.
          for (int r=0; r<n_rem; r++) my_ktmp[r*my_n + dest] = key[radix+1+r][my_from+i];   // reorder remaining keys
        }
        // or could do multiple passes through my_key like in the my_n<=65535 approach above. Test which is better depending on whether TEND() points here.

        // we haven't completed all batches yet, so we don't know where these groups should be placed in the final result.
        // So for now we write the thread-private small now-grouped buffers back in-place. The counts and groups across all batches will be used below to move these blocks.
        memcpy(anso+my_from, my_otmp, my_n*sizeof(int));
        for (int r=0; r<n_rem; r++) memcpy(key[radix+1+r]+my_from, my_ktmp+r*my_n, my_n*sizeof(uint8_t));

        // revert cumulate back to counts ready for vertical cumulate
        for (int i=0, last=0; i<my_ngrp; i++) { int tmp = my_counts[my_ugrp[i]]; my_counts[my_ugrp[i]]-=last; last=tmp; }
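        // Worked example of the cumulate/scatter/revert above (illustration only): suppose this batch saw
        // my_ugrp={7,3,9} with my_counts 7->5, 3->3, 9->2 (my_n=10).
        //   cumulate : counts become offsets {0,5,8} in my_ugrp appearance order
        //   scatter  : each ++ advances the offset, so counts end as {5,8,10}
        //   revert   : subtracting the previous running end restores {5,3,2}, ready for the vertical cumulate below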
      }
    }
    free(my_otmp);
    free(my_ktmp);
  }
  TEND(17 + notFirst*3)  // 3 timings in this section: 17,18,19 first main split; 20,21,22 thereon

  // If the my_n input is grouped and ugrp is sorted too (to illustrate), the status now would be :
  // counts:                 ugrps:   ngrps:
  // 1: 20 18  2  0  0  0    0 1 2    3
  // 2:  0  0 17 21  5  0    2 3 4    3
  // 3:  0  0  0  0 15 19    4 5      2
  // If the keys within each and every batch were grouped, skip will be true.
  // Now we test whether the groups occurred in order across batches (like the illustration above), and if not, set skip=false.

  uint8_t ugrp[256];  // head(ugrp,ngrp) will contain the unique values in appearance order
  bool    seen[256];  // is the value present in ugrp already
  int     ngrp=0;     // max value 256 so not uint8_t
  uint8_t last_seen=0;  // the last grp seen in the previous batch.  initialized 0 is not used
  for (int i=0; i<256; i++) seen[i]=false;
  for (int batch=0; batch<nBatch; batch++) {
    const uint8_t *restrict my_ugrp = ugrps + batch*256;
    if (ngrp==256 && !skip) break;  // no need to carry on
    for (int i=0; i<ngrps[batch]; i++) {
      if (!seen[my_ugrp[i]]) {
        seen[my_ugrp[i]] = true;
        ugrp[ngrp++] = last_seen = my_ugrp[i];
      } else if (skip && my_ugrp[i]!=last_seen) {   // ==last_seen would occur across batch boundaries, like 2=>17 and 5=>15 in the illustration above
        skip=false;
      }
    }
  }
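  // Worked example (illustration only): if batch 1 saw bytes {0,1,2} and batch 2 begins with byte 2 before
  // moving on to {3,4}, the leading 2 equals last_seen so skip stays true (the group merely spans the batch
  // boundary). If instead batch 2 revisited byte 1, that byte is already in seen[] but differs from
  // last_seen, so skip is set false and the gather below must move the blocks.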

  // If skip==true (my_n was pre-grouped) and
  // i) sortType==0 (not sorting groups) then osub and ksub don't need reordering. ugrp may happen to be sorted too but there is nothing special to do in that case.
  // ii) sortType==1|-1 and ugrp is already sorted then osub and ksub don't need reordering either.

  if (sortType!=0 && !sort_ugrp(ugrp, ngrp))
    skip=false;

  // now cumulate the counts vertically to see where the blocks in the batches should be placed in the result across all batches
  // the counts are uint16_t but the cumulate needs to be int32_t (or int64_t in future) to hold the offsets
  // even if skip==true and we're otherwise done, we still need the first row of this cumulate (diff'd to get total group sizes) to push() or recurse below

  int *starts = calloc(nBatch*256, sizeof(int));  // keep starts the same shape and ugrp order as counts
  if (!starts) STOP(_("Failed to allocate starts. my_n=%d, nBatch=%d"), my_n, nBatch);  // check added for consistency with the counts/ugrps/ngrps allocations above
  for (int j=0, sum=0; j<ngrp; j++) {  // iterate through columns (ngrp bytes)
    uint16_t *tmp1 = counts+ugrp[j];
    int      *tmp2 = starts+ugrp[j];
    for (int batch=0; batch<nBatch; batch++) {
      *tmp2 = sum;
      tmp2 += 256;
      sum += *tmp1;
      tmp1 += 256;
    }
  }
  // the first row (when diff'd) now contains the size of each group across all batches
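  // Worked example continuing the illustration above (my_n=117, ugrp={0,1,2,3,4,5}): one running sum walks
  // down each byte's column across the 3 batches, so starts becomes
  //   1:  0 20 38 57 78 98
  //   2: 20 38 40 57 78 98
  //   3: 20 38 57 78 83 98
  // i.e. the offset within this radix's output where each batch's block for that byte must land.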

  TEND(18 + notFirst*3)
  if (!skip) {
    int *TMP = malloc(my_n * sizeof(int));
    if (!TMP) STOP(_("Unable to allocate TMP for my_n=%d items in parallel batch counting"), my_n);
    #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
    for (int batch=0; batch<nBatch; batch++) {
      const int *restrict      my_starts = starts + batch*256;
      const uint16_t *restrict my_counts = counts + batch*256;
      const int *restrict      osub = anso + from + batch*batchSize;  // the groups sit here contiguously
      const uint8_t *restrict  byte = ugrps + batch*256;              // always logged here in ugrps in appearance order
      const int                my_ngrp = ngrps[batch];
      for (int i=0; i<my_ngrp; i++, byte++) {
        const uint16_t len = my_counts[*byte];
        memcpy(TMP+my_starts[*byte], osub, len*sizeof(int));
        osub += len;
      }
    }
    memcpy(anso+from, TMP, my_n*sizeof(int));

    for (int r=0; r<n_rem; r++) {    // TODO: do this in groups of sizeof(anso) (4-byte int currently; 8 in future) to save team startup cost (but unlikely significant anyway)
      #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
      for (int batch=0; batch<nBatch; batch++) {
        const int *restrict      my_starts = starts + batch*256;
        const uint16_t *restrict my_counts = counts + batch*256;
        const uint8_t *restrict  ksub = key[radix+1+r] + from + batch*batchSize;  // the groups sit here contiguously
        const uint8_t *restrict  byte = ugrps + batch*256;                        // always logged here in ugrps in appearance order
        const int                my_ngrp = ngrps[batch];
        for (int i=0; i<my_ngrp; i++, byte++) {
          const uint16_t len = my_counts[*byte];
          memcpy((uint8_t *)TMP + my_starts[*byte], ksub, len);
          ksub += len;
        }
      }
      memcpy(key[radix+1+r]+from, (uint8_t *)TMP, my_n);
    }
    free(TMP);
  }
  TEND(19 + notFirst*3)
  notFirst = true;

  int my_gs[ngrp];
  for (int i=1; i<ngrp; i++) my_gs[i-1] = starts[ugrp[i]] - starts[ugrp[i-1]];   // use the first row of starts to get the totals
  my_gs[ngrp-1] = my_n - starts[ugrp[ngrp-1]];
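  // Continuing the illustration above: diff'ing the first row of starts {0,20,38,57,78,98} gives
  // my_gs = {20,18,19,21,20} and the final group is my_n - 98 = 117 - 98 = 19; i.e. each byte's total
  // group size across all batches.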

  if (radix+1==nradix) {
    // aside: ngrp==my_n (all size-1 groups) can't be used as a short-circuit here like in the my_n<=256 case above, because my_n>65535 while ngrp<=256
    push(my_gs, ngrp);
    TEND(23)
  }
  else {
    // TODO: explicitly repeat the parallel batch for any skew bins
    bool anyBig = false;
    for (int i=0; i<ngrp; i++) if (my_gs[i]>UINT16_MAX) { anyBig=true; break; }
    if (anyBig) {
      // Likely that they're all big. If so, each will go one-by-one and each will be parallel (case my_n>65535).
      // If only one or a few are big, then we have skew. The not-big ones will be <65535 and will be very
      // fast to run anyway relative to the big one(s), and the big one(s) will go in parallel. Hence skew is
      // dealt with.
      // We could try and separate the not-big ones and do those in parallel first. However, it's critical
      // that the groups are flushed in order, otherwise the final group-size result won't be correct (i.e. it
      // won't be aligned with anso). This consideration is independent of maintaining group appearance order.
      // When we have skew, by design it's a special type of limited skew in that the small ones are fixed
      // to be small at <65535 (and hence fast anyway). In contrast, when they are all 'big' (>65535) but there
      // is skew in that one or a few are significantly bigger than the others, then they'll all go one-by-one
      // here and each will be dealt with in parallel internally. There is no nestedness here.
      for (int i=0; i<ngrp; i++) {
        int start = from + starts[ugrp[i]];
        radix_r(start, start+my_gs[i]-1, radix+1);
        flush();
      }
      TEND(24)
    } else {
      // all groups are <=65535 and radix_r() will handle each one single-threaded. Therefore, this time
      // it does make sense to start a parallel team, and there will be no nestedness here either.
      if (retgrp) {
        #pragma omp parallel for ordered schedule(dynamic) num_threads(getDTthreads(ngrp, false))
        for (int i=0; i<ngrp; i++) {
          int start = from + starts[ugrp[i]];
          radix_r(start, start+my_gs[i]-1, radix+1);
          #pragma omp ordered
          flush();
        }
      } else {
        // flush() is only relevant when retgrp==true so save the redundant ordered clause
        #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(ngrp, false))
        for (int i=0; i<ngrp; i++) {
          int start = from + starts[ugrp[i]];
          radix_r(start, start+my_gs[i]-1, radix+1);
        }
      }
      TEND(25)
    }
  }
  free(counts);
  free(starts);
  free(ugrps);
  free(ngrps);
  TEND(26)
}


SEXP issorted(SEXP x, SEXP by)
{
  // Just checks if ordered and returns FALSE early if not. Does not return the ordering if so, unlike forder.
  // Always increasing order with NAs first.
  // Similar to base::is.unsorted but accepts NA at the beginning (standard in data.table and considered sorted) rather than
  // returning NA when NA is present, and is multi-column.
  // TODO: test in big steps first to return faster if the unsortedness is at the end (a common case of rbind'ing data to the end)
  // These are all sequential accesses to x, so quick and cache efficient. Could be parallel by checking continuity at batch boundaries.

  if (!isNull(by) && !isInteger(by)) STOP(_("Internal error: issorted 'by' must be NULL or integer vector"));
  if (isVectorAtomic(x) || length(by)==1) {
    // the one-column special case is very common, so specialize it by avoiding column-type switches inside the row-loop later
    if (length(by)==1) {
      if (INTEGER(by)[0]<1 || INTEGER(by)[0]>length(x)) STOP(_("issorted 'by' [%d] out of range [1,%d]"), INTEGER(by)[0], length(x));
      x = VECTOR_ELT(x, INTEGER(by)[0]-1);
    }
    const int n = length(x);
    if (n <= 1) return(ScalarLogical(TRUE));
    if (!isVectorAtomic(x)) STOP(_("is.sorted does not work on list columns"));
    int i=1;
    switch(TYPEOF(x)) {
    case INTSXP : case LGLSXP : {
      int *xd = INTEGER(x);
      while (i<n && xd[i]>=xd[i-1]) i++;
    } break;
    case REALSXP :
      if (inherits(x,"integer64")) {
        int64_t *xd = (int64_t *)REAL(x);
        while (i<n && xd[i]>=xd[i-1]) i++;
      } else {
        double *xd = REAL(x);
        while (i<n && dtwiddle(xd[i])>=dtwiddle(xd[i-1])) i++;  // TODO: change to loop over any NA or -Inf at the beginning and then proceed without dtwiddle() (but rounding)
      }
      break;
    case STRSXP : {
      SEXP *xd = STRING_PTR(x);
      i = 0;
      while (i<n && xd[i]==NA_STRING) i++;
      if (i==n) return ScalarLogical(TRUE);  // all NA is considered sorted; also guards the NEED2UTF8(xd[i]) read below
      bool need = NEED2UTF8(xd[i]);
      i++; // pass over the first non-NA_STRING
      while (i<n) {
        if (xd[i]==xd[i-1]) {i++; continue;}
        if (xd[i]==NA_STRING) break;
        if (!need) need = NEED2UTF8(xd[i]);
        if ((need ? strcmp(CHAR(ENC2UTF8(xd[i])), CHAR(ENC2UTF8(xd[i-1]))) :
                    strcmp(CHAR(xd[i]), CHAR(xd[i-1]))) < 0) break;
        i++;
      }
    } break;
    default :
      STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(x)));
    }
    return ScalarLogical(i==n);
  }
  const int ncol = length(by);
  const R_xlen_t nrow = xlength(VECTOR_ELT(x,0));
  // ncol>1
  // pre-save lookups to save the deep switch later for each column type
  size_t *sizes =          (size_t *)R_alloc(ncol, sizeof(size_t));
  const char **ptrs = (const char **)R_alloc(ncol, sizeof(char *));
  int *types =                (int *)R_alloc(ncol, sizeof(int));
  for (int j=0; j<ncol; ++j) {
    int c = INTEGER(by)[j];
    if (c<1 || c>length(x)) STOP(_("issorted 'by' [%d] out of range [1,%d]"), c, length(x));
    SEXP col = VECTOR_ELT(x, c-1);
    sizes[j] = SIZEOF(col);
    switch(TYPEOF(col)) {
    case INTSXP: case LGLSXP:
      types[j] = 0;
      ptrs[j] = (const char *)INTEGER(col);
      break;
    case REALSXP:
      types[j] = inherits(col, "integer64") ? 2 : 1;
      ptrs[j] = (const char *)REAL(col);
      break;
    case STRSXP:
      types[j] = 3;
      ptrs[j] = (const char *)STRING_PTR(col);
      break;
    default:
      STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(col)));  // # nocov
    }
  }
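  // Worked example of the row loop below (illustration only): for two integer key columns with rows
  // (1,5),(1,7),(2,3), comparing row 2 to row 1 finds column 1 byte-identical (memcmp==0) so it moves on
  // to column 2, where 7>5 proves the pair ordered; comparing row 3 to row 2, column 1 has 2>1 so the
  // remaining column is ignored. Any column comparing strictly less returns FALSE immediately.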
  for (R_xlen_t i=1; i<nrow; ++i) {
    int j = -1;
    while (++j<ncol) {
      size_t size = sizes[j];
      const char *colp = ptrs[j] + size*i;
      if (memcmp(colp, colp-size, size)==0) continue;  // in all but the last column we see many repeats, so we can save the switch for those
      bool ok = false;
      switch (types[j]) {
      case 0 : {   // INTSXP, LGLSXP
        const int *p = (const int *)colp;
        ok = p[0]>p[-1];
      } break;
      case 1: {   // regular double in REALSXP
        const double *p = (const double *)colp;
        ok = dtwiddle(p[0])>dtwiddle(p[-1]);  // TODO: avoid dtwiddle by looping over any NA at the beginning, and remove NumericRounding.
      } break;
      case 2: {  // integer64 in REALSXP
        const int64_t *p = (const int64_t *)colp;
        ok = p[0]>p[-1];
      } break;
      case 3 : { // STRSXP
        const SEXP *p = (const SEXP *)colp;
        if (*p==NA_STRING) {
          ok = false; // the previous value is not NA (otherwise memcmp above would have found them equal), so NA after non-NA can't be ordered
        } else {
          ok = (NEED2UTF8(p[0]) || NEED2UTF8(p[-1]) ?  // TODO: provide user option to choose ascii-only mode
                strcmp(CHAR(ENC2UTF8(p[0])), CHAR(ENC2UTF8(p[-1]))) :
                strcmp(CHAR(p[0]), CHAR(p[-1]))) >= 0;
        }
      } break;
      default :
        STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(x)));  // # nocov
      }
      if (!ok) return ScalarLogical(FALSE);  // not sorted so return early
      break; // this item is greater than the previous in this column so ignore any remaining columns for this row
    }
  }
  return ScalarLogical(TRUE);
}

SEXP isOrderedSubset(SEXP x, SEXP nrowArg)
// specialized for use in [.data.table only
// Ignores 0s but heeds NAs and any out-of-range values (which result in NA)
{
  if (!isNull(x) && !isInteger(x)) error(_("x must be either NULL or an integer vector"));
  if (length(x)<=1) return(ScalarLogical(TRUE));  // a single NA when length(x)==1 is ordered (e.g. tests 128 & 130) otherwise anyNA => FALSE
  if (!isInteger(nrowArg) || LENGTH(nrowArg)!=1) error(_("nrow must be integer vector length 1"));
  const int nrow = INTEGER(nrowArg)[0];
  if (nrow<0) error(_("nrow==%d but must be >=0"), nrow);
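  // Worked examples (illustration only): x={0,2,2,5} with nrow=10 returns TRUE (zeros are skipped and ties
  // are allowed); x={3,NA,4} returns FALSE because NA is INT_MIN and fails the elem<0 test; x={2,12} with
  // nrow=10 returns FALSE because 12>nrow would subset to NA.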
  const int *xd = INTEGER(x), xlen=LENGTH(x);
  for (int i=0, last=INT_MIN; i<xlen; ++i) {
    int elem = xd[i];
    if (elem==0) continue;
    if (elem<last || elem<0/*includes NA==INT_MIN*/ || elem>nrow)
      return(ScalarLogical(FALSE));
    last = elem;
  }
  return(ScalarLogical(TRUE));
}

SEXP binary(SEXP x)
// base::intToBits is close, but why does that print the result as "00 00 00 00" (raw) rather than "0000" (bits)? Seems odd.
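// Example output (illustration only): for the double 1.0 (bits 0x3ff0000000000000) the string returned is
// "0 01111111111 000000000000000000000000000000000000 00000000 00000000"
// i.e. the sign, the 11-bit exponent, then the mantissa with its last 2 bytes split out (the bytes that the
// NumericRounding mentioned in the TODOs above can mask).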
{
  char buffer[69];
  int j;
  if (!isReal(x)) error(_("x must be type 'double'"));
  SEXP ans = PROTECT(allocVector(STRSXP, LENGTH(x)));
  uint64_t *xd = (uint64_t *)REAL(x);
  for (int i=0; i<LENGTH(x); i++) {
    uint64_t i64 = xd[i];
    j = 0;
    for(int bit=64; bit>=1; bit--)
    {
       buffer[j++] = '0' + (i64 >> (bit-1) & 1);
       if (bit==64 || bit==53 || bit==17 || bit==9) buffer[j++]=' ';
       //       ^sign      ^exponent  ^last 2 byte rounding
    }
    SET_STRING_ELT(ans, i, mkCharLen(buffer,68));  // 64 bits + 4 spaces
  }
  UNPROTECT(1);
  return ans;
}