1 #include "data.table.h"
2
3 // DONE: return 'uniqlist' as a vector (same as duplist) and write a separate function to get group sizes
4 // Also improvements for numeric type with a hack of checking unsigned int (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up)
uniqlist(SEXP l,SEXP order)5 SEXP uniqlist(SEXP l, SEXP order)
6 {
7 // This works like UNIX uniq as referred to by ?base::unique; i.e., it
8 // drops immediately repeated rows but doesn't drop duplicates of any
9 // previous row. Unless, order is provided, then it also drops any previous
10 // row. l must be a list of same length vectors ans is allocated first
11 // (maximum length the number of rows) and the length returned in anslen.
12 // No NA in order which is guaranteed since internal-only. Used at R level internally (Cuniqlist) but is not and should not be exported.
13 // DONE: ans is now grown
14 if (!isNewList(l)) error(_("Internal error: uniqlist has not been passed a list of columns")); // # nocov
15 R_len_t ncol = length(l);
16 R_len_t nrow = length(VECTOR_ELT(l,0));
17 if (!isInteger(order)) error(_("Internal error: uniqlist has been passed a non-integer order")); // # nocov
18 if (LENGTH(order)<1) error(_("Internal error: uniqlist has been passed a length-0 order")); // # nocov
19 if (LENGTH(order)>1 && LENGTH(order)!=nrow) error(_("Internal error: uniqlist has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); // # nocov
20 bool via_order = INTEGER(order)[0] != -1; // has an ordering vector been passed in that we have to hop via? Don't use MISSING() here as it appears unstable on Windows
21
22 unsigned long long *ulv; // for numeric check speed-up
23 SEXP v, ans;
24 R_len_t len, thisi, previ, isize=1000;
25 int *iidx = Calloc(isize, int); // for 'idx'
26 len = 1;
27 iidx[0] = 1; // first row is always the first of the first group
28
29 if (ncol==1) {
30
31 #define COMPARE1 \
32 prev = *vd; \
33 for (int i=1; i<nrow; i++) { \
34 elem = *++vd; \
35 if (elem!=prev
36
37 #define COMPARE1_VIA_ORDER \
38 prev = vd[*o -1]; \
39 for (int i=1; i<nrow; i++) { \
40 elem = vd[*++o -1]; \
41 if (elem!=prev
42
43 #define COMPARE2 \
44 ) { \
45 iidx[len++] = i+1; \
46 if (len>=isize) { \
47 isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i))); \
48 iidx = Realloc(iidx, isize, int); \
49 } \
50 } \
51 prev = elem; \
52 }
53
54 SEXP v = VECTOR_ELT(l,0);
55 int *o = INTEGER(order); // only used when via_order is true
56 switch(TYPEOF(v)) {
57 case INTSXP : case LGLSXP : {
58 const int *vd=INTEGER(v);
59 int prev, elem;
60 if (via_order) {
61 // ad hoc by (order passed in)
62 COMPARE1_VIA_ORDER COMPARE2
63 } else {
64 // e.g. by=key(DT)[1]
65 COMPARE1 COMPARE2
66 }
67 } break;
68 case STRSXP : {
69 const SEXP *vd=STRING_PTR(v);
70 SEXP prev, elem;
71 if (via_order) {
72 COMPARE1_VIA_ORDER && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2 // but most of the time they are equal, so ENC2UTF8 doesn't need to be called
73 } else {
74 COMPARE1 && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2
75 }
76 } break;
77 case REALSXP : {
78 // grouping by integer64 makes sense (ids). grouping by float supported but a good use-case for that is harder to imagine
79 if (getNumericRounding_C()==0 /*default*/ || inherits(v, "integer64")) {
80 const uint64_t *vd=(const uint64_t *)REAL(v);
81 uint64_t prev, elem;
82 if (via_order) {
83 COMPARE1_VIA_ORDER COMPARE2
84 } else {
85 COMPARE1 COMPARE2
86 }
87 } else {
88 const double *vd=(const double *)REAL(v);
89 double prev, elem;
90 if (via_order) {
91 COMPARE1_VIA_ORDER && dtwiddle(elem)!=dtwiddle(prev) COMPARE2
92 } else {
93 COMPARE1 && dtwiddle(elem)!=dtwiddle(prev) COMPARE2
94 }
95 }
96 } break;
97 default :
98 error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov
99 }
100 } else {
101 // ncol>1
102 thisi = via_order ? INTEGER(order)[0]-1 : 0;
103 bool *i64 = (bool *)R_alloc(ncol, sizeof(bool));
104 for (int i=0; i<ncol; i++) i64[i] = INHERITS(VECTOR_ELT(l,i), char_integer64);
105 for (int i=1; i<nrow; i++) {
106 previ = thisi;
107 thisi = via_order ? INTEGER(order)[i]-1 : i;
108 int j = ncol; // the last column varies the most frequently so check that first and work backwards
109 bool b = true;
110 while (--j>=0 && b) {
111 v=VECTOR_ELT(l,j);
112 switch (TYPEOF(v)) {
113 case INTSXP : case LGLSXP : // NA_INTEGER==NA_LOGICAL checked in init.c
114 b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break;
115 case STRSXP :
116 // fix for #469, when key is set, duplicated calls uniqlist, where encoding
117 // needs to be taken care of.
118 b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings.
119 // TODO: surely faster way than this two deep STRING_ELT()
120 case REALSXP :
121 ulv = (unsigned long long *)REAL(v);
122 b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup)
123 if (!b && !i64[j]) {
124 b = dtwiddle(REAL(v)[thisi]) == dtwiddle(REAL(v)[previ]);
125 // could store LHS for use next time as RHS (to save calling dtwiddle twice). However: i) there could be multiple double columns so vector of RHS would need
126 // to be stored, ii) many short-circuit early before the if (!b) anyway (negating benefit) and iii) we may not have needed LHS this time so logic would be complex.
127 }
128 break;
129 default :
130 error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov
131 }
132 }
133 if (!b) {
134 iidx[len++] = i+1;
135 if (len >= isize) {
136 isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i)));
137 iidx = Realloc(iidx, isize, int);
138 }
139 }
140 }
141 }
142 PROTECT(ans = allocVector(INTSXP, len));
143 memcpy(INTEGER(ans), iidx, sizeof(int)*len); // sizeof is of type size_t - no integer overflow issues
144 Free(iidx);
145 UNPROTECT(1);
146 return(ans);
147 }
148
149 SEXP uniqlengths(SEXP x, SEXP n) {
150 // seems very similar to rbindlist.c:uniq_lengths. TODO: centralize into common function
151 if (TYPEOF(x) != INTSXP) error(_("Input argument 'x' to 'uniqlengths' must be an integer vector"));
152 if (TYPEOF(n) != INTSXP || length(n) != 1) error(_("Input argument 'n' to 'uniqlengths' must be an integer vector of length 1"));
153 R_len_t len = length(x);
154 SEXP ans = PROTECT(allocVector(INTSXP, len));
155 for (R_len_t i=1; i<len; i++) {
156 INTEGER(ans)[i-1] = INTEGER(x)[i] - INTEGER(x)[i-1];
157 }
158 if (len>0) INTEGER(ans)[len-1] = INTEGER(n)[0] - INTEGER(x)[len-1] + 1;
159 UNPROTECT(1);
160 return(ans);
161 }
162
163 // we could compute `uniqlist` and `uniqlengths` and then construct the result
164 // but that seems unnecessary waste of memory and roundabout..
165 // so, we'll do it directly here..
166 SEXP rleid(SEXP l, SEXP cols) {
167 R_xlen_t nrow = xlength(VECTOR_ELT(l, 0));
168 R_len_t ncol = length(l), lencols = length(cols);
169 if (!nrow || !ncol) return(allocVector(INTSXP, 0));
170 if (!isInteger(cols) || lencols==0) error(_("cols must be an integer vector with length >= 1"));
171 int *icols = INTEGER(cols);
172 for (int i=0; i<lencols; i++) {
173 int elem = icols[i];
174 if (elem<1 || elem>ncol) error(_("Item %d of cols is %d which is outside range of l [1,length(l)=%d]"), i+1, elem, ncol);
175 }
176 for (int i=1; i<ncol; i++) {
177 if (xlength(VECTOR_ELT(l,i)) != nrow) error(_("All elements to input list must be of same length. Element [%d] has length %"PRIu64" != length of first element = %"PRIu64"."), i+1, (uint64_t)xlength(VECTOR_ELT(l,i)), (uint64_t)nrow);
178 }
179 SEXP ans = PROTECT(allocVector(INTSXP, nrow));
180 int *ians = INTEGER(ans);
181 int grp = 1;
182 ians[0] = grp; // first row is always the first of first group
183 if (ncol > 1) {
184 for (R_xlen_t i=1; i<nrow; i++) {
185 bool same = true;
186 int j = lencols;
187 // the last column varies the most frequently so check that first and work backwards
188 while (--j>=0 && same) {
189 SEXP jcol = VECTOR_ELT(l, icols[j]-1);
190 switch (TYPEOF(jcol)) {
191 case INTSXP : case LGLSXP :
192 same = INTEGER(jcol)[i]==INTEGER(jcol)[i-1];
193 break;
194 case STRSXP :
195 same = STRING_ELT(jcol,i)==STRING_ELT(jcol,i-1);
196 // TODO: do we want to check encodings here now that forder seems to?
197 // Old comment : forder checks no non-ascii unknown, and either UTF-8 or Latin1 but not both.
198 // So == pointers is ok given that check
199 break;
200 case REALSXP : {
201 long long *ll = (long long *)REAL(jcol);
202 same = ll[i]==ll[i-1];
203 // 8 bytes of bits are identical. For real (no rounding currently) and integer64
204 // long long == 8 bytes checked in init.c
205 } break;
206 case CPLXSXP: {
207 Rcomplex *pz = COMPLEX(jcol);
208 same = memcmp(&pz[i], &pz[i-1], sizeof(Rcomplex))==0; // compiler optimization should replace library call with best 16-byte fixed method
209 } break;
210 default :
211 error(_("Type '%s' not supported"), type2char(TYPEOF(jcol))); // # nocov
212 }
213 }
214 ians[i] = (grp+=!same);
215 }
216 } else { // we are checking only one column so we can easily takes inline R functions out of the loops
217 SEXP jcol = VECTOR_ELT(l, icols[0]-1);
218 switch (TYPEOF(jcol)) {
219 case INTSXP : case LGLSXP : {
220 int *ijcol = INTEGER(jcol);
221 for (R_xlen_t i=1; i<nrow; i++) {
222 bool same = ijcol[i]==ijcol[i-1];
223 ians[i] = (grp+=!same);
224 }
225 } break;
226 case STRSXP : {
227 const SEXP *jd = STRING_PTR(jcol);
228 for (R_xlen_t i=1; i<nrow; i++) {
229 bool same = jd[i]==jd[i-1];
230 ians[i] = (grp+=!same);
231 }
232 } break;
233 case REALSXP : {
234 long long *lljcol = (long long *)REAL(jcol);
235 for (R_xlen_t i=1; i<nrow; i++) {
236 bool same = lljcol[i]==lljcol[i-1];
237 ians[i] = (grp+=!same);
238 }
239 } break;
240 case CPLXSXP: {
241 Rcomplex *pzjcol = COMPLEX(jcol);
242 for (R_xlen_t i=1; i<nrow; i++) {
243 bool same = memcmp(&pzjcol[i], &pzjcol[i-1], sizeof(Rcomplex))==0;
244 ians[i] = (grp += !same);
245 }
246 } break;
247 default :
248 error(_("Type '%s' not supported"), type2char(TYPEOF(jcol)));
249 }
250 }
251 UNPROTECT(1);
252 return(ans);
253 }
254
255 SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP multArg) {
256 Rboolean byorder = (length(order)>0);
257 SEXP v, ans;
258 if (!isNewList(l) || length(l) < 1) error(_("Internal error: nestedid was not passed a list length 1 or more")); // # nocov
259 R_len_t nrows = length(VECTOR_ELT(l,0)), ncols = length(cols);
260 if (nrows==0) return(allocVector(INTSXP, 0));
261 R_len_t thisi, previ, ansgrpsize=1000, nansgrp=0;
262 R_len_t *ansgrp = Calloc(ansgrpsize, R_len_t), starts, grplen; // #3401 fix. Needs to be Calloc due to Realloc below .. else segfaults.
263 R_len_t ngrps = length(grps);
264 bool *i64 = (bool *)R_alloc(ncols, sizeof(bool));
265 if (ngrps==0) error(_("Internal error: nrows[%d]>0 but ngrps==0"), nrows); // # nocov
266 R_len_t resetctr=0, rlen = length(resetvals) ? INTEGER(resetvals)[0] : 0;
267 if (!isInteger(cols) || ncols == 0) error(_("cols must be an integer vector of positive length"));
268 // mult arg
269 enum {ALL, FIRST, LAST} mult = ALL;
270 if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL;
271 else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) mult = FIRST;
272 else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) mult = LAST;
273 else error(_("Internal error: invalid value for 'mult'. please report to data.table issue tracker")); // # nocov
274 // integer64
275 for (int j=0; j<ncols; j++) {
276 i64[j] = INHERITS(VECTOR_ELT(l, INTEGER(cols)[j]-1), char_integer64);
277 }
278 ans = PROTECT(allocVector(INTSXP, nrows));
279 int *ians = INTEGER(ans), *igrps = INTEGER(grps);
280 grplen = (ngrps == 1) ? nrows : igrps[1]-igrps[0];
281 starts = igrps[0]-1 + (mult != LAST ? 0 : grplen-1);
282 ansgrp[0] = byorder ? INTEGER(order)[starts]-1 : starts;
283 for (int j=0; j<grplen; j++) {
284 ians[byorder ? INTEGER(order)[igrps[0]-1+j]-1 : igrps[0]-1+j] = 1;
285 }
286 nansgrp = 1;
287 for (int i=1; i<ngrps; i++) {
288 // "first"=add next grp to current grp iff min(next) >= min(current)
289 // "last"=add next grp to current grp iff max(next) >= max(current)
290 // in addition to this thisi >= previ should be satisfied
291 // could result in more groups.. so done only for first/last cases
292 // as it allows to extract indices directly in bmerge.
293 grplen = (i+1 < ngrps) ? igrps[i+1]-igrps[i] : nrows-igrps[i]+1;
294 starts = igrps[i]-1 + (mult != LAST ? 0 : grplen-1);
295 thisi = byorder ? INTEGER(order)[starts]-1 : starts;
296 Rboolean b = TRUE;
297 int k = 0;
298 for (; k<nansgrp; k++) {
299 int j = ncols;
300 previ = ansgrp[k];
301 // b=TRUE is ideal for mult=ALL, results in lesser groups
302 b = mult == ALL || (thisi >= previ);
303 // >= 0 is not necessary as first col will always be in
304 // increasing order. NOTE: all "==" cols are already skipped for
305 // computing nestedid during R-side call, for efficiency.
306 while(b && --j>0) {
307 v = VECTOR_ELT(l,INTEGER(cols)[j]-1);
308 switch(TYPEOF(v)) {
309 case INTSXP: case LGLSXP:
310 b = INTEGER(v)[thisi] >= INTEGER(v)[previ];
311 break;
312 case STRSXP :
313 b = ENC2UTF8(STRING_ELT(v,thisi)) == ENC2UTF8(STRING_ELT(v,previ));
314 break;
315 case REALSXP: {
316 double *xd = REAL(v);
317 b = i64[j] ? ((int64_t *)xd)[thisi] >= ((int64_t *)xd)[previ] :
318 dtwiddle(xd[thisi]) >= dtwiddle(xd[previ]);
319 } break;
320 default:
321 error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov
322 }
323 }
324 if (b) break;
325 }
326 // TODO: move this as the outer for-loop and parallelise..
327 // but preferably wait to see if problems with that big non-equi
328 // group sizes do occur that commonly before to invest time here.
329 int tmp=0;
330 if (rlen != starts) {
331 tmp = b ? k : nansgrp++;
332 } else { // we're wrapping up this group, reset nansgrp
333 tmp = 0; nansgrp = 1;
334 rlen += INTEGER(resetvals)[++resetctr];
335 }
336 if (nansgrp >= ansgrpsize) {
337 ansgrpsize = MIN(nrows, (size_t)(1.1*(double)ansgrpsize*((double)nrows/i)));
338 ansgrp = Realloc(ansgrp, ansgrpsize, int);
339 }
340 for (int j=0; j<grplen; j++) {
341 ians[byorder ? INTEGER(order)[igrps[i]-1+j]-1 : igrps[i]-1+j] = tmp+1;
342 }
343 ansgrp[tmp] = thisi;
344 }
345 Free(ansgrp);
346 UNPROTECT(1);
347 return(ans);
348 }
349
350 SEXP uniqueNlogical(SEXP x, SEXP narmArg) {
351 // single pass; short-circuit and return as soon as all 3 values are found
352 if (!isLogical(x)) error(_("x is not a logical vector"));
353 if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error(_("na.rm must be TRUE or FALSE"));
354 bool narm = LOGICAL(narmArg)[0]==1;
355 const R_xlen_t n = xlength(x);
356 if (n==0)
357 return ScalarInteger(0); // empty vector
358 Rboolean first = LOGICAL(x)[0];
359 R_xlen_t i=0;
360 const int *ix = LOGICAL(x);
361 while (++i<n && ix[i]==first);
362 if (i==n)
363 return ScalarInteger(first==NA_INTEGER && narm ? 0 : 1); // all one value
364 Rboolean second = ix[i];
365 // we've found 2 different values (first and second). Which one didn't we find? Then just look for that.
366 // NA_LOGICAL == INT_MIN checked in init.c
367 const int third = (first+second == 1) ? NA_LOGICAL : ( first+second == INT_MIN ? TRUE : FALSE );
368 if (third==NA_LOGICAL && narm)
369 return ScalarInteger(2); // TRUE and FALSE found before any NA, but na.rm=TRUE so we're done
370 while (++i<n) if (ix[i]==third)
371 return ScalarInteger(3-narm);
372 return ScalarInteger(2-(narm && third!=NA_LOGICAL));
373 }
374
375