1 #include "data.table.h"
2 
3 // DONE: return 'uniqlist' as a vector (same as duplist) and write a separate function to get group sizes
4 // Also improvements for numeric type with a hack of checking unsigned int (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up)
uniqlist(SEXP l,SEXP order)5 SEXP uniqlist(SEXP l, SEXP order)
6 {
7   // This works like UNIX uniq as referred to by ?base::unique; i.e., it
8   // drops immediately repeated rows but doesn't drop duplicates of any
9   // previous row. Unless, order is provided, then it also drops any previous
10   // row. l must be a list of same length vectors ans is allocated first
11   // (maximum length the number of rows) and the length returned in anslen.
12   // No NA in order which is guaranteed since internal-only. Used at R level internally (Cuniqlist) but is not and should not be exported.
13   // DONE: ans is now grown
14   if (!isNewList(l)) error(_("Internal error: uniqlist has not been passed a list of columns")); // # nocov
15   R_len_t ncol = length(l);
16   R_len_t nrow = length(VECTOR_ELT(l,0));
17   if (!isInteger(order)) error(_("Internal error: uniqlist has been passed a non-integer order")); // # nocov
18   if (LENGTH(order)<1) error(_("Internal error: uniqlist has been passed a length-0 order")); // # nocov
19   if (LENGTH(order)>1 && LENGTH(order)!=nrow) error(_("Internal error: uniqlist has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); // # nocov
20   bool via_order = INTEGER(order)[0] != -1;  // has an ordering vector been passed in that we have to hop via? Don't use MISSING() here as it appears unstable on Windows
21 
22   unsigned long long *ulv; // for numeric check speed-up
23   SEXP v, ans;
24   R_len_t len, thisi, previ, isize=1000;
25   int *iidx = Calloc(isize, int); // for 'idx'
26   len = 1;
27   iidx[0] = 1; // first row is always the first of the first group
28 
29   if (ncol==1) {
30 
31 #define COMPARE1                                                                 \
32       prev = *vd;                                                                \
33       for (int i=1; i<nrow; i++) {                                               \
34         elem = *++vd;                                                            \
35         if (elem!=prev
36 
37 #define COMPARE1_VIA_ORDER                                                       \
38       prev = vd[*o -1];                                                          \
39       for (int i=1; i<nrow; i++) {                                               \
40         elem = vd[*++o -1];                                                      \
41         if (elem!=prev
42 
43 #define COMPARE2                                                                 \
44                         ) {                                                      \
45           iidx[len++] = i+1;                                                     \
46           if (len>=isize) {                                                      \
47             isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i)));     \
48             iidx = Realloc(iidx, isize, int);                                    \
49           }                                                                      \
50         }                                                                        \
51         prev = elem;                                                             \
52       }
53 
54     SEXP v = VECTOR_ELT(l,0);
55     int *o = INTEGER(order);  // only used when via_order is true
56     switch(TYPEOF(v)) {
57     case INTSXP : case LGLSXP : {
58       const int *vd=INTEGER(v);
59       int prev, elem;
60       if (via_order) {
61         // ad hoc by (order passed in)
62         COMPARE1_VIA_ORDER COMPARE2
63       } else {
64         // e.g. by=key(DT)[1]
65         COMPARE1           COMPARE2
66       }
67     } break;
68     case STRSXP : {
69       const SEXP *vd=STRING_PTR(v);
70       SEXP prev, elem;
71       if (via_order) {
72         COMPARE1_VIA_ORDER && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2   // but most of the time they are equal, so ENC2UTF8 doesn't need to be called
73       } else {
74         COMPARE1           && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2
75       }
76     } break;
77     case REALSXP : {
78       // grouping by integer64 makes sense (ids). grouping by float supported but a good use-case for that is harder to imagine
79       if (getNumericRounding_C()==0 /*default*/ || inherits(v, "integer64")) {
80         const uint64_t *vd=(const uint64_t *)REAL(v);
81         uint64_t prev, elem;
82         if (via_order) {
83           COMPARE1_VIA_ORDER COMPARE2
84         } else {
85           COMPARE1           COMPARE2
86         }
87       } else {
88         const double *vd=(const double *)REAL(v);
89         double prev, elem;
90         if (via_order) {
91           COMPARE1_VIA_ORDER && dtwiddle(elem)!=dtwiddle(prev) COMPARE2
92         } else {
93           COMPARE1           && dtwiddle(elem)!=dtwiddle(prev) COMPARE2
94         }
95       }
96     } break;
97     default :
98       error(_("Type '%s' not supported"), type2char(TYPEOF(v)));  // # nocov
99     }
100   } else {
101     // ncol>1
102     thisi = via_order ? INTEGER(order)[0]-1 : 0;
103     bool *i64 = (bool *)R_alloc(ncol, sizeof(bool));
104     for (int i=0; i<ncol; i++) i64[i] = INHERITS(VECTOR_ELT(l,i), char_integer64);
105     for (int i=1; i<nrow; i++) {
106       previ = thisi;
107       thisi = via_order ? INTEGER(order)[i]-1 : i;
108       int j = ncol;  // the last column varies the most frequently so check that first and work backwards
109       bool b = true;
110       while (--j>=0 && b) {
111         v=VECTOR_ELT(l,j);
112         switch (TYPEOF(v)) {
113         case INTSXP : case LGLSXP :  // NA_INTEGER==NA_LOGICAL checked in init.c
114           b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break;
115         case STRSXP :
116           // fix for #469, when key is set, duplicated calls uniqlist, where encoding
117           // needs to be taken care of.
118           b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break;  // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings.
119           // TODO: surely faster way than this two deep STRING_ELT()
120         case REALSXP :
121           ulv = (unsigned long long *)REAL(v);
122           b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup)
123           if (!b && !i64[j]) {
124             b = dtwiddle(REAL(v)[thisi]) == dtwiddle(REAL(v)[previ]);
125             // could store LHS for use next time as RHS (to save calling dtwiddle twice). However: i) there could be multiple double columns so vector of RHS would need
126             // to be stored, ii) many short-circuit early before the if (!b) anyway (negating benefit) and iii) we may not have needed LHS this time so logic would be complex.
127           }
128           break;
129         default :
130           error(_("Type '%s' not supported"), type2char(TYPEOF(v)));  // # nocov
131         }
132       }
133       if (!b) {
134         iidx[len++] = i+1;
135         if (len >= isize) {
136           isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i)));
137           iidx = Realloc(iidx, isize, int);
138         }
139       }
140     }
141   }
142   PROTECT(ans = allocVector(INTSXP, len));
143   memcpy(INTEGER(ans), iidx, sizeof(int)*len); // sizeof is of type size_t - no integer overflow issues
144   Free(iidx);
145   UNPROTECT(1);
146   return(ans);
147 }
148 
149 SEXP uniqlengths(SEXP x, SEXP n) {
150   // seems very similar to rbindlist.c:uniq_lengths. TODO: centralize into common function
151   if (TYPEOF(x) != INTSXP) error(_("Input argument 'x' to 'uniqlengths' must be an integer vector"));
152   if (TYPEOF(n) != INTSXP || length(n) != 1) error(_("Input argument 'n' to 'uniqlengths' must be an integer vector of length 1"));
153   R_len_t len = length(x);
154   SEXP ans = PROTECT(allocVector(INTSXP, len));
155   for (R_len_t i=1; i<len; i++) {
156     INTEGER(ans)[i-1] = INTEGER(x)[i] - INTEGER(x)[i-1];
157   }
158   if (len>0) INTEGER(ans)[len-1] = INTEGER(n)[0] - INTEGER(x)[len-1] + 1;
159   UNPROTECT(1);
160   return(ans);
161 }
162 
163 // we could compute `uniqlist` and `uniqlengths` and then construct the result
164 // but that seems unnecessary waste of memory and roundabout..
165 // so, we'll do it directly here..
166 SEXP rleid(SEXP l, SEXP cols) {
167   R_xlen_t nrow = xlength(VECTOR_ELT(l, 0));
168   R_len_t ncol = length(l), lencols = length(cols);
169   if (!nrow || !ncol) return(allocVector(INTSXP, 0));
170   if (!isInteger(cols) || lencols==0) error(_("cols must be an integer vector with length >= 1"));
171   int *icols = INTEGER(cols);
172   for (int i=0; i<lencols; i++) {
173     int elem = icols[i];
174     if (elem<1 || elem>ncol) error(_("Item %d of cols is %d which is outside range of l [1,length(l)=%d]"), i+1, elem, ncol);
175   }
176   for (int i=1; i<ncol; i++) {
177     if (xlength(VECTOR_ELT(l,i)) != nrow) error(_("All elements to input list must be of same length. Element [%d] has length %"PRIu64" != length of first element = %"PRIu64"."), i+1, (uint64_t)xlength(VECTOR_ELT(l,i)), (uint64_t)nrow);
178   }
179   SEXP ans = PROTECT(allocVector(INTSXP, nrow));
180   int *ians = INTEGER(ans);
181   int grp = 1;
182   ians[0] = grp; // first row is always the first of first group
183   if (ncol > 1) {
184     for (R_xlen_t i=1; i<nrow; i++) {
185       bool same = true;
186       int j = lencols;
187       // the last column varies the most frequently so check that first and work backwards
188       while (--j>=0 && same) {
189         SEXP jcol = VECTOR_ELT(l, icols[j]-1);
190         switch (TYPEOF(jcol)) {
191         case INTSXP : case LGLSXP :
192           same = INTEGER(jcol)[i]==INTEGER(jcol)[i-1];
193           break;
194         case STRSXP :
195           same = STRING_ELT(jcol,i)==STRING_ELT(jcol,i-1);
196           // TODO: do we want to check encodings here now that forder seems to?
197           // Old comment : forder checks no non-ascii unknown, and either UTF-8 or Latin1 but not both.
198           //               So == pointers is ok given that check
199           break;
200         case REALSXP : {
201           long long *ll = (long long *)REAL(jcol);
202           same = ll[i]==ll[i-1];
203           // 8 bytes of bits are identical. For real (no rounding currently) and integer64
204           // long long == 8 bytes checked in init.c
205         } break;
206         case CPLXSXP: {
207           Rcomplex *pz = COMPLEX(jcol);
208           same = memcmp(&pz[i], &pz[i-1], sizeof(Rcomplex))==0; // compiler optimization should replace library call with best 16-byte fixed method
209         } break;
210         default :
211           error(_("Type '%s' not supported"), type2char(TYPEOF(jcol)));  // # nocov
212         }
213       }
214       ians[i] = (grp+=!same);
215     }
216   } else { // we are checking only one column so we can easily takes inline R functions out of the loops
217     SEXP jcol = VECTOR_ELT(l, icols[0]-1);
218     switch (TYPEOF(jcol)) {
219     case INTSXP : case LGLSXP : {
220       int *ijcol = INTEGER(jcol);
221       for (R_xlen_t i=1; i<nrow; i++) {
222         bool same = ijcol[i]==ijcol[i-1];
223         ians[i] = (grp+=!same);
224       }
225     } break;
226     case STRSXP : {
227       const SEXP *jd = STRING_PTR(jcol);
228       for (R_xlen_t i=1; i<nrow; i++) {
229         bool same = jd[i]==jd[i-1];
230         ians[i] = (grp+=!same);
231       }
232     } break;
233     case REALSXP : {
234       long long *lljcol = (long long *)REAL(jcol);
235       for (R_xlen_t i=1; i<nrow; i++) {
236         bool same = lljcol[i]==lljcol[i-1];
237         ians[i] = (grp+=!same);
238       }
239     } break;
240     case CPLXSXP: {
241       Rcomplex *pzjcol = COMPLEX(jcol);
242       for (R_xlen_t i=1; i<nrow; i++) {
243         bool same = memcmp(&pzjcol[i], &pzjcol[i-1], sizeof(Rcomplex))==0;
244         ians[i] = (grp += !same);
245       }
246     } break;
247     default :
248       error(_("Type '%s' not supported"), type2char(TYPEOF(jcol)));
249     }
250   }
251   UNPROTECT(1);
252   return(ans);
253 }
254 
255 SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP multArg) {
256   Rboolean byorder = (length(order)>0);
257   SEXP v, ans;
258   if (!isNewList(l) || length(l) < 1) error(_("Internal error: nestedid was not passed a list length 1 or more")); // # nocov
259   R_len_t nrows = length(VECTOR_ELT(l,0)), ncols = length(cols);
260   if (nrows==0) return(allocVector(INTSXP, 0));
261   R_len_t thisi, previ, ansgrpsize=1000, nansgrp=0;
262   R_len_t *ansgrp = Calloc(ansgrpsize, R_len_t), starts, grplen; // #3401 fix. Needs to be Calloc due to Realloc below .. else segfaults.
263   R_len_t ngrps = length(grps);
264   bool *i64 = (bool *)R_alloc(ncols, sizeof(bool));
265   if (ngrps==0) error(_("Internal error: nrows[%d]>0 but ngrps==0"), nrows); // # nocov
266   R_len_t resetctr=0, rlen = length(resetvals) ? INTEGER(resetvals)[0] : 0;
267   if (!isInteger(cols) || ncols == 0) error(_("cols must be an integer vector of positive length"));
268   // mult arg
269   enum {ALL, FIRST, LAST} mult = ALL;
270   if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL;
271   else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) mult = FIRST;
272   else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) mult = LAST;
273   else error(_("Internal error: invalid value for 'mult'. please report to data.table issue tracker")); // # nocov
274   // integer64
275   for (int j=0; j<ncols; j++) {
276     i64[j] = INHERITS(VECTOR_ELT(l, INTEGER(cols)[j]-1), char_integer64);
277   }
278   ans  = PROTECT(allocVector(INTSXP, nrows));
279   int *ians = INTEGER(ans), *igrps = INTEGER(grps);
280   grplen = (ngrps == 1) ? nrows : igrps[1]-igrps[0];
281   starts = igrps[0]-1 + (mult != LAST ? 0 : grplen-1);
282   ansgrp[0] = byorder ? INTEGER(order)[starts]-1 : starts;
283   for (int j=0; j<grplen; j++) {
284     ians[byorder ? INTEGER(order)[igrps[0]-1+j]-1 : igrps[0]-1+j] = 1;
285   }
286   nansgrp = 1;
287   for (int i=1; i<ngrps; i++) {
288     // "first"=add next grp to current grp iff min(next) >= min(current)
289     // "last"=add next grp to current grp iff max(next) >= max(current)
290     // in addition to this thisi >= previ should be satisfied
291     // could result in more groups.. so done only for first/last cases
292     // as it allows to extract indices directly in bmerge.
293     grplen = (i+1 < ngrps) ? igrps[i+1]-igrps[i] : nrows-igrps[i]+1;
294     starts = igrps[i]-1 + (mult != LAST ? 0 : grplen-1);
295     thisi = byorder ? INTEGER(order)[starts]-1 : starts;
296     Rboolean b = TRUE;
297     int k = 0;
298     for (; k<nansgrp; k++) {
299       int j = ncols;
300       previ = ansgrp[k];
301       // b=TRUE is ideal for mult=ALL, results in lesser groups
302       b = mult == ALL || (thisi >= previ);
303       // >= 0 is not necessary as first col will always be in
304       // increasing order. NOTE: all "==" cols are already skipped for
305       // computing nestedid during R-side call, for efficiency.
306       while(b && --j>0) {
307         v = VECTOR_ELT(l,INTEGER(cols)[j]-1);
308         switch(TYPEOF(v)) {
309         case INTSXP: case LGLSXP:
310           b = INTEGER(v)[thisi] >= INTEGER(v)[previ];
311           break;
312         case STRSXP :
313           b = ENC2UTF8(STRING_ELT(v,thisi)) == ENC2UTF8(STRING_ELT(v,previ));
314           break;
315         case REALSXP: {
316           double *xd = REAL(v);
317           b = i64[j] ? ((int64_t *)xd)[thisi] >= ((int64_t *)xd)[previ] :
318                        dtwiddle(xd[thisi]) >= dtwiddle(xd[previ]);
319         } break;
320         default:
321           error(_("Type '%s' not supported"), type2char(TYPEOF(v)));  // # nocov
322         }
323       }
324       if (b) break;
325     }
326     // TODO: move this as the outer for-loop and parallelise..
327     // but preferably wait to see if problems with that big non-equi
328     // group sizes do occur that commonly before to invest time here.
329     int tmp=0;
330     if (rlen != starts) {
331       tmp = b ? k : nansgrp++;
332     } else { // we're wrapping up this group, reset nansgrp
333       tmp = 0; nansgrp = 1;
334       rlen += INTEGER(resetvals)[++resetctr];
335     }
336     if (nansgrp >= ansgrpsize) {
337       ansgrpsize = MIN(nrows, (size_t)(1.1*(double)ansgrpsize*((double)nrows/i)));
338       ansgrp = Realloc(ansgrp, ansgrpsize, int);
339     }
340     for (int j=0; j<grplen; j++) {
341       ians[byorder ? INTEGER(order)[igrps[i]-1+j]-1 : igrps[i]-1+j] = tmp+1;
342     }
343     ansgrp[tmp] = thisi;
344   }
345   Free(ansgrp);
346   UNPROTECT(1);
347   return(ans);
348 }
349 
350 SEXP uniqueNlogical(SEXP x, SEXP narmArg) {
351   // single pass; short-circuit and return as soon as all 3 values are found
352   if (!isLogical(x)) error(_("x is not a logical vector"));
353   if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error(_("na.rm must be TRUE or FALSE"));
354   bool narm = LOGICAL(narmArg)[0]==1;
355   const R_xlen_t n = xlength(x);
356   if (n==0)
357     return ScalarInteger(0);  // empty vector
358   Rboolean first = LOGICAL(x)[0];
359   R_xlen_t i=0;
360   const int *ix = LOGICAL(x);
361   while (++i<n && ix[i]==first);
362   if (i==n)
363     return ScalarInteger(first==NA_INTEGER && narm ? 0 : 1); // all one value
364   Rboolean second = ix[i];
365   // we've found 2 different values (first and second). Which one didn't we find? Then just look for that.
366   // NA_LOGICAL == INT_MIN checked in init.c
367   const int third = (first+second == 1) ? NA_LOGICAL : ( first+second == INT_MIN ? TRUE : FALSE );
368   if (third==NA_LOGICAL && narm)
369     return ScalarInteger(2);  // TRUE and FALSE found before any NA, but na.rm=TRUE so we're done
370   while (++i<n) if (ix[i]==third)
371     return ScalarInteger(3-narm);
372   return ScalarInteger(2-(narm && third!=NA_LOGICAL));
373 }
374 
375