1 #include "vctrs.h"
2 #include "utils.h"
3 #include "equal.h"
4 #include "translate.h"
5 
6 // -----------------------------------------------------------------------------
7 
8 static SEXP vec_locate_runs(SEXP x, bool start);
9 
10 // [[register()]]
vctrs_locate_runs(SEXP x,SEXP start)11 SEXP vctrs_locate_runs(SEXP x, SEXP start) {
12   bool c_start = (bool) r_bool_as_int(start);
13   return vec_locate_runs(x, c_start);
14 }
15 
16 static void vec_locate_run_starts(const int* p_id, r_ssize size, int* p_out);
17 static void vec_locate_run_ends(const int* p_id, r_ssize size, int* p_out);
18 
19 static
vec_locate_runs(SEXP x,bool start)20 SEXP vec_locate_runs(SEXP x, bool start) {
21   SEXP id = PROTECT(vec_identify_runs(x));
22   const int* p_id = INTEGER(id);
23 
24   r_ssize size = r_length(id);
25 
26   int n = r_int_get(r_attrib_get(id, syms_n), 0);
27 
28   SEXP out = PROTECT(r_new_integer(n));
29   int* p_out = INTEGER(out);
30 
31   if (n == 0) {
32     UNPROTECT(2);
33     return out;
34   }
35 
36   if (start) {
37     vec_locate_run_starts(p_id, size, p_out);
38   } else {
39     vec_locate_run_ends(p_id, size, p_out);
40   }
41 
42   UNPROTECT(2);
43   return out;
44 }
45 
46 static
vec_locate_run_starts(const int * p_id,r_ssize size,int * p_out)47 void vec_locate_run_starts(const int* p_id, r_ssize size, int* p_out) {
48   r_ssize loc = 0;
49 
50   // Handle first case
51   int ref = p_id[0];
52   p_out[loc] = 1;
53   ++loc;
54 
55   for (r_ssize i = 1; i < size; ++i) {
56     const int elt = p_id[i];
57 
58     if (elt == ref) {
59       continue;
60     }
61 
62     ref = elt;
63     p_out[loc] = i + 1;
64     ++loc;
65   }
66 }
67 
68 static
vec_locate_run_ends(const int * p_id,r_ssize size,int * p_out)69 void vec_locate_run_ends(const int* p_id, r_ssize size, int* p_out) {
70   r_ssize loc = 0;
71 
72   int ref = p_id[0];
73 
74   for (r_ssize i = 1; i < size; ++i) {
75     const int elt = p_id[i];
76 
77     if (elt == ref) {
78       continue;
79     }
80 
81     ref = elt;
82     p_out[loc] = i;
83     ++loc;
84   }
85 
86   // Handle last case
87   p_out[loc] = size;
88 }
89 
90 // -----------------------------------------------------------------------------
91 
92 static SEXP vec_detect_runs(SEXP x, bool start);
93 
94 // [[register()]]
vctrs_detect_runs(SEXP x,SEXP start)95 SEXP vctrs_detect_runs(SEXP x, SEXP start) {
96   bool c_start = (bool) r_bool_as_int(start);
97   return vec_detect_runs(x, c_start);
98 }
99 
100 static void vec_detect_run_starts(const int* p_id, r_ssize size, int* p_out);
101 static void vec_detect_run_ends(const int* p_id, r_ssize size, int* p_out);
102 
103 static
vec_detect_runs(SEXP x,bool start)104 SEXP vec_detect_runs(SEXP x, bool start) {
105   SEXP id = PROTECT(vec_identify_runs(x));
106   const int* p_id = INTEGER(id);
107 
108   r_ssize size = r_length(id);
109 
110   SEXP out = PROTECT(r_new_logical(size));
111   int* p_out = LOGICAL(out);
112   memset(p_out, 0, size * sizeof(int));
113 
114   if (size == 0) {
115     UNPROTECT(2);
116     return out;
117   }
118 
119   if (start) {
120     vec_detect_run_starts(p_id, size, p_out);
121   } else {
122     vec_detect_run_ends(p_id, size, p_out);
123   }
124 
125   UNPROTECT(2);
126   return out;
127 }
128 
129 static
vec_detect_run_starts(const int * p_id,r_ssize size,int * p_out)130 void vec_detect_run_starts(const int* p_id, r_ssize size, int* p_out) {
131   // Handle first case
132   int ref = p_id[0];
133   p_out[0] = 1;
134 
135   for (r_ssize i = 1; i < size; ++i) {
136     const int elt = p_id[i];
137 
138     if (elt == ref) {
139       continue;
140     }
141 
142     ref = elt;
143     p_out[i] = 1;
144   }
145 }
146 
147 static
vec_detect_run_ends(const int * p_id,r_ssize size,int * p_out)148 void vec_detect_run_ends(const int* p_id, r_ssize size, int* p_out) {
149   int ref = p_id[0];
150 
151   for (r_ssize i = 1; i < size; ++i) {
152     const int elt = p_id[i];
153 
154     if (elt == ref) {
155       continue;
156     }
157 
158     ref = elt;
159     p_out[i - 1] = 1;
160   }
161 
162   // Handle last case
163   p_out[size - 1] = 1;
164 }
165 
166 // -----------------------------------------------------------------------------
167 
168 // [[register()]]
vctrs_identify_runs(SEXP x)169 SEXP vctrs_identify_runs(SEXP x) {
170   return vec_identify_runs(x);
171 }
172 
173 static int lgl_identify_runs(SEXP x, R_len_t size, int* p_out);
174 static int int_identify_runs(SEXP x, R_len_t size, int* p_out);
175 static int dbl_identify_runs(SEXP x, R_len_t size, int* p_out);
176 static int cpl_identify_runs(SEXP x, R_len_t size, int* p_out);
177 static int chr_identify_runs(SEXP x, R_len_t size, int* p_out);
178 static int raw_identify_runs(SEXP x, R_len_t size, int* p_out);
179 static int list_identify_runs(SEXP x, R_len_t size, int* p_out);
180 static int df_identify_runs(SEXP x, R_len_t size, int* p_out);
181 
182 // [[ include("vctrs.h") ]]
vec_identify_runs(SEXP x)183 SEXP vec_identify_runs(SEXP x) {
184   SEXP proxy = PROTECT(vec_proxy_equal(x));
185   R_len_t size = vec_size(proxy);
186   proxy = PROTECT(vec_normalize_encoding(proxy));
187 
188   SEXP out = PROTECT(Rf_allocVector(INTSXP, size));
189   int* p_out = INTEGER(out);
190 
191   // Handle size 0 up front.
192   // All implementations assume at least 1 element.
193   if (size == 0) {
194     SEXP n = PROTECT(r_int(0));
195     r_attrib_poke(out, syms_n, n);
196     UNPROTECT(4);
197     return out;
198   }
199 
200   enum vctrs_type type = vec_proxy_typeof(proxy);
201 
202   int n;
203 
204   switch (type) {
205   case vctrs_type_logical: n = lgl_identify_runs(proxy, size, p_out); break;
206   case vctrs_type_integer: n = int_identify_runs(proxy, size, p_out); break;
207   case vctrs_type_double: n = dbl_identify_runs(proxy, size, p_out); break;
208   case vctrs_type_complex: n = cpl_identify_runs(proxy, size, p_out); break;
209   case vctrs_type_character: n = chr_identify_runs(proxy, size, p_out); break;
210   case vctrs_type_raw: n = raw_identify_runs(proxy, size, p_out); break;
211   case vctrs_type_list: n = list_identify_runs(proxy, size, p_out); break;
212   case vctrs_type_dataframe: n = df_identify_runs(proxy, size, p_out); break;
213   default: stop_unimplemented_vctrs_type("vec_identify_runs", type);
214   }
215 
216   SEXP r_n = PROTECT(r_int(n));
217   r_attrib_poke(out, syms_n, r_n);
218 
219   UNPROTECT(4);
220   return out;
221 }
222 
223 // -----------------------------------------------------------------------------
224 
225 #define VEC_IDENTIFY_RUNS(CTYPE, CONST_DEREF, EQUAL_NA_EQUAL) {  \
226   int id = 1;                                                    \
227   const CTYPE* p_x = CONST_DEREF(x);                             \
228                                                                  \
229   /* Handle first case */                                        \
230   CTYPE ref = p_x[0];                                            \
231   p_out[0] = id;                                                 \
232                                                                  \
233   for (R_len_t i = 1; i < size; ++i) {                           \
234     const CTYPE elt = p_x[i];                                    \
235                                                                  \
236     if (EQUAL_NA_EQUAL(elt, ref) == 0) {                         \
237       ++id;                                                      \
238       ref = elt;                                                 \
239     }                                                            \
240                                                                  \
241     p_out[i] = id;                                               \
242   }                                                              \
243                                                                  \
244   return id;                                                     \
245 }
246 
247 static
lgl_identify_runs(SEXP x,R_len_t size,int * p_out)248 int lgl_identify_runs(SEXP x, R_len_t size, int* p_out) {
249   VEC_IDENTIFY_RUNS(int, LOGICAL_RO, lgl_equal_na_equal);
250 }
251 static
int_identify_runs(SEXP x,R_len_t size,int * p_out)252 int int_identify_runs(SEXP x, R_len_t size, int* p_out) {
253   VEC_IDENTIFY_RUNS(int, INTEGER_RO, int_equal_na_equal);
254 }
255 static
dbl_identify_runs(SEXP x,R_len_t size,int * p_out)256 int dbl_identify_runs(SEXP x, R_len_t size, int* p_out) {
257   VEC_IDENTIFY_RUNS(double, REAL_RO, dbl_equal_na_equal);
258 }
259 static
cpl_identify_runs(SEXP x,R_len_t size,int * p_out)260 int cpl_identify_runs(SEXP x, R_len_t size, int* p_out) {
261   VEC_IDENTIFY_RUNS(Rcomplex, COMPLEX_RO, cpl_equal_na_equal);
262 }
263 static
chr_identify_runs(SEXP x,R_len_t size,int * p_out)264 int chr_identify_runs(SEXP x, R_len_t size, int* p_out) {
265   VEC_IDENTIFY_RUNS(SEXP, STRING_PTR_RO, chr_equal_na_equal);
266 }
267 static
raw_identify_runs(SEXP x,R_len_t size,int * p_out)268 int raw_identify_runs(SEXP x, R_len_t size, int* p_out) {
269   VEC_IDENTIFY_RUNS(Rbyte, RAW_RO, raw_equal_na_equal);
270 }
271 static
list_identify_runs(SEXP x,R_len_t size,int * p_out)272 int list_identify_runs(SEXP x, R_len_t size, int* p_out) {
273   VEC_IDENTIFY_RUNS(SEXP, VECTOR_PTR_RO, list_equal_na_equal);
274 }
275 
276 #undef VEC_IDENTIFY_RUNS
277 
278 // -----------------------------------------------------------------------------
279 
280 static inline int vec_identify_runs_col(SEXP x,
281                                         int id,
282                                         struct df_short_circuit_info* p_info,
283                                         int* p_out);
284 
285 static
df_identify_runs(SEXP x,R_len_t size,int * p_out)286 int df_identify_runs(SEXP x, R_len_t size, int* p_out) {
287   int nprot = 0;
288 
289   const SEXP* p_x = VECTOR_PTR_RO(x);
290 
291   struct df_short_circuit_info info = new_df_short_circuit_info(size, false);
292   PROTECT_DF_SHORT_CIRCUIT_INFO(&info, &nprot);
293 
294   int id = 1;
295   R_len_t n_col = Rf_length(x);
296 
297   // Define 0 column case to be a single run
298   if (n_col == 0) {
299     r_p_int_fill(p_out, id, size);
300     UNPROTECT(nprot);
301     return id;
302   }
303 
304   // Handle first case
305   p_out[0] = id;
306   info.p_row_known[0] = true;
307   --info.remaining;
308 
309   // Compute non-sequential run IDs
310   for (R_len_t i = 0; i < n_col; ++i) {
311     SEXP col = p_x[i];
312 
313     id = vec_identify_runs_col(col, id, &info, p_out);
314 
315     // All values are unique
316     if (info.remaining == 0) {
317       break;
318     }
319   }
320 
321   id = 1;
322   int previous = p_out[0];
323 
324   // Overwrite with sequential IDs
325   for (R_len_t i = 1; i < size; ++i) {
326     const int current = p_out[i];
327 
328     if (current != previous) {
329       ++id;
330       previous = current;
331     }
332 
333     p_out[i] = id;
334   }
335 
336   UNPROTECT(nprot);
337   return id;
338 }
339 
340 // -----------------------------------------------------------------------------
341 
342 static int lgl_identify_runs_col(SEXP x,
343                                  int id,
344                                  struct df_short_circuit_info* p_info,
345                                  int* p_out);
346 static int int_identify_runs_col(SEXP x,
347                                  int id,
348                                  struct df_short_circuit_info* p_info,
349                                  int* p_out);
350 static int dbl_identify_runs_col(SEXP x,
351                                  int id,
352                                  struct df_short_circuit_info* p_info,
353                                  int* p_out);
354 static int cpl_identify_runs_col(SEXP x,
355                                  int id,
356                                  struct df_short_circuit_info* p_info,
357                                  int* p_out);
358 static int chr_identify_runs_col(SEXP x,
359                                  int id,
360                                  struct df_short_circuit_info* p_info,
361                                  int* p_out);
362 static int raw_identify_runs_col(SEXP x,
363                                  int id,
364                                  struct df_short_circuit_info* p_info,
365                                  int* p_out);
366 static int list_identify_runs_col(SEXP x,
367                                   int id,
368                                   struct df_short_circuit_info* p_info,
369                                   int* p_out);
370 
371 static inline
vec_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)372 int vec_identify_runs_col(SEXP x,
373                           int id,
374                           struct df_short_circuit_info* p_info,
375                           int* p_out) {
376   switch (vec_proxy_typeof(x)) {
377   case vctrs_type_logical: return lgl_identify_runs_col(x, id, p_info, p_out);
378   case vctrs_type_integer: return int_identify_runs_col(x, id, p_info, p_out);
379   case vctrs_type_double: return dbl_identify_runs_col(x, id, p_info, p_out);
380   case vctrs_type_complex: return cpl_identify_runs_col(x, id, p_info, p_out);
381   case vctrs_type_character: return chr_identify_runs_col(x, id, p_info, p_out);
382   case vctrs_type_raw: return raw_identify_runs_col(x, id, p_info, p_out);
383   case vctrs_type_list: return list_identify_runs_col(x, id, p_info, p_out);
384   case vctrs_type_dataframe: stop_internal("vec_identify_runs_col", "Data frame columns should be flattened.");
385   case vctrs_type_scalar: Rf_errorcall(R_NilValue, "Can't compare scalars with `vec_identify_runs()`");
386   default: Rf_error("Unimplemented type in `vec_identify_runs()`");
387   }
388 }
389 
390 // -----------------------------------------------------------------------------
391 
392 #define VEC_IDENTIFY_RUNS_COL(CTYPE, CONST_DEREF, EQUAL_NA_EQUAL) { \
393   const CTYPE* p_x = CONST_DEREF(x);                                \
394                                                                     \
395   /* First row is always known, so `run_val` and `run_id` */        \
396   /* will always be overwritten immediately below. */               \
397   /* But for gcc11 we have to initialize these variables. */        \
398   CTYPE run_val = p_x[0];                                           \
399   int run_id = 0;                                                   \
400                                                                     \
401   for (R_len_t i = 0; i < p_info->size; ++i) {                      \
402     /* Start of new run */                                          \
403     if (p_info->p_row_known[i]) {                                   \
404       run_val = p_x[i];                                             \
405       run_id = p_out[i];                                            \
406       continue;                                                     \
407     }                                                               \
408                                                                     \
409     const CTYPE elt = p_x[i];                                       \
410     const int eq = EQUAL_NA_EQUAL(elt, run_val);                    \
411                                                                     \
412     /* Update ID of identical values */                             \
413     if (eq != 0) {                                                  \
414       p_out[i] = run_id;                                            \
415       continue;                                                     \
416     }                                                               \
417                                                                     \
418     ++id;                                                           \
419     run_val = elt;                                                  \
420     run_id = id;                                                    \
421     p_out[i] = id;                                                  \
422                                                                     \
423     /* This is a run change, so don't check this row again */       \
424     p_info->p_row_known[i] = true;                                  \
425     --p_info->remaining;                                            \
426                                                                     \
427     if (p_info->remaining == 0) {                                   \
428       break;                                                        \
429     }                                                               \
430   }                                                                 \
431                                                                     \
432   return id;                                                        \
433 }
434 
435 static
lgl_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)436 int lgl_identify_runs_col(SEXP x,
437                           int id,
438                           struct df_short_circuit_info* p_info,
439                           int* p_out) {
440   VEC_IDENTIFY_RUNS_COL(int, LOGICAL_RO, lgl_equal_na_equal);
441 }
442 static
int_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)443 int int_identify_runs_col(SEXP x,
444                           int id,
445                           struct df_short_circuit_info* p_info,
446                           int* p_out) {
447   VEC_IDENTIFY_RUNS_COL(int, INTEGER_RO, int_equal_na_equal);
448 }
449 static
dbl_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)450 int dbl_identify_runs_col(SEXP x,
451                           int id,
452                           struct df_short_circuit_info* p_info,
453                           int* p_out) {
454   VEC_IDENTIFY_RUNS_COL(double, REAL_RO, dbl_equal_na_equal);
455 }
456 static
cpl_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)457 int cpl_identify_runs_col(SEXP x,
458                           int id,
459                           struct df_short_circuit_info* p_info,
460                           int* p_out) {
461   VEC_IDENTIFY_RUNS_COL(Rcomplex, COMPLEX_RO, cpl_equal_na_equal);
462 }
463 static
chr_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)464 int chr_identify_runs_col(SEXP x,
465                           int id,
466                           struct df_short_circuit_info* p_info,
467                           int* p_out) {
468   VEC_IDENTIFY_RUNS_COL(SEXP, STRING_PTR_RO, chr_equal_na_equal);
469 }
470 static
raw_identify_runs_col(SEXP x,R_len_t id,struct df_short_circuit_info * p_info,int * p_out)471 int raw_identify_runs_col(SEXP x,
472                           R_len_t id,
473                           struct df_short_circuit_info* p_info,
474                           int* p_out) {
475   VEC_IDENTIFY_RUNS_COL(Rbyte, RAW_RO, raw_equal_na_equal);
476 }
477 static
list_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)478 int list_identify_runs_col(SEXP x,
479                            int id,
480                            struct df_short_circuit_info* p_info,
481                            int* p_out) {
482   VEC_IDENTIFY_RUNS_COL(SEXP, VECTOR_PTR_RO, list_equal_na_equal);
483 }
484 
485 #undef VEC_IDENTIFY_RUNS_COL
486