1 #include "vctrs.h"
2 #include "utils.h"
3 #include "equal.h"
4 #include "translate.h"
5
6 // -----------------------------------------------------------------------------
7
8 static SEXP vec_locate_runs(SEXP x, bool start);
9
10 // [[register()]]
vctrs_locate_runs(SEXP x,SEXP start)11 SEXP vctrs_locate_runs(SEXP x, SEXP start) {
12 bool c_start = (bool) r_bool_as_int(start);
13 return vec_locate_runs(x, c_start);
14 }
15
16 static void vec_locate_run_starts(const int* p_id, r_ssize size, int* p_out);
17 static void vec_locate_run_ends(const int* p_id, r_ssize size, int* p_out);
18
19 static
vec_locate_runs(SEXP x,bool start)20 SEXP vec_locate_runs(SEXP x, bool start) {
21 SEXP id = PROTECT(vec_identify_runs(x));
22 const int* p_id = INTEGER(id);
23
24 r_ssize size = r_length(id);
25
26 int n = r_int_get(r_attrib_get(id, syms_n), 0);
27
28 SEXP out = PROTECT(r_new_integer(n));
29 int* p_out = INTEGER(out);
30
31 if (n == 0) {
32 UNPROTECT(2);
33 return out;
34 }
35
36 if (start) {
37 vec_locate_run_starts(p_id, size, p_out);
38 } else {
39 vec_locate_run_ends(p_id, size, p_out);
40 }
41
42 UNPROTECT(2);
43 return out;
44 }
45
46 static
vec_locate_run_starts(const int * p_id,r_ssize size,int * p_out)47 void vec_locate_run_starts(const int* p_id, r_ssize size, int* p_out) {
48 r_ssize loc = 0;
49
50 // Handle first case
51 int ref = p_id[0];
52 p_out[loc] = 1;
53 ++loc;
54
55 for (r_ssize i = 1; i < size; ++i) {
56 const int elt = p_id[i];
57
58 if (elt == ref) {
59 continue;
60 }
61
62 ref = elt;
63 p_out[loc] = i + 1;
64 ++loc;
65 }
66 }
67
68 static
vec_locate_run_ends(const int * p_id,r_ssize size,int * p_out)69 void vec_locate_run_ends(const int* p_id, r_ssize size, int* p_out) {
70 r_ssize loc = 0;
71
72 int ref = p_id[0];
73
74 for (r_ssize i = 1; i < size; ++i) {
75 const int elt = p_id[i];
76
77 if (elt == ref) {
78 continue;
79 }
80
81 ref = elt;
82 p_out[loc] = i;
83 ++loc;
84 }
85
86 // Handle last case
87 p_out[loc] = size;
88 }
89
90 // -----------------------------------------------------------------------------
91
92 static SEXP vec_detect_runs(SEXP x, bool start);
93
94 // [[register()]]
vctrs_detect_runs(SEXP x,SEXP start)95 SEXP vctrs_detect_runs(SEXP x, SEXP start) {
96 bool c_start = (bool) r_bool_as_int(start);
97 return vec_detect_runs(x, c_start);
98 }
99
100 static void vec_detect_run_starts(const int* p_id, r_ssize size, int* p_out);
101 static void vec_detect_run_ends(const int* p_id, r_ssize size, int* p_out);
102
103 static
vec_detect_runs(SEXP x,bool start)104 SEXP vec_detect_runs(SEXP x, bool start) {
105 SEXP id = PROTECT(vec_identify_runs(x));
106 const int* p_id = INTEGER(id);
107
108 r_ssize size = r_length(id);
109
110 SEXP out = PROTECT(r_new_logical(size));
111 int* p_out = LOGICAL(out);
112 memset(p_out, 0, size * sizeof(int));
113
114 if (size == 0) {
115 UNPROTECT(2);
116 return out;
117 }
118
119 if (start) {
120 vec_detect_run_starts(p_id, size, p_out);
121 } else {
122 vec_detect_run_ends(p_id, size, p_out);
123 }
124
125 UNPROTECT(2);
126 return out;
127 }
128
129 static
vec_detect_run_starts(const int * p_id,r_ssize size,int * p_out)130 void vec_detect_run_starts(const int* p_id, r_ssize size, int* p_out) {
131 // Handle first case
132 int ref = p_id[0];
133 p_out[0] = 1;
134
135 for (r_ssize i = 1; i < size; ++i) {
136 const int elt = p_id[i];
137
138 if (elt == ref) {
139 continue;
140 }
141
142 ref = elt;
143 p_out[i] = 1;
144 }
145 }
146
147 static
vec_detect_run_ends(const int * p_id,r_ssize size,int * p_out)148 void vec_detect_run_ends(const int* p_id, r_ssize size, int* p_out) {
149 int ref = p_id[0];
150
151 for (r_ssize i = 1; i < size; ++i) {
152 const int elt = p_id[i];
153
154 if (elt == ref) {
155 continue;
156 }
157
158 ref = elt;
159 p_out[i - 1] = 1;
160 }
161
162 // Handle last case
163 p_out[size - 1] = 1;
164 }
165
166 // -----------------------------------------------------------------------------
167
168 // [[register()]]
vctrs_identify_runs(SEXP x)169 SEXP vctrs_identify_runs(SEXP x) {
170 return vec_identify_runs(x);
171 }
172
173 static int lgl_identify_runs(SEXP x, R_len_t size, int* p_out);
174 static int int_identify_runs(SEXP x, R_len_t size, int* p_out);
175 static int dbl_identify_runs(SEXP x, R_len_t size, int* p_out);
176 static int cpl_identify_runs(SEXP x, R_len_t size, int* p_out);
177 static int chr_identify_runs(SEXP x, R_len_t size, int* p_out);
178 static int raw_identify_runs(SEXP x, R_len_t size, int* p_out);
179 static int list_identify_runs(SEXP x, R_len_t size, int* p_out);
180 static int df_identify_runs(SEXP x, R_len_t size, int* p_out);
181
182 // [[ include("vctrs.h") ]]
vec_identify_runs(SEXP x)183 SEXP vec_identify_runs(SEXP x) {
184 SEXP proxy = PROTECT(vec_proxy_equal(x));
185 R_len_t size = vec_size(proxy);
186 proxy = PROTECT(vec_normalize_encoding(proxy));
187
188 SEXP out = PROTECT(Rf_allocVector(INTSXP, size));
189 int* p_out = INTEGER(out);
190
191 // Handle size 0 up front.
192 // All implementations assume at least 1 element.
193 if (size == 0) {
194 SEXP n = PROTECT(r_int(0));
195 r_attrib_poke(out, syms_n, n);
196 UNPROTECT(4);
197 return out;
198 }
199
200 enum vctrs_type type = vec_proxy_typeof(proxy);
201
202 int n;
203
204 switch (type) {
205 case vctrs_type_logical: n = lgl_identify_runs(proxy, size, p_out); break;
206 case vctrs_type_integer: n = int_identify_runs(proxy, size, p_out); break;
207 case vctrs_type_double: n = dbl_identify_runs(proxy, size, p_out); break;
208 case vctrs_type_complex: n = cpl_identify_runs(proxy, size, p_out); break;
209 case vctrs_type_character: n = chr_identify_runs(proxy, size, p_out); break;
210 case vctrs_type_raw: n = raw_identify_runs(proxy, size, p_out); break;
211 case vctrs_type_list: n = list_identify_runs(proxy, size, p_out); break;
212 case vctrs_type_dataframe: n = df_identify_runs(proxy, size, p_out); break;
213 default: stop_unimplemented_vctrs_type("vec_identify_runs", type);
214 }
215
216 SEXP r_n = PROTECT(r_int(n));
217 r_attrib_poke(out, syms_n, r_n);
218
219 UNPROTECT(4);
220 return out;
221 }
222
223 // -----------------------------------------------------------------------------
224
225 #define VEC_IDENTIFY_RUNS(CTYPE, CONST_DEREF, EQUAL_NA_EQUAL) { \
226 int id = 1; \
227 const CTYPE* p_x = CONST_DEREF(x); \
228 \
229 /* Handle first case */ \
230 CTYPE ref = p_x[0]; \
231 p_out[0] = id; \
232 \
233 for (R_len_t i = 1; i < size; ++i) { \
234 const CTYPE elt = p_x[i]; \
235 \
236 if (EQUAL_NA_EQUAL(elt, ref) == 0) { \
237 ++id; \
238 ref = elt; \
239 } \
240 \
241 p_out[i] = id; \
242 } \
243 \
244 return id; \
245 }
246
247 static
lgl_identify_runs(SEXP x,R_len_t size,int * p_out)248 int lgl_identify_runs(SEXP x, R_len_t size, int* p_out) {
249 VEC_IDENTIFY_RUNS(int, LOGICAL_RO, lgl_equal_na_equal);
250 }
251 static
int_identify_runs(SEXP x,R_len_t size,int * p_out)252 int int_identify_runs(SEXP x, R_len_t size, int* p_out) {
253 VEC_IDENTIFY_RUNS(int, INTEGER_RO, int_equal_na_equal);
254 }
255 static
dbl_identify_runs(SEXP x,R_len_t size,int * p_out)256 int dbl_identify_runs(SEXP x, R_len_t size, int* p_out) {
257 VEC_IDENTIFY_RUNS(double, REAL_RO, dbl_equal_na_equal);
258 }
259 static
cpl_identify_runs(SEXP x,R_len_t size,int * p_out)260 int cpl_identify_runs(SEXP x, R_len_t size, int* p_out) {
261 VEC_IDENTIFY_RUNS(Rcomplex, COMPLEX_RO, cpl_equal_na_equal);
262 }
263 static
chr_identify_runs(SEXP x,R_len_t size,int * p_out)264 int chr_identify_runs(SEXP x, R_len_t size, int* p_out) {
265 VEC_IDENTIFY_RUNS(SEXP, STRING_PTR_RO, chr_equal_na_equal);
266 }
267 static
raw_identify_runs(SEXP x,R_len_t size,int * p_out)268 int raw_identify_runs(SEXP x, R_len_t size, int* p_out) {
269 VEC_IDENTIFY_RUNS(Rbyte, RAW_RO, raw_equal_na_equal);
270 }
271 static
list_identify_runs(SEXP x,R_len_t size,int * p_out)272 int list_identify_runs(SEXP x, R_len_t size, int* p_out) {
273 VEC_IDENTIFY_RUNS(SEXP, VECTOR_PTR_RO, list_equal_na_equal);
274 }
275
276 #undef VEC_IDENTIFY_RUNS
277
278 // -----------------------------------------------------------------------------
279
280 static inline int vec_identify_runs_col(SEXP x,
281 int id,
282 struct df_short_circuit_info* p_info,
283 int* p_out);
284
285 static
df_identify_runs(SEXP x,R_len_t size,int * p_out)286 int df_identify_runs(SEXP x, R_len_t size, int* p_out) {
287 int nprot = 0;
288
289 const SEXP* p_x = VECTOR_PTR_RO(x);
290
291 struct df_short_circuit_info info = new_df_short_circuit_info(size, false);
292 PROTECT_DF_SHORT_CIRCUIT_INFO(&info, &nprot);
293
294 int id = 1;
295 R_len_t n_col = Rf_length(x);
296
297 // Define 0 column case to be a single run
298 if (n_col == 0) {
299 r_p_int_fill(p_out, id, size);
300 UNPROTECT(nprot);
301 return id;
302 }
303
304 // Handle first case
305 p_out[0] = id;
306 info.p_row_known[0] = true;
307 --info.remaining;
308
309 // Compute non-sequential run IDs
310 for (R_len_t i = 0; i < n_col; ++i) {
311 SEXP col = p_x[i];
312
313 id = vec_identify_runs_col(col, id, &info, p_out);
314
315 // All values are unique
316 if (info.remaining == 0) {
317 break;
318 }
319 }
320
321 id = 1;
322 int previous = p_out[0];
323
324 // Overwrite with sequential IDs
325 for (R_len_t i = 1; i < size; ++i) {
326 const int current = p_out[i];
327
328 if (current != previous) {
329 ++id;
330 previous = current;
331 }
332
333 p_out[i] = id;
334 }
335
336 UNPROTECT(nprot);
337 return id;
338 }
339
340 // -----------------------------------------------------------------------------
341
342 static int lgl_identify_runs_col(SEXP x,
343 int id,
344 struct df_short_circuit_info* p_info,
345 int* p_out);
346 static int int_identify_runs_col(SEXP x,
347 int id,
348 struct df_short_circuit_info* p_info,
349 int* p_out);
350 static int dbl_identify_runs_col(SEXP x,
351 int id,
352 struct df_short_circuit_info* p_info,
353 int* p_out);
354 static int cpl_identify_runs_col(SEXP x,
355 int id,
356 struct df_short_circuit_info* p_info,
357 int* p_out);
358 static int chr_identify_runs_col(SEXP x,
359 int id,
360 struct df_short_circuit_info* p_info,
361 int* p_out);
362 static int raw_identify_runs_col(SEXP x,
363 int id,
364 struct df_short_circuit_info* p_info,
365 int* p_out);
366 static int list_identify_runs_col(SEXP x,
367 int id,
368 struct df_short_circuit_info* p_info,
369 int* p_out);
370
371 static inline
vec_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)372 int vec_identify_runs_col(SEXP x,
373 int id,
374 struct df_short_circuit_info* p_info,
375 int* p_out) {
376 switch (vec_proxy_typeof(x)) {
377 case vctrs_type_logical: return lgl_identify_runs_col(x, id, p_info, p_out);
378 case vctrs_type_integer: return int_identify_runs_col(x, id, p_info, p_out);
379 case vctrs_type_double: return dbl_identify_runs_col(x, id, p_info, p_out);
380 case vctrs_type_complex: return cpl_identify_runs_col(x, id, p_info, p_out);
381 case vctrs_type_character: return chr_identify_runs_col(x, id, p_info, p_out);
382 case vctrs_type_raw: return raw_identify_runs_col(x, id, p_info, p_out);
383 case vctrs_type_list: return list_identify_runs_col(x, id, p_info, p_out);
384 case vctrs_type_dataframe: stop_internal("vec_identify_runs_col", "Data frame columns should be flattened.");
385 case vctrs_type_scalar: Rf_errorcall(R_NilValue, "Can't compare scalars with `vec_identify_runs()`");
386 default: Rf_error("Unimplemented type in `vec_identify_runs()`");
387 }
388 }
389
390 // -----------------------------------------------------------------------------
391
392 #define VEC_IDENTIFY_RUNS_COL(CTYPE, CONST_DEREF, EQUAL_NA_EQUAL) { \
393 const CTYPE* p_x = CONST_DEREF(x); \
394 \
395 /* First row is always known, so `run_val` and `run_id` */ \
396 /* will always be overwritten immediately below. */ \
397 /* But for gcc11 we have to initialize these variables. */ \
398 CTYPE run_val = p_x[0]; \
399 int run_id = 0; \
400 \
401 for (R_len_t i = 0; i < p_info->size; ++i) { \
402 /* Start of new run */ \
403 if (p_info->p_row_known[i]) { \
404 run_val = p_x[i]; \
405 run_id = p_out[i]; \
406 continue; \
407 } \
408 \
409 const CTYPE elt = p_x[i]; \
410 const int eq = EQUAL_NA_EQUAL(elt, run_val); \
411 \
412 /* Update ID of identical values */ \
413 if (eq != 0) { \
414 p_out[i] = run_id; \
415 continue; \
416 } \
417 \
418 ++id; \
419 run_val = elt; \
420 run_id = id; \
421 p_out[i] = id; \
422 \
423 /* This is a run change, so don't check this row again */ \
424 p_info->p_row_known[i] = true; \
425 --p_info->remaining; \
426 \
427 if (p_info->remaining == 0) { \
428 break; \
429 } \
430 } \
431 \
432 return id; \
433 }
434
435 static
lgl_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)436 int lgl_identify_runs_col(SEXP x,
437 int id,
438 struct df_short_circuit_info* p_info,
439 int* p_out) {
440 VEC_IDENTIFY_RUNS_COL(int, LOGICAL_RO, lgl_equal_na_equal);
441 }
442 static
int_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)443 int int_identify_runs_col(SEXP x,
444 int id,
445 struct df_short_circuit_info* p_info,
446 int* p_out) {
447 VEC_IDENTIFY_RUNS_COL(int, INTEGER_RO, int_equal_na_equal);
448 }
449 static
dbl_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)450 int dbl_identify_runs_col(SEXP x,
451 int id,
452 struct df_short_circuit_info* p_info,
453 int* p_out) {
454 VEC_IDENTIFY_RUNS_COL(double, REAL_RO, dbl_equal_na_equal);
455 }
456 static
cpl_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)457 int cpl_identify_runs_col(SEXP x,
458 int id,
459 struct df_short_circuit_info* p_info,
460 int* p_out) {
461 VEC_IDENTIFY_RUNS_COL(Rcomplex, COMPLEX_RO, cpl_equal_na_equal);
462 }
463 static
chr_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)464 int chr_identify_runs_col(SEXP x,
465 int id,
466 struct df_short_circuit_info* p_info,
467 int* p_out) {
468 VEC_IDENTIFY_RUNS_COL(SEXP, STRING_PTR_RO, chr_equal_na_equal);
469 }
470 static
raw_identify_runs_col(SEXP x,R_len_t id,struct df_short_circuit_info * p_info,int * p_out)471 int raw_identify_runs_col(SEXP x,
472 R_len_t id,
473 struct df_short_circuit_info* p_info,
474 int* p_out) {
475 VEC_IDENTIFY_RUNS_COL(Rbyte, RAW_RO, raw_equal_na_equal);
476 }
477 static
list_identify_runs_col(SEXP x,int id,struct df_short_circuit_info * p_info,int * p_out)478 int list_identify_runs_col(SEXP x,
479 int id,
480 struct df_short_circuit_info* p_info,
481 int* p_out) {
482 VEC_IDENTIFY_RUNS_COL(SEXP, VECTOR_PTR_RO, list_equal_na_equal);
483 }
484
485 #undef VEC_IDENTIFY_RUNS_COL
486