1 #include "vctrs.h"
2 #include "dictionary.h"
3 #include "translate.h"
4 #include "type-data-frame.h"
5 #include "utils.h"
6
7 // [[ register() ]]
vctrs_group_id(SEXP x)8 SEXP vctrs_group_id(SEXP x) {
9 int nprot = 0;
10
11 R_len_t n = vec_size(x);
12
13 x = PROTECT_N(vec_proxy_equal(x), &nprot);
14 x = PROTECT_N(vec_normalize_encoding(x), &nprot);
15
16 struct dictionary* d = new_dictionary(x);
17 PROTECT_DICT(d, &nprot);
18
19 SEXP out = PROTECT_N(Rf_allocVector(INTSXP, n), &nprot);
20 int* p_out = INTEGER(out);
21
22 R_len_t g = 1;
23
24 for (int i = 0; i < n; ++i) {
25 int32_t hash = dict_hash_scalar(d, i);
26 R_len_t key = d->key[hash];
27
28 if (key == DICT_EMPTY) {
29 dict_put(d, hash, i);
30 p_out[i] = g;
31 ++g;
32 } else {
33 p_out[i] = p_out[key];
34 }
35 }
36
37 SEXP n_groups = PROTECT_N(Rf_ScalarInteger(d->used), &nprot);
38 Rf_setAttrib(out, syms_n, n_groups);
39
40 UNPROTECT(nprot);
41 return out;
42 }
43
44 // -----------------------------------------------------------------------------
45
46 static SEXP new_group_rle(SEXP g, SEXP l, R_len_t n);
47
48 // [[ register() ]]
vctrs_group_rle(SEXP x)49 SEXP vctrs_group_rle(SEXP x) {
50 int nprot = 0;
51
52 R_len_t n = vec_size(x);
53
54 x = PROTECT_N(vec_proxy_equal(x), &nprot);
55 x = PROTECT_N(vec_normalize_encoding(x), &nprot);
56
57 struct dictionary* d = new_dictionary(x);
58 PROTECT_DICT(d, &nprot);
59
60 const void* p_vec = d->p_poly_vec->p_vec;
61
62 SEXP g = PROTECT_N(Rf_allocVector(INTSXP, n), &nprot);
63 int* p_g = INTEGER(g);
64
65 SEXP l = PROTECT_N(Rf_allocVector(INTSXP, n), &nprot);
66 int* p_l = INTEGER(l);
67
68 if (n == 0) {
69 SEXP out = PROTECT_N(new_group_rle(g, l, 0), &nprot);
70 UNPROTECT(nprot);
71 return out;
72 }
73
74 // Integer vector that maps `hash` values to locations in `g`
75 SEXP map = PROTECT_N(Rf_allocVector(INTSXP, d->size), &nprot);
76 int* p_map = INTEGER(map);
77
78 // Initialize first value
79 int32_t hash = dict_hash_scalar(d, 0);
80 dict_put(d, hash, 0);
81 p_map[hash] = 0;
82 *p_g = 1;
83 *p_l = 1;
84
85 int loc = 1;
86
87 for (int i = 1; i < n; ++i) {
88 if (d->p_equal_na_equal(p_vec, i - 1, p_vec, i)) {
89 ++(*p_l);
90 continue;
91 }
92
93 ++p_l;
94 *p_l = 1;
95
96 // Check if we have seen this value before
97 int32_t hash = dict_hash_scalar(d, i);
98
99 if (d->key[hash] == DICT_EMPTY) {
100 dict_put(d, hash, i);
101 p_map[hash] = loc;
102 p_g[loc] = d->used;
103 } else {
104 p_g[loc] = p_g[p_map[hash]];
105 }
106
107 ++loc;
108 }
109
110 g = PROTECT_N(Rf_lengthgets(g, loc), &nprot);
111 l = PROTECT_N(Rf_lengthgets(l, loc), &nprot);
112
113 SEXP out = new_group_rle(g, l, d->used);
114
115 UNPROTECT(nprot);
116 return out;
117 }
118
new_group_rle(SEXP g,SEXP l,R_len_t n)119 static SEXP new_group_rle(SEXP g, SEXP l, R_len_t n) {
120 SEXP out = PROTECT(Rf_allocVector(VECSXP, 2));
121
122 SET_VECTOR_ELT(out, 0, g);
123 SET_VECTOR_ELT(out, 1, l);
124
125 SEXP names = PROTECT(Rf_allocVector(STRSXP, 2));
126 SET_STRING_ELT(names, 0, strings_group);
127 SET_STRING_ELT(names, 1, strings_length);
128 Rf_setAttrib(out, R_NamesSymbol, names);
129
130 SEXP n_groups = PROTECT(Rf_ScalarInteger(n));
131 Rf_setAttrib(out, syms_n, n_groups);
132
133 Rf_setAttrib(out, R_ClassSymbol, classes_vctrs_group_rle);
134
135 UNPROTECT(3);
136 return out;
137 }
138
139 // -----------------------------------------------------------------------------
140
141 // [[ include("vctrs.h"); register() ]]
vec_group_loc(SEXP x)142 SEXP vec_group_loc(SEXP x) {
143 int nprot = 0;
144
145 R_len_t n = vec_size(x);
146
147 SEXP proxy = PROTECT_N(vec_proxy_equal(x), &nprot);
148 proxy = PROTECT_N(vec_normalize_encoding(proxy), &nprot);
149
150 struct dictionary* d = new_dictionary(proxy);
151 PROTECT_DICT(d, &nprot);
152
153 SEXP groups = PROTECT_N(Rf_allocVector(INTSXP, n), &nprot);
154 int* p_groups = INTEGER(groups);
155
156 R_len_t g = 0;
157
158 // Identify groups, this is essentially `vec_group_id()`
159 for (int i = 0; i < n; ++i) {
160 const int32_t hash = dict_hash_scalar(d, i);
161 const R_len_t key = d->key[hash];
162
163 if (key == DICT_EMPTY) {
164 dict_put(d, hash, i);
165 p_groups[i] = g;
166 ++g;
167 } else {
168 p_groups[i] = p_groups[key];
169 }
170 }
171
172 const int n_groups = d->used;
173
174 // Location of first occurence of each group in `x`
175 SEXP key_loc = PROTECT_N(Rf_allocVector(INTSXP, n_groups), &nprot);
176 int* p_key_loc = INTEGER(key_loc);
177 int key_loc_current = 0;
178
179 // Count of the number of elements in each group
180 SEXP counts = PROTECT_N(Rf_allocVector(INTSXP, n_groups), &nprot);
181 int* p_counts = INTEGER(counts);
182 memset(p_counts, 0, n_groups * sizeof(int));
183
184 for (int i = 0; i < n; ++i) {
185 const int group = p_groups[i];
186
187 if (group == key_loc_current) {
188 p_key_loc[key_loc_current] = i + 1;
189 ++key_loc_current;
190 }
191
192 ++p_counts[group];
193 }
194
195 SEXP out_loc = PROTECT_N(Rf_allocVector(VECSXP, n_groups), &nprot);
196
197 // Direct pointer to the location vectors we store in `out_loc`
198 int** p_elt_loc = (int**) R_alloc(n_groups, sizeof(int*));
199
200 // Initialize `out_loc` to a list of integers with sizes corresponding
201 // to the number of elements in that group
202 for (int i = 0; i < n_groups; ++i) {
203 SEXP elt_loc = Rf_allocVector(INTSXP, p_counts[i]);
204 p_elt_loc[i] = INTEGER(elt_loc);
205 SET_VECTOR_ELT(out_loc, i, elt_loc);
206 }
207
208 // The current location we are updating, each group has its own counter
209 SEXP locations = PROTECT_N(Rf_allocVector(INTSXP, n_groups), &nprot);
210 int* p_locations = INTEGER(locations);
211 memset(p_locations, 0, n_groups * sizeof(int));
212
213 // Fill in the location values for each group
214 for (int i = 0; i < n; ++i) {
215 const int group = p_groups[i];
216 const int location = p_locations[group];
217 p_elt_loc[group][location] = i + 1;
218 ++p_locations[group];
219 }
220
221 SEXP out_key = PROTECT_N(vec_slice(x, key_loc), &nprot);
222
223 // Construct output data frame
224 SEXP out = PROTECT_N(Rf_allocVector(VECSXP, 2), &nprot);
225 SET_VECTOR_ELT(out, 0, out_key);
226 SET_VECTOR_ELT(out, 1, out_loc);
227
228 SEXP names = PROTECT_N(Rf_allocVector(STRSXP, 2), &nprot);
229 SET_STRING_ELT(names, 0, strings_key);
230 SET_STRING_ELT(names, 1, strings_loc);
231
232 Rf_setAttrib(out, R_NamesSymbol, names);
233
234 out = new_data_frame(out, n_groups);
235
236 UNPROTECT(nprot);
237 return out;
238 }
239