1 #include "vctrs.h"
2 #include "dictionary.h"
3 #include "translate.h"
4 #include "type-data-frame.h"
5 #include "utils.h"
6 
7 // [[ register() ]]
vctrs_group_id(SEXP x)8 SEXP vctrs_group_id(SEXP x) {
9   int nprot = 0;
10 
11   R_len_t n = vec_size(x);
12 
13   x = PROTECT_N(vec_proxy_equal(x), &nprot);
14   x = PROTECT_N(vec_normalize_encoding(x), &nprot);
15 
16   struct dictionary* d = new_dictionary(x);
17   PROTECT_DICT(d, &nprot);
18 
19   SEXP out = PROTECT_N(Rf_allocVector(INTSXP, n), &nprot);
20   int* p_out = INTEGER(out);
21 
22   R_len_t g = 1;
23 
24   for (int i = 0; i < n; ++i) {
25     int32_t hash = dict_hash_scalar(d, i);
26     R_len_t key = d->key[hash];
27 
28     if (key == DICT_EMPTY) {
29       dict_put(d, hash, i);
30       p_out[i] = g;
31       ++g;
32     } else {
33       p_out[i] = p_out[key];
34     }
35   }
36 
37   SEXP n_groups = PROTECT_N(Rf_ScalarInteger(d->used), &nprot);
38   Rf_setAttrib(out, syms_n, n_groups);
39 
40   UNPROTECT(nprot);
41   return out;
42 }
43 
44 // -----------------------------------------------------------------------------
45 
46 static SEXP new_group_rle(SEXP g, SEXP l, R_len_t n);
47 
48 // [[ register() ]]
vctrs_group_rle(SEXP x)49 SEXP vctrs_group_rle(SEXP x) {
50   int nprot = 0;
51 
52   R_len_t n = vec_size(x);
53 
54   x = PROTECT_N(vec_proxy_equal(x), &nprot);
55   x = PROTECT_N(vec_normalize_encoding(x), &nprot);
56 
57   struct dictionary* d = new_dictionary(x);
58   PROTECT_DICT(d, &nprot);
59 
60   const void* p_vec = d->p_poly_vec->p_vec;
61 
62   SEXP g = PROTECT_N(Rf_allocVector(INTSXP, n), &nprot);
63   int* p_g = INTEGER(g);
64 
65   SEXP l = PROTECT_N(Rf_allocVector(INTSXP, n), &nprot);
66   int* p_l = INTEGER(l);
67 
68   if (n == 0) {
69     SEXP out = PROTECT_N(new_group_rle(g, l, 0), &nprot);
70     UNPROTECT(nprot);
71     return out;
72   }
73 
74   // Integer vector that maps `hash` values to locations in `g`
75   SEXP map = PROTECT_N(Rf_allocVector(INTSXP, d->size), &nprot);
76   int* p_map = INTEGER(map);
77 
78   // Initialize first value
79   int32_t hash = dict_hash_scalar(d, 0);
80   dict_put(d, hash, 0);
81   p_map[hash] = 0;
82   *p_g = 1;
83   *p_l = 1;
84 
85   int loc = 1;
86 
87   for (int i = 1; i < n; ++i) {
88     if (d->p_equal_na_equal(p_vec, i - 1, p_vec, i)) {
89       ++(*p_l);
90       continue;
91     }
92 
93     ++p_l;
94     *p_l = 1;
95 
96     // Check if we have seen this value before
97     int32_t hash = dict_hash_scalar(d, i);
98 
99     if (d->key[hash] == DICT_EMPTY) {
100       dict_put(d, hash, i);
101       p_map[hash] = loc;
102       p_g[loc] = d->used;
103     } else {
104       p_g[loc] = p_g[p_map[hash]];
105     }
106 
107     ++loc;
108   }
109 
110   g = PROTECT_N(Rf_lengthgets(g, loc), &nprot);
111   l = PROTECT_N(Rf_lengthgets(l, loc), &nprot);
112 
113   SEXP out = new_group_rle(g, l, d->used);
114 
115   UNPROTECT(nprot);
116   return out;
117 }
118 
new_group_rle(SEXP g,SEXP l,R_len_t n)119 static SEXP new_group_rle(SEXP g, SEXP l, R_len_t n) {
120   SEXP out = PROTECT(Rf_allocVector(VECSXP, 2));
121 
122   SET_VECTOR_ELT(out, 0, g);
123   SET_VECTOR_ELT(out, 1, l);
124 
125   SEXP names = PROTECT(Rf_allocVector(STRSXP, 2));
126   SET_STRING_ELT(names, 0, strings_group);
127   SET_STRING_ELT(names, 1, strings_length);
128   Rf_setAttrib(out, R_NamesSymbol, names);
129 
130   SEXP n_groups = PROTECT(Rf_ScalarInteger(n));
131   Rf_setAttrib(out, syms_n, n_groups);
132 
133   Rf_setAttrib(out, R_ClassSymbol, classes_vctrs_group_rle);
134 
135   UNPROTECT(3);
136   return out;
137 }
138 
139 // -----------------------------------------------------------------------------
140 
141 // [[ include("vctrs.h"); register() ]]
vec_group_loc(SEXP x)142 SEXP vec_group_loc(SEXP x) {
143   int nprot = 0;
144 
145   R_len_t n = vec_size(x);
146 
147   SEXP proxy = PROTECT_N(vec_proxy_equal(x), &nprot);
148   proxy = PROTECT_N(vec_normalize_encoding(proxy), &nprot);
149 
150   struct dictionary* d = new_dictionary(proxy);
151   PROTECT_DICT(d, &nprot);
152 
153   SEXP groups = PROTECT_N(Rf_allocVector(INTSXP, n), &nprot);
154   int* p_groups = INTEGER(groups);
155 
156   R_len_t g = 0;
157 
158   // Identify groups, this is essentially `vec_group_id()`
159   for (int i = 0; i < n; ++i) {
160     const int32_t hash = dict_hash_scalar(d, i);
161     const R_len_t key = d->key[hash];
162 
163     if (key == DICT_EMPTY) {
164       dict_put(d, hash, i);
165       p_groups[i] = g;
166       ++g;
167     } else {
168       p_groups[i] = p_groups[key];
169     }
170   }
171 
172   const int n_groups = d->used;
173 
174   // Location of first occurence of each group in `x`
175   SEXP key_loc = PROTECT_N(Rf_allocVector(INTSXP, n_groups), &nprot);
176   int* p_key_loc = INTEGER(key_loc);
177   int key_loc_current = 0;
178 
179   // Count of the number of elements in each group
180   SEXP counts = PROTECT_N(Rf_allocVector(INTSXP, n_groups), &nprot);
181   int* p_counts = INTEGER(counts);
182   memset(p_counts, 0, n_groups * sizeof(int));
183 
184   for (int i = 0; i < n; ++i) {
185     const int group = p_groups[i];
186 
187     if (group == key_loc_current) {
188       p_key_loc[key_loc_current] = i + 1;
189       ++key_loc_current;
190     }
191 
192     ++p_counts[group];
193   }
194 
195   SEXP out_loc = PROTECT_N(Rf_allocVector(VECSXP, n_groups), &nprot);
196 
197   // Direct pointer to the location vectors we store in `out_loc`
198   int** p_elt_loc = (int**) R_alloc(n_groups, sizeof(int*));
199 
200   // Initialize `out_loc` to a list of integers with sizes corresponding
201   // to the number of elements in that group
202   for (int i = 0; i < n_groups; ++i) {
203     SEXP elt_loc = Rf_allocVector(INTSXP, p_counts[i]);
204     p_elt_loc[i] = INTEGER(elt_loc);
205     SET_VECTOR_ELT(out_loc, i, elt_loc);
206   }
207 
208   // The current location we are updating, each group has its own counter
209   SEXP locations = PROTECT_N(Rf_allocVector(INTSXP, n_groups), &nprot);
210   int* p_locations = INTEGER(locations);
211   memset(p_locations, 0, n_groups * sizeof(int));
212 
213   // Fill in the location values for each group
214   for (int i = 0; i < n; ++i) {
215     const int group = p_groups[i];
216     const int location = p_locations[group];
217     p_elt_loc[group][location] = i + 1;
218     ++p_locations[group];
219   }
220 
221   SEXP out_key = PROTECT_N(vec_slice(x, key_loc), &nprot);
222 
223   // Construct output data frame
224   SEXP out = PROTECT_N(Rf_allocVector(VECSXP, 2), &nprot);
225   SET_VECTOR_ELT(out, 0, out_key);
226   SET_VECTOR_ELT(out, 1, out_loc);
227 
228   SEXP names = PROTECT_N(Rf_allocVector(STRSXP, 2), &nprot);
229   SET_STRING_ELT(names, 0, strings_key);
230   SET_STRING_ELT(names, 1, strings_loc);
231 
232   Rf_setAttrib(out, R_NamesSymbol, names);
233 
234   out = new_data_frame(out, n_groups);
235 
236   UNPROTECT(nprot);
237   return out;
238 }
239