1 // ================================================================
2 // Array-only (open addressing) multi-level hash map, with linear probing for collisions.
3 // All keys, and terminal-level values, are mlrvals. All data passed into the put method
4 // are copied; no pointers in this data structure reference anything external.
5 //
6 // Notes:
7 // * null key is not supported.
8 // * null value is not supported.
9 //
10 // See also:
11 // * http://en.wikipedia.org/wiki/Hash_table
12 // * http://docs.oracle.com/javase/6/docs/api/java/util/Map.html
13 // ================================================================
14 
15 #ifndef MLHMMV_H
16 #define MLHMMV_H
17 
18 #include "lib/mlrval.h"
19 #include "containers/sllmv.h"
20 #include "containers/sllv.h"
21 #include "containers/lrec.h"
22 
23 #define MLHMMV_ERROR_NONE                0x0000
24 #define MLHMMV_ERROR_KEYLIST_TOO_DEEP    0xdeef
25 #define MLHMMV_ERROR_KEYLIST_TOO_SHALLOW 0x58a1
26 
27 // This is made visible here in the API so the unit-tester can be sure to exercise the resize logic.
28 #define MLHMMV_INITIAL_ARRAY_LENGTH 16
29 
30 // ----------------------------------------------------------------
31 void mlhmmv_print_terminal(mv_t* pmv, int quote_keys_always, int quote_values_always,
32 	FILE* ostream);
33 
34 // ----------------------------------------------------------------
35 struct _mlhmmv_level_t; // forward reference
36 
37 // The 'x' is for extended: this can hold a scalar or a map.
38 typedef struct _mlhmmv_xvalue_t {
39 	struct _mlhmmv_level_t* pnext_level;
40 	mv_t terminal_mlrval;
41 	char is_terminal;
42 } mlhmmv_xvalue_t;
43 
44 void            mlhmmv_xvalue_reset(mlhmmv_xvalue_t* pxvalue);
45 mlhmmv_xvalue_t mlhmmv_xvalue_alloc_empty_map();
46 mlhmmv_xvalue_t mlhmmv_xvalue_copy(mlhmmv_xvalue_t* pxvalue);
47 void            mlhmmv_xvalue_free(mlhmmv_xvalue_t* pxvalue);
48 
49 char* mlhmmv_xvalue_describe_type_simple(mlhmmv_xvalue_t* pxvalue);
50 
mlhmmv_xvalue_is_absent_and_nonterminal(mlhmmv_xvalue_t * pxvalue)51 static inline int mlhmmv_xvalue_is_absent_and_nonterminal(mlhmmv_xvalue_t* pxvalue) {
52 	return (pxvalue->is_terminal && mv_is_absent(&pxvalue->terminal_mlrval));
53 }
54 
mlhmmv_xvalue_is_present_and_nonterminal(mlhmmv_xvalue_t * pxvalue)55 static inline int mlhmmv_xvalue_is_present_and_nonterminal(mlhmmv_xvalue_t* pxvalue) {
56 	return (pxvalue->is_terminal && mv_is_present(&pxvalue->terminal_mlrval));
57 }
58 
59 // Used by for-loops over map-valued local variables
60 sllv_t* mlhmmv_xvalue_copy_keys_indexed   (mlhmmv_xvalue_t* pxvalue, sllmv_t* pmvkeys);
61 sllv_t* mlhmmv_xvalue_copy_keys_nonindexed(mlhmmv_xvalue_t* pxvalue);
62 
63 void mlhmmv_xvalues_to_lrecs_lashed(
64 	mlhmmv_xvalue_t** ptop_values,
65 	int               num_submaps,
66 	mv_t*             pbasenames,
67 	sllmv_t*          pnames,
68 	sllv_t*           poutrecs,
69 	int               do_full_prefixing,
70 	char*             flatten_separator);
71 
72 // ----------------------------------------------------------------
73 typedef struct _mlhmmv_level_entry_t {
74 	int     ideal_index;
75 	mv_t    level_key;
76 	mlhmmv_xvalue_t level_xvalue; // terminal mlrval, or another hashmap
77 	struct _mlhmmv_level_entry_t *pprev;
78 	struct _mlhmmv_level_entry_t *pnext;
79 } mlhmmv_level_entry_t;
80 
81 typedef unsigned char mlhmmv_level_entry_state_t;
82 
83 // Store a mlrval into the mlhmmv_xvalue without copying, implicitly transferring
84 // ownership of the mlrval's free_flags. This means the mlrval will be freed
85 // when the mlhmmv_xvalue is freed, so the caller should make a copy first if
86 // necessary.
87 //
88 // This is a hot path for non-map local-variable assignments.
mlhmmv_xvalue_wrap_terminal(mv_t val)89 static inline mlhmmv_xvalue_t mlhmmv_xvalue_wrap_terminal(mv_t val) {
90 	return (mlhmmv_xvalue_t) {.is_terminal = TRUE, .terminal_mlrval = val, .pnext_level = NULL};
91 }
92 
93 // ----------------------------------------------------------------
94 typedef struct _mlhmmv_level_t {
95 	int                         num_occupied;
96 	int                         num_freed;
97 	int                         array_length;
98 	mlhmmv_level_entry_t*       entries;
99 	mlhmmv_level_entry_state_t* states;
100 	mlhmmv_level_entry_t*       phead;
101 	mlhmmv_level_entry_t*       ptail;
102 } mlhmmv_level_t;
103 
104 mlhmmv_level_t* mlhmmv_level_alloc();
105 void mlhmmv_level_free(mlhmmv_level_t* plevel);
106 
107 void mlhmmv_level_clear(mlhmmv_level_t* plevel);
108 void mlhmmv_level_remove(mlhmmv_level_t* plevel, sllmve_t* prestkeys);
109 int mlhmmv_level_has_key(mlhmmv_level_t* plevel, mv_t* plevel_key);
110 
111 mv_t* mlhmmv_level_look_up_and_ref_terminal(
112 	mlhmmv_level_t* plevel,
113 	sllmv_t*        pmvkeys,
114 	int*            perror);
115 
116 mlhmmv_xvalue_t* mlhmmv_level_look_up_and_ref_xvalue(
117 	mlhmmv_level_t* plevel,
118 	sllmv_t*        pmvkeys,
119 	int*            perror);
120 
121 mlhmmv_level_t* mlhmmv_level_put_empty_map(
122 	mlhmmv_level_t* plevel,
123 	mv_t*           pkey);
124 
125 void mlhmmv_level_put_xvalue(
126 	mlhmmv_level_t*  plevel,
127 	sllmve_t*        prest_keys,
128 	mlhmmv_xvalue_t* pvalue);
129 
130 void mlhmmv_level_put_xvalue_singly_keyed(
131 	mlhmmv_level_t*  plevel,
132 	mv_t*            pkey,
133 	mlhmmv_xvalue_t* pvalue);
134 
135 void mlhmmv_level_put_terminal(
136 	mlhmmv_level_t* plevel,
137 	sllmve_t*       prest_keys,
138 	mv_t*           pterminal_value);
139 
140 void mlhmmv_level_put_terminal_singly_keyed(
141 	mlhmmv_level_t* plevel,
142 	mv_t*           pkey,
143 	mv_t*           pterminal_value);
144 
145 void mlhmmv_level_to_lrecs(
146 	mlhmmv_level_t* plevel,
147 	sllmv_t*        pkeys,
148 	sllmv_t*        pnames,
149 	sllv_t*         poutrecs,
150 	int             do_full_prefixing,
151 	char*           flatten_separator);
152 
153 void mlhmmv_level_print_stacked(
154 	mlhmmv_level_t* plevel,
155 	int             depth,
156 	int             do_final_comma,
157 	int             quote_keys_always,
158 	int             quote_values_always,
159 	char*           line_indent,
160 	char*           line_term,
161 	FILE*           ostream);
162 
163 // ----------------------------------------------------------------
164 typedef struct _mlhmmv_root_t {
165 	mlhmmv_xvalue_t root_xvalue;
166 } mlhmmv_root_t;
167 
168 mlhmmv_root_t* mlhmmv_root_alloc();
169 
170 void mlhmmv_root_free(mlhmmv_root_t* pmap);
171 
172 void mlhmmv_root_clear(mlhmmv_root_t* pmap);
173 
174 // If the return value is non-null, error will be MLHMMV_ERROR_NONE.  If the
175 // return value is null, the error will be MLHMMV_ERROR_KEYLIST_TOO_DEEP or
176 // MLHMMV_ERROR_KEYLIST_TOO_SHALLOW, or MLHMMV_ERROR_NONE if the keylist matches
177 // map depth but the entry is not found.
178 //
179 // Note: this returns a pointer to the map's data, not to a copy.
180 // The caller shouldn't free it, or modify it.
181 mv_t* mlhmmv_root_look_up_and_ref_terminal(mlhmmv_root_t* pmap, sllmv_t* pmvkeys, int* perror);
182 
183 // These are an optimization for assignment from full srec, e.g. '@records[$key1][$key2] = $*'.
184 // Using mlhmmv_root_look_up_or_create_then_ref_level, the CST logic can get or create the @records[$key1][$key2]
185 // level of the mlhmmv, then copy values there.
186 mlhmmv_level_t* mlhmmv_root_look_up_or_create_then_ref_level(mlhmmv_root_t* pmap, sllmv_t* pmvkeys);
187 
188 void mlhmmv_root_put_terminal(mlhmmv_root_t* pmap, sllmv_t* pmvkeys, mv_t* pterminal_value);
189 
190 // For for-loop-over-oosvar, wherein we need to copy the submap before iterating over it
191 // (since the iteration may modify it). If the keys don't index a submap, then the return
192 // value has is_terminal = TRUE and pnext_level = NULL.
193 mlhmmv_xvalue_t mlhmmv_root_copy_xvalue(mlhmmv_root_t* pmap, sllmv_t* pmvkeys);
194 
195 // Used by for-loops over oosvars. Return value is an array of ephemeral mlrvals.
196 sllv_t* mlhmmv_root_copy_keys_from_submap(mlhmmv_root_t* pmap, sllmv_t* pmvkeys);
197 
198 // Unset value/submap from a specified level onward, also unsetting any maps which become empty as a result.
199 // Examples:
200 //   {
201 //     "a" : { "x" : 1, "y" : 2 },
202 //     "b" : { "x" : 3, "y" : 4 },
203 //   }
204 // with pmvkeys = ["a"] leaves
205 //   {
206 //     "b" : { "x" : 3, "y" : 4 },
207 //   }
208 // but with pmvkeys = ["a", "y"] leaves
209 //   {
210 //     "a" : { "x" : 1 },
211 //     "b" : { "x" : 3, "y" : 4 },
212 //   }
213 // and with pmvkeys = [] leaves
214 //   {
215 //   }
216 // Now if ["a","x"] is removed from
217 //   {
218 //     "a" : { "x" : 1 },
219 //     "b" : { "x" : 3, "y" : 4 },
220 //   }
221 // then
222 //   {
223 //     "b" : { "x" : 3, "y" : 4 },
224 //   }
225 // is left: unsetting "a":"x" leaves the map at "a" so this is unset as well.
226 void mlhmmv_root_remove(mlhmmv_root_t* pmap, sllmv_t* pmvkeys);
227 
228 // For 'emit' and 'emitp' in the DSL. These allocate lrecs, appended to the poutrecs list.
229 // * pmap is the base-level oosvar multi-level hashmap.
230 // * pkeys specify the level in the mlhmmv at which to produce data.
231 // * pnames is used to pull subsequent-level keys out into separate fields.
232 // * In case pnames isn't long enough to reach a terminal mlrval level in the mlhmmv,
233 //   do_full_prefixing specifies whether to concatenate nested mlhmmv keys into single lrec keys.
234 //
235 // Examples:
236 
237 // * pkeys reaches a terminal level:
238 //
239 //   $ mlr --opprint put -q '@sum += $x; end { emit @sum }' ../data/small
240 //   sum
241 //   4.536294
242 
243 // * pkeys reaches terminal levels:
244 //
245 //   $ mlr --opprint put -q '@sum[$a][$b] += $x; end { emit @sum, "a", "b" }' ../data/small
246 //   a   b   sum
247 //   pan pan 0.346790
248 //   pan wye 0.502626
249 //   eks pan 0.758680
250 //   eks wye 0.381399
251 //   eks zee 0.611784
252 //   wye wye 0.204603
253 //   wye pan 0.573289
254 //   zee pan 0.527126
255 //   zee wye 0.598554
256 //   hat wye 0.031442
257 
258 // * pkeys reaches non-terminal levels: non-prefixed:
259 //
260 //   $ mlr --opprint put -q '@sum[$a][$b] += $x; end { emit @sum, "a" }' ../data/small
261 //   a   pan      wye
262 //   pan 0.346790 0.502626
263 //
264 //   a   pan      wye      zee
265 //   eks 0.758680 0.381399 0.611784
266 //
267 //   a   wye      pan
268 //   wye 0.204603 0.573289
269 //
270 //   a   pan      wye
271 //   zee 0.527126 0.598554
272 //
273 //   a   wye
274 //   hat 0.031442
275 
276 // * pkeys reaches non-terminal levels: prefixed:
277 //
278 //   $ mlr --opprint put -q '@sum[$a][$b] += $x; end { emitp @sum, "a" }' ../data/small
279 //   a   sum:pan  sum:wye
280 //   pan 0.346790 0.502626
281 //
282 //   a   sum:pan  sum:wye  sum:zee
283 //   eks 0.758680 0.381399 0.611784
284 //
285 //   a   sum:wye  sum:pan
286 //   wye 0.204603 0.573289
287 //
288 //   a   sum:pan  sum:wye
289 //   zee 0.527126 0.598554
290 //
291 //   a   sum:wye
292 //   hat 0.031442
293 
294 // For 'emit all' and 'emitp all' in the DSL
295 void mlhmmv_root_all_to_lrecs(mlhmmv_root_t* pmap, sllmv_t* pnames, sllv_t* poutrecs,
296 	int do_full_prefixing, char* flatten_separator);
297 
298 // For 'emit' and 'emitp' in the DSL
299 void mlhmmv_root_partial_to_lrecs(mlhmmv_root_t* pmap, sllmv_t* pkeys, sllmv_t* pnames, sllv_t* poutrecs,
300 	int do_full_prefixing, char* flatten_separator);
301 
302 // For 'dump' in the DSL; also used by the lrec-to-JSON writer.
303 void mlhmmv_root_print_json_stacked(mlhmmv_root_t* pmap,
304 	int quote_keys_always, int quote_values_always,
305 	char* line_indent, char* line_term, FILE* ostream);
306 void mlhmmv_root_print_json_single_lines(mlhmmv_root_t* pmap, int quote_keys_always,
307 	int quote_values_always, char* line_term, FILE* ostream);
308 
309 // Used for emit of localvars. Puts the xvalue in a single-key-value-pair map
310 // keyed by the specified name. The xvalue is referenced, not copied.
311 mlhmmv_root_t* mlhmmv_wrap_name_and_xvalue(mv_t* pname, mlhmmv_xvalue_t* pxval);
312 
313 // Used for takedown of the temporary map returned by mlhmmv_wrap_name_and_xvalue. Since the xvalue there
314 // is referenced, not copied, mlhmmv_xvalue_free would prematurely free the xvalue. This method releases
315 // the xvalue so that the remaining, map-internal structures can be freed correctly.
316 void mlhmmv_unwrap_name_and_xvalue(mlhmmv_root_t* pmap);
317 
318 #endif // MLHMMV_H
319