1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 /* Hash table with separate chaining.
18
19 This header (hmapx.h) supplies an "external" implementation of
20 a hash table that uses linked lists to resolve collisions
21 ("separate chaining"). Its companion header (hmap.h) supplies
22 a "embedded" implementation that is otherwise similar. The
23 two variants are described briefly here. The external
24 variant, for which this is the header, is described in
25 slightly more detail below. Each function also has a detailed
26 usage comment at its point of definition. (Many of those
27 definitions are inline in this file, because they are so
28 simple. Others are in hmapx.c.)
29
30 The "hmap" embedded hash table implementation puts the hash
31 table node (which includes the linked list used for resolving
32 collisions) within the data structure that the hash table
33 contains. This makes allocation efficient, in space and time,
34 because no additional call into an allocator is needed to
35 obtain memory for the hash table node. It also makes it easy
36 to find the hash table node associated with a given object.
37 However, it's difficult to include a given object in an
38 arbitrary number of hash tables.
39
40 The "hmapx" external hash table implementation allocates hash
41 table nodes separately from the objects in the hash table.
42 Inserting and removing hash table elements requires dynamic
43 allocation, so it is normally slower and takes more memory
44 than the embedded implementation. It also requires searching
45 the table to find the node associated with a given object.
46 However, it's easy to include a given object in an arbitrary
47 number of hash tables. It's also possible to create an
48 external hash table without adding a member to the data
49 structure that the hash table contains. */
50
51 #ifndef LIBPSPP_HMAPX_H
52 #define LIBPSPP_HMAPX_H 1
53
54 /* External hash table with separate chaining.
55
56 To create an external hash table, declare an instance of
57 struct hmapx, then initialize it with hmapx_init():
58 struct hmapx map;
59 hmapx_init (&map);
60 or, alternatively:
61 struct hmapx map = HMAPX_INITIALIZER (map);
62
63 An hmapx data structure contains data represented as void *.
64 The hmapx_insert() function inserts such a datum and returns
65 the address of a newly created struct hmapx_node that
66 represents the new element:
67 struct foo {
68 const char *key;
69 const char *value;
70 };
71 struct foo foo = {"key", "value"};
72 struct hmapx_node *node;
73 node = hmapx_insert (&map, &foo, hsh_hash_string (foo.key));
74 The element's hash value must be passed as one of
75 hmapx_insert()'s arguments. The hash table saves this hash
76 value for use later to speed searches and to rehash as the
77 hash table grows.
78
79 hmapx_insert() does not check whether the newly inserted
80 element duplicates an element already in the hash table. The
81 client is responsible for doing so, if this is desirable.
82
83 Use hmapx_delete() to delete an element from the hash table,
84 passing in its hmapx_node:
85 hmapx_delete (&map, node);
86 Deleting an element frees its node.
87
88 The hash table does not provide a direct way to search for an
89 existing element. Instead, it provides the means to iterate
90 over all the elements in the hash table with a given hash
91 value. It is easy to compose a search function from such a
92 building block. For example:
93 struct hmapx_node *
94 find_node (const struct hmapx *map, const char *target)
95 {
96 struct hmapx_node *node;
97 struct foo *foo;
98 HMAPX_FOR_EACH_WITH_HASH (foo, node, hsh_hash_string (target), map)
99 if (!strcmp (foo->key, target))
100 break;
101 return node;
102 }
103 This function's client can extract the data item from the
104 returned hmapx_node using the hmapx_node_data() function. The
105 hmapx_node can also be useful directly as an argument to other
106 hmapx functions, such as hmapx_delete().
107
108 Here is how to iterate through the elements currently in the
109 hash table:
110 struct hmapx_node *node;
111 const char *string;
112 HMAPX_FOR_EACH (data, node, &map)
113 {
114 ...do something with string...
115 }
116 */
117
118 #include "libpspp/hmap.h"
119 #include <stdlib.h>
120
121 /* Hash table node. */
122 struct hmapx_node
123 {
124 struct hmap_node hmap_node; /* Underlying hash node. */
125 void *data; /* User data. */
126 };
127
128 static inline void *hmapx_node_data (const struct hmapx_node *);
129 static inline size_t hmapx_node_hash (const struct hmapx_node *);
130
131 /* Hash table. */
132 struct hmapx
133 {
134 struct hmap hmap;
135 };
136
137 /* Suitable for use as the initializer for a struct hmapx named
138 MAP. Typical usage:
139 struct hmap map = HMAPX_INITIALIZER (map);
140 HMAPX_INITIALIZER() is an alternative to hmapx_init(). */
141 #define HMAPX_INITIALIZER(MAP) { HMAP_INITIALIZER (MAP.hmap) }
142
143 /* Creation and destruction. */
144 static inline void hmapx_init (struct hmapx *);
145 static inline void hmapx_swap (struct hmapx *, struct hmapx *);
146 void hmapx_clear (struct hmapx *);
147 void hmapx_destroy (struct hmapx *);
148
149 /* Storage management. */
150 static inline void hmapx_reserve (struct hmapx *, size_t capacity);
151 static inline void hmapx_shrink (struct hmapx *);
152
153 /* Search. */
154 static inline struct hmapx_node *hmapx_first_with_hash (struct hmapx *,
155 size_t hash);
156 static inline struct hmapx_node *hmapx_next_with_hash (struct hmapx_node *);
157
158 /* Insertion and deletion. */
159 struct hmapx_node *hmapx_insert (struct hmapx *, void *, size_t hash);
160 struct hmapx_node *hmapx_insert_fast (struct hmapx *, void *, size_t hash);
161 static inline void hmapx_delete (struct hmapx *, struct hmapx_node *);
162
163 /* Iteration. */
164 static inline struct hmapx_node *hmapx_first (const struct hmapx *);
165 static inline struct hmapx_node *hmapx_next (const struct hmapx *,
166 const struct hmapx_node *);
167
168 /* Counting. */
169 static inline bool hmapx_is_empty (const struct hmapx *);
170 static inline size_t hmapx_count (const struct hmapx *);
171 static inline size_t hmapx_capacity (const struct hmapx *);
172
173 /* Updating data elements. */
174 static inline void hmapx_change (struct hmapx *,
175 struct hmapx_node *, void *, size_t new_hash);
176 static inline void hmapx_changed (struct hmapx *, struct hmapx_node *,
177 size_t new_hash);
178 static inline void hmapx_move (struct hmapx_node *, void *);
179
180 /* Convenience macros for search.
181
182 These macros automatically use hmapx_node_data() to obtain the
183 data elements that encapsulate hmap nodes, which often saves
184 typing and can make code easier to read. Refer to the large
185 comment near the top of this file for an example.
186
187 These macros evaluate HASH only once. They evaluate their
188 other arguments many times. */
189 #define HMAPX_FOR_EACH_WITH_HASH(DATA, NODE, HASH, HMAPX) \
190 for ((NODE) = hmapx_first_with_hash (HMAPX, HASH); \
191 (NODE) != NULL ? ((DATA) = hmapx_node_data (NODE), 1) : 0; \
192 (NODE) = hmapx_next_with_hash (NODE))
193 #define HMAPX_FOR_EACH_WITH_HASH_SAFE(DATA, NODE, NEXT, HASH, HMAPX) \
194 for ((NODE) = hmapx_first_with_hash (HMAPX, HASH); \
195 ((NODE) != NULL \
196 ? ((DATA) = hmapx_node_data (NODE), \
197 (NEXT) = hmapx_next_with_hash (NODE), \
198 1) \
199 : 0); \
200 (NODE) = (NEXT))
201
202 /* Convenience macros for iteration.
203
204 These macros automatically use hmapx_node_data() to obtain the
205 data elements that encapsulate hmap nodes, which often saves
206 typing and can make code easier to read. Refer to the large
207 comment near the top of this file for an example.
208
209 These macros evaluate their arguments many times. */
210 #define HMAPX_FOR_EACH(DATA, NODE, HMAPX) \
211 for ((NODE) = hmapx_first (HMAPX); \
212 (NODE) != NULL ? ((DATA) = hmapx_node_data (NODE), 1) : 0; \
213 (NODE) = hmapx_next (HMAPX, NODE))
214 #define HMAPX_FOR_EACH_SAFE(DATA, NODE, NEXT, HMAPX) \
215 for ((NODE) = hmapx_first (HMAPX); \
216 ((NODE) != NULL \
217 ? ((DATA) = hmapx_node_data (NODE), \
218 (NEXT) = hmapx_next (HMAPX, NODE), \
219 1) \
220 : 0); \
221 (NODE) = (NEXT))
222
223 /* Inline definitions. */
224
225 /* Returns the data stored in NODE. */
226 static inline void *
hmapx_node_data(const struct hmapx_node * node)227 hmapx_node_data (const struct hmapx_node *node)
228 {
229 return node->data;
230 }
231
232 /* Returns the hash value stored in NODE */
233 static inline size_t
hmapx_node_hash(const struct hmapx_node * node)234 hmapx_node_hash (const struct hmapx_node *node)
235 {
236 return hmap_node_hash (&node->hmap_node);
237 }
238
239 /* Initializes MAP as a new hash map that is initially empty. */
240 static inline void
hmapx_init(struct hmapx * map)241 hmapx_init (struct hmapx *map)
242 {
243 hmap_init (&map->hmap);
244 }
245
246 /* Exchanges the contents of hash maps A and B. */
247 static inline void
hmapx_swap(struct hmapx * a,struct hmapx * b)248 hmapx_swap (struct hmapx *a, struct hmapx *b)
249 {
250 hmap_swap (&a->hmap, &b->hmap);
251 }
252
253 /* Ensures that MAP has sufficient space to store at least
254 CAPACITY data elements, allocating a new set of buckets and
255 rehashing if necessary. */
256 static inline void
hmapx_reserve(struct hmapx * map,size_t capacity)257 hmapx_reserve (struct hmapx *map, size_t capacity)
258 {
259 hmap_reserve (&map->hmap, capacity);
260 }
261
262 /* Shrinks MAP's set of buckets to the minimum number needed to
263 store its current number of elements, allocating a new set of
264 buckets and rehashing if that would save space. */
265 static inline void
hmapx_shrink(struct hmapx * map)266 hmapx_shrink (struct hmapx *map)
267 {
268 hmap_shrink (&map->hmap);
269 }
270
271 /* Returns the first node in MAP that has hash value HASH, or a
272 null pointer if MAP does not contain any node with that hash
273 value.
274
275 Assuming uniform hashing and no duplicate data items in MAP,
276 this function runs in constant time. (Amortized over an
277 iteration over all data items with a given HASH, its runtime
278 is proportional to the length of the hash chain for HASH, so
279 given a pathological hash function, e.g. one that returns a
280 constant value, its runtime degenerates to linear in the
281 length of NODE's hash chain.)
282
283 Nodes are returned in arbitrary order that may change whenever
284 the hash table's current capacity changes, as reported by
285 hmapx_capacity(). Calls to hmapx_insert(), hmapx_reserve(),
286 and hmapx_shrink() can change the capacity of a hash map.
287 Inserting a node with hmapx_insert_fast() or deleting one with
288 hmapx_delete() will not change the relative ordering of nodes.
289
290 The HMAPX_FOR_EACH_WITH_HASH and HMAPX_FOR_EACH_WITH_HASH_SAFE
291 macros provide convenient ways to iterate over all the nodes
292 with a given hash. */
293 static inline struct hmapx_node *
hmapx_first_with_hash(struct hmapx * map,size_t hash)294 hmapx_first_with_hash (struct hmapx *map, size_t hash)
295 {
296 return HMAP_FIRST_WITH_HASH (struct hmapx_node, hmap_node, &map->hmap, hash);
297 }
298
299 /* Returns the next node in MAP after NODE that has the same hash
300 value as NODE, or a null pointer if MAP does not contain any
301 more nodes with that hash value.
302
303 Assuming uniform hashing and no duplicate data items in MAP,
304 this function runs in constant time. (Amortized over an
305 iteration over all data items with a given HASH, its runtime
306 is proportional to the length of the hash chain for HASH, so
307 given a pathological hash function, e.g. one that returns a
308 constant value, its runtime degenerates to linear in the
309 length of NODE's hash chain.)
310
311 Nodes are returned in arbitrary order that may change whenever
312 the hash table's current capacity changes, as reported by
313 hmapx_capacity(). Calls to hmapx_insert(), hmapx_reserve(),
314 and hmapx_shrink() can change the capacity of a hash map.
315 Inserting a node with hmapx_insert_fast() or deleting one with
316 hmapx_delete() will not change the relative ordering of nodes.
317
318 The HMAPX_FOR_EACH_WITH_HASH and HMAPX_FOR_EACH_WITH_HASH_SAFE
319 macros provide convenient ways to iterate over all the nodes
320 with a given hash. */
321 static inline struct hmapx_node *
hmapx_next_with_hash(struct hmapx_node * node)322 hmapx_next_with_hash (struct hmapx_node *node)
323 {
324 return HMAP_NEXT_WITH_HASH (node, struct hmapx_node, hmap_node);
325 }
326
327 /* Removes NODE from MAP and frees NODE. The client is
328 responsible for freeing the user data associated with NODE, if
329 appropriate.
330
331 Assuming uniform hashing, this function runs in constant time.
332 (Its runtime is proportional to the position of NODE in its
333 hash chain, so given a pathological hash function, e.g. one
334 that returns a constant value, its runtime degenerates to
335 linear in the length of NODE's hash chain.)
336
337 This function never reduces the number of buckets in MAP.
338 When one deletes a large number of nodes from a hash table,
339 calling hmapx_shrink() afterward may therefore save a small
340 amount of memory. It is also more expensive to iterate
341 through a very sparse hash table than a denser one, so
342 shrinking the hash table could also save some time. However,
343 rehashing has an immediate cost that must be weighed against
344 these benefits.
345
346 hmapx_delete() does not change NODE's hash value reported by
347 hmapx_node_hash(). */
348 static inline void
hmapx_delete(struct hmapx * map,struct hmapx_node * node)349 hmapx_delete (struct hmapx *map, struct hmapx_node *node)
350 {
351 hmap_delete (&map->hmap, &node->hmap_node);
352 free (node);
353 }
354
355 /* Returns the first node in MAP, or a null pointer if MAP is
356 empty.
357
358 Amortized over iterating through every data element in MAP,
359 this function runs in constant time. However, this assumes
360 that MAP is not excessively sparse, that is, that
361 hmapx_capacity(MAP) is at most a constant factor greater than
362 hmapx_count(MAP). This will always be true unless many nodes
363 have been inserted into MAP and then most or all of them
364 deleted; in such a case, calling hmapx_shrink() is advised.
365
366 Nodes are returned in arbitrary order that may change whenever
367 the hash table's current capacity changes, as reported by
368 hmapx_capacity(). Calls to hmapx_insert(), hmapx_reserve(),
369 and hmapx_shrink() can change the capacity of a hash map.
370 Inserting a node with hmapx_insert_fast() or deleting one with
371 hmapx_delete() will not change the relative ordering of nodes.
372
373 The HMAPX_FOR_EACH and HMAPX_FOR_EACH_SAFE macros provide
374 convenient ways to iterate over all the nodes in a hash
375 map. */
376 static inline struct hmapx_node *
hmapx_first(const struct hmapx * map)377 hmapx_first (const struct hmapx *map)
378 {
379 return HMAP_FIRST (struct hmapx_node, hmap_node, &map->hmap);
380 }
381
382 /* Returns the next node in MAP following NODE, or a null pointer
383 if NODE is the last node in MAP.
384
385 Amortized over iterating through every data element in MAP,
386 this function runs in constant time. However, this assumes
387 that MAP is not excessively sparse, that is, that
388 hmapx_capacity(MAP) is at most a constant factor greater than
389 hmapx_count(MAP). This will always be true unless many nodes
390 have been inserted into MAP and then most or all of them
391 deleted; in such a case, calling hmapx_shrink() is advised.
392
393 Nodes are returned in arbitrary order that may change whenever
394 the hash table's current capacity changes, as reported by
395 hmapx_capacity(). Calls to hmapx_insert(), hmapx_reserve(),
396 and hmapx_shrink() can change the capacity of a hash map.
397 Inserting a node with hmapx_insert_fast() or deleting one with
398 hmapx_delete() will not change the relative ordering of nodes.
399
400 The HMAPX_FOR_EACH and HMAPX_FOR_EACH_SAFE macros provide
401 convenient ways to iterate over all the nodes in a hash
402 map. */
403 static inline struct hmapx_node *
hmapx_next(const struct hmapx * map,const struct hmapx_node * node)404 hmapx_next (const struct hmapx *map, const struct hmapx_node *node)
405 {
406 return HMAP_NEXT (node, struct hmapx_node, hmap_node, &map->hmap);
407 }
408
409 /* Returns true if MAP currently contains no data items, false
410 otherwise. */
411 static inline bool
hmapx_is_empty(const struct hmapx * map)412 hmapx_is_empty (const struct hmapx *map)
413 {
414 return hmap_is_empty (&map->hmap);
415 }
416
417 /* Returns the number of data items currently in MAP. */
418 static inline size_t
hmapx_count(const struct hmapx * map)419 hmapx_count (const struct hmapx *map)
420 {
421 return hmap_count (&map->hmap);
422 }
423
424 /* Returns the current capacity of MAP, that is, the maximum
425 number of data elements that MAP may hold before it becomes
426 advisable to rehash.
427
428 The capacity is advisory only: it is possible to insert any
429 number of data elements into a hash map regardless of its
430 capacity. However, inserting many more elements than the
431 map's capacity will degrade search performance. */
432 static inline size_t
hmapx_capacity(const struct hmapx * map)433 hmapx_capacity (const struct hmapx *map)
434 {
435 return hmap_capacity (&map->hmap);
436 }
437
438 /* Changes NODE's data to DATA and its hash value to NEW_HASH.
439 NODE must reside in MAP.
440
441 This function does not verify that MAP does not already
442 contain a data item that duplicates DATA. If duplicates
443 should be disallowed (which is the usual case), then the
444 client must check for duplicates before changing NODE's
445 value. */
446 static inline void
hmapx_change(struct hmapx * map,struct hmapx_node * node,void * data,size_t new_hash)447 hmapx_change (struct hmapx *map,
448 struct hmapx_node *node, void *data, size_t new_hash)
449 {
450 hmapx_move (node, data);
451 hmapx_changed (map, node, new_hash);
452 }
453
454 /* Moves NODE around in MAP to compensate for its hash value
455 having changed to NEW_HASH.
456
457 This function does not verify that MAP does not already
458 contain a data item that duplicates the new value of NODE's
459 data. If duplicates should be disallowed (which is the usual
460 case), then the client must check for duplicates before
461 changing NODE's value. */
462 static inline void
hmapx_changed(struct hmapx * map,struct hmapx_node * node,size_t new_hash)463 hmapx_changed (struct hmapx *map, struct hmapx_node *node, size_t new_hash)
464 {
465 hmap_changed (&map->hmap, &node->hmap_node, new_hash);
466 }
467
468 /* Updates NODE to compensate for its data item having moved
469 around in memory to new location DATA. The data item's value
470 and hash value should not have changed. (If they have
471 changed, call hmapx_change() instead.) */
472 static inline void
hmapx_move(struct hmapx_node * node,void * data)473 hmapx_move (struct hmapx_node *node, void *data)
474 {
475 node->data = data;
476 }
477
478 #endif /* libpspp/hmapx.h */
479