1 /*
2 ** Copyright (C) 2001-2020 by Carnegie Mellon University.
3 **
4 ** @OPENSOURCE_LICENSE_START@
5 ** See license information in ../../LICENSE.txt
6 ** @OPENSOURCE_LICENSE_END@
7 */
8 
9 /*
10 **  hashlib.h
11 **
12 **    Defines interface to core hash library functions.
13 **
14 */
15 
16 #ifndef _HASHLIB_H
17 #define _HASHLIB_H
18 #ifdef __cplusplus
19 extern "C" {
20 #endif
21 
22 #include <silk/silk.h>
23 
24 RCSIDENTVAR(rcsID_HASHLIB_H, "$SiLK: hashlib.h ef14e54179be 2020-04-14 21:57:45Z mthomas $");
25 
26 /**
27  *  @file
28  *
29  *    Implementation of a hashtable.
30  *
31  *    This file is part of libsilk.
32  */
33 
34 
35 /*
36  *  **************************************************************************
37  */
38 
39 
40 /**
41  *    Return codes for functions in hashlib. Note that >= 0 are
42  *    success codes.
43  */
44 
45 /**  function was successful */
46 #define OK 0
47 /**  entry already exists */
48 #define OK_DUPLICATE 1
49 /**  could not find entry */
50 #define ERR_NOTFOUND -1
51 /**  no more entries available */
52 #define ERR_NOMOREENTRIES -2
53 /**  no longer in use */
54 #define ERR_INDEXOUTOFBOUNDS -3
55 /**  could not open a file */
56 #define ERR_FILEOPENERROR -4
57 /**  illegal argument value */
58 #define ERR_BADARGUMENT -5
59 /**  corrupt internal state */
60 #define ERR_INTERNALERROR -6
61 /**  operation not supported for this table*/
62 #define ERR_NOTSUPPORTED -7
63 /**  read error (corrupt data file) */
64 #define ERR_FILEREADERROR -8
65 /**  write error */
66 #define ERR_FILEWRITEERROR -9
67 /**  attempt to operate on a sorted table */
68 #define ERR_SORTTABLE -10
69 /**  attempted to aloc > max. # of blocks */
70 #define ERR_NOMOREBLOCKS -254
71 /**  a call to malloc failed */
72 #define ERR_OUTOFMEMORY -255
73 
74 /* Types of hash tables */
75 
76 /**
77  *    UNUSED.
78  */
79 #define HTT_INPLACE 0
80 
81 /**
82  *    UNUSED.
83  */
84 #define HTT_BYREFERENCE 1
85 
86 /**
87  *    Indicates table allows deletion. Items are only removed from the
88  *    table after a rehash.  Deleted items have the value
89  *    del_value_ptr.
90  *
91  *    UNSUPPORTED.
92  */
93 #define HTT_ALLOWDELETION 0
94 
95 /**
96  *    Default load is 185 (72.27%). Generally, this is the value that
97  *    should be passed to hashlib_create_table for the load factor.
98  */
99 #define DEFAULT_LOAD_FACTOR 185
100 
101 /**
102  *    Maximum number of block-indexes allowed by the hash iterator.
103  */
104 #define HASHLIB_ITER_MAX_BLOCKS 16
105 
106 /**
107  *    Maximum byte-length of the key
108  */
109 #define HASHLIB_MAX_KEY_WIDTH    UINT8_MAX
110 
111 /**
112  *    Maximum byte-length of the value.
113  */
114 #define HASHLIB_MAX_VALUE_WIDTH  UINT8_MAX
115 
116 
117 /**
118  *    The HashTable structure.
119  */
120 typedef struct HashTable_st HashTable;
121 
122 /**  HashTable iteration object */
123 typedef struct HashIter_st {
124     /* Current block. Negative value is beginning or end */
125     int         block;
126     /* Current index into block */
127     uint64_t    index;
128     /* When working with a sorted table, index into each block */
129     uint32_t    block_idx[HASHLIB_ITER_MAX_BLOCKS];
130 } HASH_ITER;
131 
132 
133 /**
134  *    Signature of a callback function used by
135  *    hashlib_sort_entries_usercmp() to sort the keys in a HashTable
136  *    prior to iterating over them.
137  *
138  *    The function is expected to compare the keys in 'key_a' and
139  *    'key_b' and return -1, 0, or 1 depending on whether 'key_a' is
140  *    less than, equal to, or greater than 'key_b'.
141  *
142  *    The 'cmp_userdata' parameter is a context pointer for the
143  *    function to use.
144  */
145 typedef int
146 (*hashlib_sort_key_cmp_fn)(
147     const void         *key_a,
148     const void         *key_b,
149     void               *cmp_userdata);
150 
151 
152 /**
153  *    Creates a new hash table. The initial table includes a single
154  *    block big enough to accomodate estimated_size entries with less
155  *    than the specified load_factor.
156  *
157  *    Parameters:
158  *
159  *    key_width:      The width of a key in bytes.
160  *    value_width:    The width of a value in bytes
161  *    data_type:      UNUSED.
162  *    no_value_ptr:   A sequence of value_width bytes used to represent
163  *                    "no value" (i.e., an empty entry).  The hash table
164  *                    makes a copy of this value.  If 'no_value_ptr' is
165  *                    NULL, values will be initialized to all 0.
166  *    app_data_ptr:   UNUSED.
167  *    app_data_size:  UNUSED.
168  *    estimated_size: An estimate of the number of unique entries that will
169  *                    ultimately be inserted into the table.
170  *    load_factor:    Generally, simply use DEFAULT_LOAD_FACTOR here. This
171  *                    specifies what load level triggers the allocation of
172  *                    a new hash block or a rehash (i.e., the maximum load
173  *                    for a block). This is a percentage expressed as a
174  *                    fraction of 255.
175  *
176  *    Returns:
177  *
178  *    A pointer to the new table.
179  *    Will return NULL in the case of a memory allocation error.
180  */
181 HashTable *
182 hashlib_create_table(
183     uint8_t             key_width,
184     uint8_t             value_width,
185     uint8_t             data_type,
186     uint8_t            *no_value_ptr,
187     uint8_t            *app_data_ptr,
188     uint32_t            app_data_size,
189     uint64_t            estimated_size,
190     uint8_t             load_factor);
191 
192 
193 /**
194  *    Modifies the hash table so that hashlib_iterate() returns
195  *    the entries sorted by their key.
196  *
197  *    NOTE THAT hashlib_iterate() WILL CONTINUE TO USE 'cmp_fn' AND
198  *    'cmp_userdata'.  THE 'cmp_fn' AND 'user_data' MUST REMAIN VALID
199  *    THE UNTIL THE TABLE IS DESTROYED!
200  *
201  *    The comparison function will be passed three parameters; the
202  *    first two parameters are hash entry keys.  'cmp_fn' must return
203  *    a value less than 0 if key 'a' is less than key 'b', greater
204  *    than 0 if key 'a' is greater than key 'b', or 0 if the two keys
205  *    are identical.  The 'user_data' parameter will be passed
206  *    unchanged to the comparison function, which allows the
207  *    comparison function to use additional data without using global
208  *    variables.
209  *
210  *    NOTE: Once the hash table has been sorted, insert, lookup, and
211  *    rehash are invalid.  One may only iterate over a sorted table or
212  *    delete it.
213  *
214  *    Parameters:
215  *
216  *    table_ptr:      A pointer to the table to sort.
217  *
218  *    cmp_fn:         The comparison function to use to compare keys
219  *
220  *    cmp_userdata:   Value passed unchanged to 'cmp_fn'.
221  *
222  *    Returns:
223  *
224  *    OK in the case when the data in the table has been successfully
225  *    sorted.
226  *
227  *    ERR_BADARGUMENT when no 'cmp_fn' is provided.
228  */
229 int
230 hashlib_sort_entries_usercmp(
231     HashTable              *table_ptr,
232     hashlib_sort_key_cmp_fn cmp_fn,
233     void                   *cmp_userdata);
234 
235 
236 /**
237  *    A wrapper around hashlib_sort_entries_usercmp() that uses
238  *    memcmp() to compare the keys.
239  */
240 int
241 hashlib_sort_entries(
242     HashTable          *table_ptr);
243 
244 
245 /*
246  *    NOT IMPLEMENTED
247  */
248 int
249 hashlib_mark_deleted(
250     HashTable          *table_ptr,
251     const uint8_t      *key_ptr);
252 
253 
254 /**
255  *    Inserts a new entry into the hash table, and returns a pointer
256  *    to the memory in the hash table used to store the value. The
257  *    application should store the value associated with the entry in
258  *    *(value_pptr). If the entry already exists, *(value_pptr) will
259  *    point to the value currently associated with the given key.
260  *
261  *    NOTE: An application should never store the sequence of bytes
262  *    used to represent "no value" here.
263  *
264  *    Parameters:
265  *
266  *    table_ptr:      A pointer to the table to modify.
267  *    key_ptr:        A pointer to a sequence of bytes (table_ptr->key_width
268  *                    bytes in length) corresponding to the key.
269  *    value_pptr:     A reference to a pointer that points to existing value
270  *                    memory associated with a key, or a newly created entry.
271  *
272  *    Returns:
273  *
274  *    OK in the case when a new entry has been added successfully,
275  *
276  *    OK_DUPLICATE if an entry with the given key already exists. May
277  *    return
278  *
279  *    ERR_OUTOFMEMORY in the case of a memory allocation failure.
280  *
281  *    ERR_SORTTABLE if hashlib_sort_entries() has been called on the table.
282  */
283 int
284 hashlib_insert(
285     HashTable          *table_ptr,
286     const uint8_t      *key_ptr,
287     uint8_t           **value_pptr);
288 
289 
290 /**
291  *    Looks up an entry with the given key in the hash table. If an
292  *    entry with the given key does not exist, value_pptr is
293  *    untouched. Otherwise, a reference to the value memory is
294  *    returned in *value_pptr as above.
295  *
296  *    Parameters:
297  *
298  *    table_ptr:      A pointer to the table to modify.
299  *    key_ptr:        A pointer to a sequence of bytes (table_ptr->key_width
300  *                    bytes in length) corresponding to the key.
301  *    value_pptr:     A reference to a pointer that points to existing value
302  *                    memory associated with a key (if in the table).
303  *
304  *    Returns:
305  *
306  *    OK if the entry exists.
307  *
308  *    ERR_NOTFOUND if the entry does not exist in the table.
309  *
310  *    ERR_SORTTABLE if hashlib_sort_entries() has been called on the table.
311  */
312 int
313 hashlib_lookup(
314     const HashTable    *table_ptr,
315     const uint8_t      *key_ptr,
316     uint8_t           **value_pptr);
317 
318 
319 /**
320  *    Creates an iterator. Calling this function is the first step in
321  *    iterating over the contents of a table. See hashlib_iterate.
322  *
323  *    table_ptr:      A pointer to the table to iterate over.
324  *
325  *    Returns:
326  *
327  *    An iterator to use in subsequent calls to hashlib_iterate().
328  */
329 HASH_ITER
330 hashlib_create_iterator(
331     const HashTable    *table_ptr);
332 
333 
334 /**
335  *    Retrieves next available entry during iteration. After calling
336  *    hashlib_create_iterator(), this function should be called
337  *    repeatedly until ERR_NOMOREENTRIES is returned. References to
338  *    the key and value associated with the entry are returned as
339  *    *(key_pptr) and *(val_pptr).
340  *
341  *    Parameters:
342  *
343  *    table_ptr:      A pointer to the current table being iterated over.
344  *    iter_ptr:       A pointer to the iterator being used.
345  *    key_pptr:       A reference to a pointer whose value will be set
346  *                    to the key in the current entry (i.e., *key_pptr
347  *                    will point to the key stored in the table).
348  *    val_pptr:       A reference to a pointer whose value will be set
349  *                    to the value in the current entry (i.e., *val_pptr
350  *                    will point to the value in the table).
351  *
352  *    Returns:
353  *
354  *    OK until the end of the table is reached.
355  *
356  *    ERR_NOMOREENTRIES to indicate the iterator has visited all entries.
357  */
358 int
359 hashlib_iterate(
360     const HashTable    *table_ptr,
361     HASH_ITER          *iter_ptr,
362     uint8_t           **key_pptr,
363     uint8_t           **val_pptr);
364 
365 
366 /**
367  *    Frees the memory associated with a table.  Does nothing if
368  *    'table_ptr' is NULL.
369  *
370  *    Parameters:
371  *
372  *    table_ptr:      The table to free.
373  */
374 void
375 hashlib_free_table(
376     HashTable          *table_ptr);
377 
378 
379 /**
380  *    Force a rehash of a table. Generally, this will be used when a
381  *    series of insertions has been completed before a very large
382  *    number of lookups (relative to the number of inserts). That is,
383  *    when it is likely that the rehash cost is less than the cost
384  *    associated with performing searches over multiple blocks.  This
385  *    is an advanced function. As a rule, it never make sense to call
386  *    this function when the table holds a single block.
387  *
388  *    Parameters:
389  *
390  *    table_ptr:      The table to rehash.
391  *
392  *    Returns:
393  *
394  *    Returns either OK or ERR_OUTOFMEMORY in the case of a memory
395  *    allocation failure.
396  *
397  *    ERR_SORTTABLE if hashlib_sort_entries() has been called on the table.
398  */
399 int
400 hashlib_rehash(
401     HashTable          *table_ptr);
402 
403 
404 /**
405  *    Returns the total number of buckets that have been allocated.
406  *
407  *    Parameters:
408  *
409  *    table_ptr:      A pointer to a table.
410  *
411  *    Returns:
412  *
413  *    A count
414  */
415 uint64_t
416 hashlib_count_buckets(
417     const HashTable    *table_ptr);
418 
419 
420 /**
421  *    Returns the total number of entries in the table.  This function
422  *    sums the current entry count for each of the blocks that make up
423  *    the table.  The return value should be equal to
424  *    hashlib_count_nonempties().
425  *
426  *    Parameters:
427  *
428  *    table_ptr:      A pointer to a table.
429  *
430  *    Returns:
431  *
432  *    A count
433  */
434 uint64_t
435 hashlib_count_entries(
436     const HashTable    *table_ptr);
437 
438 
439 /**
440  *    Returns the total number of entries in the table.  This function
441  *    does a complete scan of the table, looking at each bucket to see
442  *    if it is empty.  hashlib_count_entries() should produce the same
443  *    result in much less time.
444  *
445  *    Parameters:
446  *
447  *    table_ptr:      A pointer to a table.
448  *
449  *    Returns:
450  *
451  *    A count
452  */
453 uint64_t
454 hashlib_count_nonempties(
455     const HashTable    *table_ptr);
456 
457 
458 /*
459  *    NOT IMPLEMENTED
460  */
461 uint64_t
462 hashlib_count_nondeleted(
463     const HashTable    *table_ptr);
464 
465 
466 /*
467  *    Debugging functions for printing table info as text
468  */
469 void
470 hashlib_dump_table_header(
471     FILE               *fp,
472     const HashTable    *table_ptr);
473 void
474 hashlib_dump_table(
475     FILE               *fp,
476     const HashTable    *table_ptr);
477 
478 
479 /* #define HASHLIB_RECORD_STATS */
480 
481 /* Statistics */
482 #ifdef HASHLIB_RECORD_STATS
483 struct hashlib_stats_st {
484     /* number of allocations */
485     uint32_t blocks_allocated;
486     /* number of times table rehashed */
487     uint32_t rehashes;
488     /* number of inserts due to rehash */
489     uint64_t rehash_inserts;
490     /* number of inserts */
491     uint64_t inserts;
492     /* number of lookups */
493     uint64_t lookups;
494     /* number of finds (due to insert & lookup) */
495     uint64_t find_entries;
496     /* number of collisions */
497     uint64_t find_collisions;
498     /* number of steps required to resolve collisions */
499     uint64_t collision_hops;
500 };
501 typedef struct hashlib_stats_st hashlib_stats_t;
502 
503 void
504 hashlib_clear_stats(
505     HashTable          *table_ptr);
506 
507 void
508 hashlib_get_stats(
509     const HashTable    *table_ptr,
510     hashlib_stats_t    *hash_stats);
511 
512 void
513 hashlib_print_stats(
514     FILE               *fp,
515     const HashTable    *table_ptr);
516 #endif /* HASHLIB_RECORD_STATS */
517 
518 #ifdef __cplusplus
519 }
520 #endif
521 #endif /* _HASHLIB_H */
522 
523 /*
524 ** Local Variables:
525 ** mode:c
526 ** indent-tabs-mode:nil
527 ** c-basic-offset:4
528 ** End:
529 */
530