1 /*
2  * Routines to provide a memory-efficient hashtable.
3  *
4  * Copyright (C) 2007-2009 Wayne Davison
5  *
6  * Modified for BackupPC to use arbitrary-length binary keys, and supporting
7  * a rudimentary delete feature by Craig Barratt.  In 6/2016 rewrote to
8  * make the storage an array of pointers to entries, instead of inplace.
9  * That way entries fetched from the hashtable are still value after a
10  * resize.  Still no chaining.
11  *
12  * This program is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU General Public License as published by
14  * the Free Software Foundation; either version 3 of the License, or
15  * (at your option) any later version.
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU General Public License for more details.
21  *
22  * You should have received a copy of the GNU General Public License along
23  * with this program; if not, visit the http://fsf.org website.
24  */
25 
26 #include "backuppc.h"
27 
28 /*
29  * Simple freelist of hash table entries.  We maintain a single linked list of
30  * unused entries of each size, indexed by the FREELIST_SIZE2IDX() macro.
31  *
32  * FreeList[0] isn't used,
33  * FreeList[1] is a free list of blocks of size 8,
34  * FreeList[2] is a free list of blocks of size 16, ...
35  *
36  * eg, if you ask for a block of size 9, a block of size 16 will be returned.
37  */
38 static bpc_hashtable_key **FreeList;
39 static uint32 FreeListSz;
40 
41 /*
42  * to map size to the FreeList index we round up to the nearest multiple of 8
43  */
44 #define FREELIST_SIZE2IDX(size)         (((size) + 7) / 8)
45 #define FREELIST_IDX2SIZE(idx)          ((idx) * 8)
46 /*
47  * how many new blocks to allocate when the free list is empty
48  */
49 #define FREELIST_ALLOC_CNT              (512)
50 
51 /*
52  * allocate a single block of a given size by grabbing one off the FreeList
53  */
bpc_hashtable_entryAlloc(uint32 size)54 static bpc_hashtable_key *bpc_hashtable_entryAlloc(uint32 size)
55 {
56     uint32 freeListIdx = FREELIST_SIZE2IDX(size);
57     bpc_hashtable_key *key;
58 
59     size = FREELIST_IDX2SIZE(freeListIdx);
60     if ( freeListIdx >= FreeListSz ) {
61         /*
62          * need a bigger array of freelists
63          */
64         if ( !(FreeList = (bpc_hashtable_key**)realloc(FreeList, 2 * freeListIdx * sizeof(bpc_hashtable_key*))) ) {
65             bpc_logErrf("bpc_hashtable_entryAlloc: out of memory\n");
66             return NULL;
67         }
68         memset(FreeList + FreeListSz, 0, (2 * freeListIdx - FreeListSz) * sizeof(bpc_hashtable_key*));
69         FreeListSz = 2 * freeListIdx;
70     }
71     if ( !FreeList[freeListIdx] ) {
72         char *newBuf;
73         uint32 i;
74         /*
75          * need to populate the freelist with more blocks
76          */
77         if ( !(newBuf = (char*)malloc(size * FREELIST_ALLOC_CNT)) ) {
78             bpc_logErrf("bpc_hashtable_entryAlloc: out of memory\n");
79             return NULL;
80         }
81         FreeList[freeListIdx] = (bpc_hashtable_key*)newBuf;
82         /*
83          * chain all the buffers together in a linked list
84          */
85         key = (bpc_hashtable_key*)newBuf;
86         for ( i = 0 ; i < FREELIST_ALLOC_CNT - 1 ; i++ ) {
87             key->key = (void*)key + size;
88             key = key->key;
89         }
90         key->key = NULL;
91     }
92     key = FreeList[freeListIdx];
93     FreeList[freeListIdx] = key->key;
94     memset(key, 0, size);
95     return key;
96 }
97 
98 /*
99  * free a block of a given size by putting it back on the FreeList
100  */
bpc_hashtable_entryFree(bpc_hashtable_key * key,uint32 size)101 static void bpc_hashtable_entryFree(bpc_hashtable_key *key, uint32 size)
102 {
103     uint32 freeListIdx = FREELIST_SIZE2IDX(size);
104 
105     key->key = FreeList[freeListIdx];
106     FreeList[freeListIdx] = key;
107 }
108 
109 #define HASH_LOAD_LIMIT(size) ((size)*3/4)
110 
111 /*
112  * This implements a very simple linear hash table (no chaining etc).
113  *
114  * It has rudimentary support for delete, by flagging the deleted node.  It doesn't
115  * shift other nodes on delete, but can re-use a deleted node on insert.
116  */
117 
118 /*
119  * Create a hash table of the initial given size, with entries of size nodeSize
120  */
bpc_hashtable_create(bpc_hashtable * tbl,uint32 size,uint32 nodeSize)121 void bpc_hashtable_create(bpc_hashtable *tbl, uint32 size, uint32 nodeSize)
122 {
123     /* Pick a power of 2 that can hold the requested size. */
124     if ( (size & (size-1)) || size < 16 ) {
125         uint32 req = size;
126         size = 16;
127         while ( size < req ) {
128             size *= 2;
129         }
130     }
131     if ( !(tbl->nodes = calloc(size, sizeof(tbl->nodes[0]))) ) {
132         bpc_logErrf("bpc_hashtable_create: out of memory\n");
133         return;
134     }
135     tbl->size       = size;
136     tbl->entries    = 0;
137     tbl->entriesDel = 0;
138     tbl->nodeSize   = nodeSize;
139 
140     return;
141 }
142 
bpc_hashtable_destroy(bpc_hashtable * tbl)143 void bpc_hashtable_destroy(bpc_hashtable *tbl)
144 {
145     uint32 i;
146     for ( i = 0 ; i < tbl->size ; i++ ) {
147         if ( tbl->nodes[i] ) {
148             bpc_hashtable_entryFree(tbl->nodes[i], tbl->nodeSize);
149         }
150     }
151     free(tbl->nodes);
152 }
153 
bpc_hashtable_erase(bpc_hashtable * tbl)154 void bpc_hashtable_erase(bpc_hashtable *tbl)
155 {
156     uint32 i;
157     for ( i = 0 ; i < tbl->size ; i++ ) {
158         if ( tbl->nodes[i] ) {
159             bpc_hashtable_entryFree(tbl->nodes[i], tbl->nodeSize);
160         }
161     }
162     memset(tbl->nodes, 0, tbl->size * sizeof(tbl->nodes[0]));
163     tbl->entries    = 0;
164     tbl->entriesDel = 0;
165 }
166 
167 /*
168  * Compute a hash for a given key.  Note that it is *not* modulo the table size - the returned
169  * hash is independent of the table size, so we don't have to recompute this hash if we
170  * resize the table.  However, the current implementation does recompute the hash when
171  * we resize the hash table :(.  Oh well.
172  */
bpc_hashtable_hash(uchar * key,uint32 keyLen)173 uint32 bpc_hashtable_hash(uchar *key, uint32 keyLen)
174 {
175     /* Based on Jenkins One-at-a-time hash. */
176     uint32 ndx;
177 
178     for ( ndx = 0 ; keyLen > 0 ; keyLen-- ) {
179         ndx += *key++;
180         ndx += (ndx << 10);
181         ndx ^= (ndx >> 6);
182     }
183     ndx += (ndx << 3);
184     ndx ^= (ndx >> 11);
185     ndx += (ndx << 15);
186 
187     return ndx;
188 }
189 
190 #if 0
191 static void bpc_hashttable_check(bpc_hashtable *tbl, char *str)
192 {
193     bpc_hashtable_key **node = tbl->nodes;
194     uint i, entries = 0, entriesDel = 0;
195 
196     for ( i = 0 ; i < tbl->size ; i++, node++ ) {
197         bpc_hashtable_key *keyInfo = *node;
198         if ( !keyInfo ) {
199             continue;
200         }
201         if ( !keyInfo->key && keyInfo->keyLen == 1 ) {
202             entriesDel++;
203         } else {
204             entries++;
205         }
206     }
207     if ( entries != tbl->entries ) {
208         bpc_logErrf("bpc_hashttable_check: botch at %s on HT (%u,%u): got %u entries vs %u expected\n",
209                                 str, tbl->size, tbl->nodeSize, entries, tbl->entries);
210         tbl->entries = entries;
211     }
212     if ( entriesDel != tbl->entriesDel ) {
213         bpc_logErrf("bpc_hashttable_check: botch at %s on HT (%u,%u): got %u entriesDel vs %u expected\n",
214                                 str, tbl->size, tbl->nodeSize, entriesDel, tbl->entriesDel);
215         tbl->entriesDel = entriesDel;
216     }
217 }
218 #endif
219 
220 /*
221  * Ensure the hash table is of size at least newSize
222  */
bpc_hashtable_growSize(bpc_hashtable * tbl,uint32 newSize)223 void bpc_hashtable_growSize(bpc_hashtable *tbl, uint32 newSize)
224 {
225     bpc_hashtable_key **old_nodes = tbl->nodes;
226     bpc_hashtable_key **old_node  = tbl->nodes;
227     uint32 oldSize  = tbl->size;
228     uint i, j, ndx;
229 
230     /* Pick a power of 2 that can hold the requested newSize. */
231     if ( (newSize & (newSize-1)) || newSize < 16 ) {
232         uint32 req = newSize;
233         newSize = 16;
234         while ( newSize < req ) {
235             newSize *= 2;
236         }
237     }
238     if ( tbl->size >= newSize ) return;
239     if ( !(tbl->nodes = (bpc_hashtable_key**)calloc(newSize, sizeof(tbl->nodes[0]))) ) {
240         bpc_logErrf("bpc_hashtable_create: out of memory\n");
241         return;
242     }
243     tbl->entries    = 0;
244     tbl->entriesDel = 0;
245     tbl->size       = newSize;
246 
247     for ( i = 0 ; i < oldSize ; i++, old_node++ ) {
248         bpc_hashtable_key *keyInfo = *old_node;
249 
250         /* empty slot */
251         if ( !keyInfo ) continue;
252 
253         /* deleted slot: free it and don't reinsert */
254         if ( !keyInfo->key && keyInfo->keyLen == 1 ) {
255             bpc_hashtable_entryFree(keyInfo, tbl->nodeSize);
256             continue;
257         }
258         ndx = keyInfo->keyHash & (tbl->size - 1);
259         for ( j = 0 ; j < tbl->size ; j++, ndx++ ) {
260             if ( ndx >= tbl->size ) ndx = 0;
261             if ( tbl->nodes[ndx] ) continue;
262             tbl->nodes[ndx] = keyInfo;
263             tbl->entries++;
264             break;
265         }
266         if ( j >= tbl->size ) {
267             bpc_logErrf("bpc_hashtable_growSize: botch on filling new hashtable (%d,%d)\n", newSize, tbl->entries);
268             return;
269         }
270     }
271     free(old_nodes);
272 }
273 
274 /*
275  * This returns the node for the indicated key, either newly created or
276  * already existing.  Returns NULL if not allocating and not found.
277  */
bpc_hashtable_find(bpc_hashtable * tbl,unsigned char * key,unsigned int keyLen,int allocate_if_missing)278 void *bpc_hashtable_find(bpc_hashtable *tbl, unsigned char *key, unsigned int keyLen, int allocate_if_missing)
279 {
280     bpc_hashtable_key *keyInfo, *keyDeleted = NULL;
281     uint32 i, ndx, keyHash;
282 
283     if ( allocate_if_missing && tbl->entries + tbl->entriesDel > HASH_LOAD_LIMIT(tbl->size) ) {
284         bpc_hashtable_growSize(tbl, tbl->size * 2);
285     }
286 
287     /* bpc_hashttable_check(tbl, "find"); */
288 
289     /*
290      * If it already exists, return the node.  If we're not
291      * allocating, return NULL if the key is not found.
292      */
293     ndx = keyHash = bpc_hashtable_hash(key, keyLen);
294     ndx &= tbl->size - 1;
295     for ( i = 0 ; i < tbl->size ; i++ ) {
296         keyInfo = tbl->nodes[ndx];
297 
298         if ( !keyInfo ) {
299             /*
300              * Not found since we hit an empty node (ie: not a deleted one)
301              * If requested, place the new at a prior deleted node, or here
302              */
303             if ( allocate_if_missing ) {
304                 tbl->entries++;
305                 if ( keyDeleted ) {
306                     /*
307                      * we found a prior deleted entry, so use it instead
308                      */
309                     keyInfo = keyDeleted;
310                     tbl->entriesDel--;
311                 } else {
312                     tbl->nodes[ndx] = keyInfo = bpc_hashtable_entryAlloc(tbl->nodeSize);
313                 }
314                 keyInfo->key     = key;
315                 keyInfo->keyLen  = keyLen;
316                 keyInfo->keyHash = keyHash;
317                 /* TODO - check this? */
318                 if ( !key ) {
319                     bpc_logErrf("bpc_hashtable_find: botch adding NULL key to hT (%d,%d)\n", tbl->size, tbl->nodeSize);
320                 }
321                 return (void*)keyInfo;
322             }
323             return (void*)NULL;
324         } else {
325             if ( !keyInfo->key && keyInfo->keyLen == 1 ) {
326                 if ( !keyDeleted ) {
327                     /*
328                      * this is the first deleted slot, which we remember so we can insert a new entry
329                      * here if we don't find the desired entry, and allocate_if_missing != 0
330                      */
331                     keyDeleted = keyInfo;
332                 }
333             } else if ( keyInfo->keyHash == keyHash && keyInfo->keyLen == keyLen && !memcmp(key, keyInfo->key, keyLen) ) {
334                 return (void*)keyInfo;
335             }
336         }
337         ndx++;
338         if ( ndx >= tbl->size ) ndx = 0;
339     }
340     return (void*)NULL;
341 }
342 
343 /*
344  * Remove a node from the hash table.  Node must be a valid node returned by bpc_hashtable_find!
345  * Node gets cleared.
346  */
bpc_hashtable_nodeDelete(bpc_hashtable * tbl,void * node)347 void bpc_hashtable_nodeDelete(bpc_hashtable *tbl, void *node)
348 {
349     bpc_hashtable_key *keyInfo = (bpc_hashtable_key*)node;
350 
351     memset(node, 0, tbl->nodeSize);
352     /*
353      * special delete flag (key is NULL, keyLen is 1), so that the linear hash table continues
354      * finding entries past this point.
355      * TODO optimization: if the next entry is empty, then we can make this empty too.
356      */
357     keyInfo->keyLen = 1;
358     tbl->entries--;
359     tbl->entriesDel++;
360 
361     /* bpc_hashttable_check(tbl, "delete"); */
362 }
363 
364 /*
365  * Iterate over all the entries in the hash table, calling a callback for each valid entry
366  *
367  * Note: this function won't work if the callback adds new entries to the hash table while
368  * iterating over the entries.  You can update or delete entries, but adding an entry might
369  * cause the * hash table size to be bumped, which breaks the indexing.  So don't add new
370  * entries while iterating over the table.
371  */
bpc_hashtable_iterate(bpc_hashtable * tbl,void (* callback)(void *,void *),void * arg1)372 void bpc_hashtable_iterate(bpc_hashtable *tbl, void (*callback)(void*, void*), void *arg1)
373 {
374     uint i, entries = 0, entriesDel = 0;
375 
376     /* bpc_hashttable_check(tbl, "iterate"); */
377 
378     for ( i = 0 ; i < tbl->size ; i++ ) {
379         bpc_hashtable_key *keyInfo = tbl->nodes[i];
380 
381         if ( !keyInfo ) continue;
382         if ( !keyInfo->key ) {
383             if ( keyInfo->keyLen == 1 ) entriesDel++;
384             continue;
385         }
386         (*callback)((void*)keyInfo, arg1);
387         if ( !keyInfo->key ) {
388             if ( keyInfo->keyLen == 1 ) entriesDel++;
389             continue;
390         } else {
391             entries++;
392         }
393     }
394     if ( entries != tbl->entries ) {
395         bpc_logErrf("bpc_hashtable_iterate: botch on HT (%u,%u): got %u entries vs %u expected\n",
396                                 tbl->size, tbl->nodeSize, entries, tbl->entries);
397         tbl->entries = entries;
398     }
399     if ( entriesDel != tbl->entriesDel ) {
400         bpc_logErrf("bpc_hashtable_iterate: botch on HT (%u,%u): got %u entriesDel vs %u expected\n",
401                                 tbl->size, tbl->nodeSize, entriesDel, tbl->entriesDel);
402         tbl->entriesDel = entriesDel;
403     }
404 }
405 
406 /*
407  * An alternative way to iterate over all the hash table entries.  Initially index should
408  * be zero, and is updated on each call.  A pointer to each entry is returned.  After
409  * the last entry, NULL is returned, and idx is set back to zero.
410  *
411  * Note: this function won't work if you add new entries to the hash table while iterating
412  * over the entries.  You can update or delete entries, but adding an entry might cause the
413  * hash table size to be bumped, which breaks the indexing.  So don't add new entries while
414  * iterating over the table.
415  */
bpc_hashtable_nextEntry(bpc_hashtable * tbl,uint * idx)416 void *bpc_hashtable_nextEntry(bpc_hashtable *tbl, uint *idx)
417 {
418     uint i = *idx;
419 
420     /* bpc_hashttable_check(tbl, "next entry"); */
421 
422     for ( ; i < (uint)tbl->size ; i++ ) {
423         bpc_hashtable_key *keyInfo = tbl->nodes[i];
424         if ( !keyInfo || !keyInfo->key ) continue;
425         *idx = i + 1;
426         return (void*)keyInfo;
427     }
428     *idx = 0;
429     return NULL;
430 }
431 
432 /*
433  * Return the number of entries in the hash table
434  */
bpc_hashtable_entryCount(bpc_hashtable * tbl)435 int bpc_hashtable_entryCount(bpc_hashtable *tbl)
436 {
437     /* bpc_hashttable_check(tbl, "entryCount"); */
438     return tbl->entries;
439 }
440