1 /*
2 * Routines to provide a memory-efficient hashtable.
3 *
4 * Copyright (C) 2007-2009 Wayne Davison
5 *
6 * Modified for BackupPC to use arbitrary-length binary keys, and supporting
7 * a rudimentary delete feature by Craig Barratt. In 6/2016 rewrote to
8 * make the storage an array of pointers to entries, instead of inplace.
9 * That way entries fetched from the hashtable are still value after a
10 * resize. Still no chaining.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 3 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, visit the http://fsf.org website.
24 */
25
26 #include "backuppc.h"
27
28 /*
29 * Simple freelist of hash table entries. We maintain a single linked list of
30 * unused entries of each size, indexed by the FREELIST_SIZE2IDX() macro.
31 *
32 * FreeList[0] isn't used,
33 * FreeList[1] is a free list of blocks of size 8,
34 * FreeList[2] is a free list of blocks of size 16, ...
35 *
36 * eg, if you ask for a block of size 9, a block of size 16 will be returned.
37 */
38 static bpc_hashtable_key **FreeList;
39 static uint32 FreeListSz;
40
41 /*
42 * to map size to the FreeList index we round up to the nearest multiple of 8
43 */
44 #define FREELIST_SIZE2IDX(size) (((size) + 7) / 8)
45 #define FREELIST_IDX2SIZE(idx) ((idx) * 8)
46 /*
47 * how many new blocks to allocate when the free list is empty
48 */
49 #define FREELIST_ALLOC_CNT (512)
50
51 /*
52 * allocate a single block of a given size by grabbing one off the FreeList
53 */
bpc_hashtable_entryAlloc(uint32 size)54 static bpc_hashtable_key *bpc_hashtable_entryAlloc(uint32 size)
55 {
56 uint32 freeListIdx = FREELIST_SIZE2IDX(size);
57 bpc_hashtable_key *key;
58
59 size = FREELIST_IDX2SIZE(freeListIdx);
60 if ( freeListIdx >= FreeListSz ) {
61 /*
62 * need a bigger array of freelists
63 */
64 if ( !(FreeList = (bpc_hashtable_key**)realloc(FreeList, 2 * freeListIdx * sizeof(bpc_hashtable_key*))) ) {
65 bpc_logErrf("bpc_hashtable_entryAlloc: out of memory\n");
66 return NULL;
67 }
68 memset(FreeList + FreeListSz, 0, (2 * freeListIdx - FreeListSz) * sizeof(bpc_hashtable_key*));
69 FreeListSz = 2 * freeListIdx;
70 }
71 if ( !FreeList[freeListIdx] ) {
72 char *newBuf;
73 uint32 i;
74 /*
75 * need to populate the freelist with more blocks
76 */
77 if ( !(newBuf = (char*)malloc(size * FREELIST_ALLOC_CNT)) ) {
78 bpc_logErrf("bpc_hashtable_entryAlloc: out of memory\n");
79 return NULL;
80 }
81 FreeList[freeListIdx] = (bpc_hashtable_key*)newBuf;
82 /*
83 * chain all the buffers together in a linked list
84 */
85 key = (bpc_hashtable_key*)newBuf;
86 for ( i = 0 ; i < FREELIST_ALLOC_CNT - 1 ; i++ ) {
87 key->key = (void*)key + size;
88 key = key->key;
89 }
90 key->key = NULL;
91 }
92 key = FreeList[freeListIdx];
93 FreeList[freeListIdx] = key->key;
94 memset(key, 0, size);
95 return key;
96 }
97
98 /*
99 * free a block of a given size by putting it back on the FreeList
100 */
bpc_hashtable_entryFree(bpc_hashtable_key * key,uint32 size)101 static void bpc_hashtable_entryFree(bpc_hashtable_key *key, uint32 size)
102 {
103 uint32 freeListIdx = FREELIST_SIZE2IDX(size);
104
105 key->key = FreeList[freeListIdx];
106 FreeList[freeListIdx] = key;
107 }
108
109 #define HASH_LOAD_LIMIT(size) ((size)*3/4)
110
111 /*
112 * This implements a very simple linear hash table (no chaining etc).
113 *
114 * It has rudimentary support for delete, by flagging the deleted node. It doesn't
115 * shift other nodes on delete, but can re-use a deleted node on insert.
116 */
117
118 /*
119 * Create a hash table of the initial given size, with entries of size nodeSize
120 */
bpc_hashtable_create(bpc_hashtable * tbl,uint32 size,uint32 nodeSize)121 void bpc_hashtable_create(bpc_hashtable *tbl, uint32 size, uint32 nodeSize)
122 {
123 /* Pick a power of 2 that can hold the requested size. */
124 if ( (size & (size-1)) || size < 16 ) {
125 uint32 req = size;
126 size = 16;
127 while ( size < req ) {
128 size *= 2;
129 }
130 }
131 if ( !(tbl->nodes = calloc(size, sizeof(tbl->nodes[0]))) ) {
132 bpc_logErrf("bpc_hashtable_create: out of memory\n");
133 return;
134 }
135 tbl->size = size;
136 tbl->entries = 0;
137 tbl->entriesDel = 0;
138 tbl->nodeSize = nodeSize;
139
140 return;
141 }
142
bpc_hashtable_destroy(bpc_hashtable * tbl)143 void bpc_hashtable_destroy(bpc_hashtable *tbl)
144 {
145 uint32 i;
146 for ( i = 0 ; i < tbl->size ; i++ ) {
147 if ( tbl->nodes[i] ) {
148 bpc_hashtable_entryFree(tbl->nodes[i], tbl->nodeSize);
149 }
150 }
151 free(tbl->nodes);
152 }
153
bpc_hashtable_erase(bpc_hashtable * tbl)154 void bpc_hashtable_erase(bpc_hashtable *tbl)
155 {
156 uint32 i;
157 for ( i = 0 ; i < tbl->size ; i++ ) {
158 if ( tbl->nodes[i] ) {
159 bpc_hashtable_entryFree(tbl->nodes[i], tbl->nodeSize);
160 }
161 }
162 memset(tbl->nodes, 0, tbl->size * sizeof(tbl->nodes[0]));
163 tbl->entries = 0;
164 tbl->entriesDel = 0;
165 }
166
167 /*
168 * Compute a hash for a given key. Note that it is *not* modulo the table size - the returned
169 * hash is independent of the table size, so we don't have to recompute this hash if we
170 * resize the table. However, the current implementation does recompute the hash when
171 * we resize the hash table :(. Oh well.
172 */
bpc_hashtable_hash(uchar * key,uint32 keyLen)173 uint32 bpc_hashtable_hash(uchar *key, uint32 keyLen)
174 {
175 /* Based on Jenkins One-at-a-time hash. */
176 uint32 ndx;
177
178 for ( ndx = 0 ; keyLen > 0 ; keyLen-- ) {
179 ndx += *key++;
180 ndx += (ndx << 10);
181 ndx ^= (ndx >> 6);
182 }
183 ndx += (ndx << 3);
184 ndx ^= (ndx >> 11);
185 ndx += (ndx << 15);
186
187 return ndx;
188 }
189
190 #if 0
191 static void bpc_hashttable_check(bpc_hashtable *tbl, char *str)
192 {
193 bpc_hashtable_key **node = tbl->nodes;
194 uint i, entries = 0, entriesDel = 0;
195
196 for ( i = 0 ; i < tbl->size ; i++, node++ ) {
197 bpc_hashtable_key *keyInfo = *node;
198 if ( !keyInfo ) {
199 continue;
200 }
201 if ( !keyInfo->key && keyInfo->keyLen == 1 ) {
202 entriesDel++;
203 } else {
204 entries++;
205 }
206 }
207 if ( entries != tbl->entries ) {
208 bpc_logErrf("bpc_hashttable_check: botch at %s on HT (%u,%u): got %u entries vs %u expected\n",
209 str, tbl->size, tbl->nodeSize, entries, tbl->entries);
210 tbl->entries = entries;
211 }
212 if ( entriesDel != tbl->entriesDel ) {
213 bpc_logErrf("bpc_hashttable_check: botch at %s on HT (%u,%u): got %u entriesDel vs %u expected\n",
214 str, tbl->size, tbl->nodeSize, entriesDel, tbl->entriesDel);
215 tbl->entriesDel = entriesDel;
216 }
217 }
218 #endif
219
220 /*
221 * Ensure the hash table is of size at least newSize
222 */
bpc_hashtable_growSize(bpc_hashtable * tbl,uint32 newSize)223 void bpc_hashtable_growSize(bpc_hashtable *tbl, uint32 newSize)
224 {
225 bpc_hashtable_key **old_nodes = tbl->nodes;
226 bpc_hashtable_key **old_node = tbl->nodes;
227 uint32 oldSize = tbl->size;
228 uint i, j, ndx;
229
230 /* Pick a power of 2 that can hold the requested newSize. */
231 if ( (newSize & (newSize-1)) || newSize < 16 ) {
232 uint32 req = newSize;
233 newSize = 16;
234 while ( newSize < req ) {
235 newSize *= 2;
236 }
237 }
238 if ( tbl->size >= newSize ) return;
239 if ( !(tbl->nodes = (bpc_hashtable_key**)calloc(newSize, sizeof(tbl->nodes[0]))) ) {
240 bpc_logErrf("bpc_hashtable_create: out of memory\n");
241 return;
242 }
243 tbl->entries = 0;
244 tbl->entriesDel = 0;
245 tbl->size = newSize;
246
247 for ( i = 0 ; i < oldSize ; i++, old_node++ ) {
248 bpc_hashtable_key *keyInfo = *old_node;
249
250 /* empty slot */
251 if ( !keyInfo ) continue;
252
253 /* deleted slot: free it and don't reinsert */
254 if ( !keyInfo->key && keyInfo->keyLen == 1 ) {
255 bpc_hashtable_entryFree(keyInfo, tbl->nodeSize);
256 continue;
257 }
258 ndx = keyInfo->keyHash & (tbl->size - 1);
259 for ( j = 0 ; j < tbl->size ; j++, ndx++ ) {
260 if ( ndx >= tbl->size ) ndx = 0;
261 if ( tbl->nodes[ndx] ) continue;
262 tbl->nodes[ndx] = keyInfo;
263 tbl->entries++;
264 break;
265 }
266 if ( j >= tbl->size ) {
267 bpc_logErrf("bpc_hashtable_growSize: botch on filling new hashtable (%d,%d)\n", newSize, tbl->entries);
268 return;
269 }
270 }
271 free(old_nodes);
272 }
273
274 /*
275 * This returns the node for the indicated key, either newly created or
276 * already existing. Returns NULL if not allocating and not found.
277 */
bpc_hashtable_find(bpc_hashtable * tbl,unsigned char * key,unsigned int keyLen,int allocate_if_missing)278 void *bpc_hashtable_find(bpc_hashtable *tbl, unsigned char *key, unsigned int keyLen, int allocate_if_missing)
279 {
280 bpc_hashtable_key *keyInfo, *keyDeleted = NULL;
281 uint32 i, ndx, keyHash;
282
283 if ( allocate_if_missing && tbl->entries + tbl->entriesDel > HASH_LOAD_LIMIT(tbl->size) ) {
284 bpc_hashtable_growSize(tbl, tbl->size * 2);
285 }
286
287 /* bpc_hashttable_check(tbl, "find"); */
288
289 /*
290 * If it already exists, return the node. If we're not
291 * allocating, return NULL if the key is not found.
292 */
293 ndx = keyHash = bpc_hashtable_hash(key, keyLen);
294 ndx &= tbl->size - 1;
295 for ( i = 0 ; i < tbl->size ; i++ ) {
296 keyInfo = tbl->nodes[ndx];
297
298 if ( !keyInfo ) {
299 /*
300 * Not found since we hit an empty node (ie: not a deleted one)
301 * If requested, place the new at a prior deleted node, or here
302 */
303 if ( allocate_if_missing ) {
304 tbl->entries++;
305 if ( keyDeleted ) {
306 /*
307 * we found a prior deleted entry, so use it instead
308 */
309 keyInfo = keyDeleted;
310 tbl->entriesDel--;
311 } else {
312 tbl->nodes[ndx] = keyInfo = bpc_hashtable_entryAlloc(tbl->nodeSize);
313 }
314 keyInfo->key = key;
315 keyInfo->keyLen = keyLen;
316 keyInfo->keyHash = keyHash;
317 /* TODO - check this? */
318 if ( !key ) {
319 bpc_logErrf("bpc_hashtable_find: botch adding NULL key to hT (%d,%d)\n", tbl->size, tbl->nodeSize);
320 }
321 return (void*)keyInfo;
322 }
323 return (void*)NULL;
324 } else {
325 if ( !keyInfo->key && keyInfo->keyLen == 1 ) {
326 if ( !keyDeleted ) {
327 /*
328 * this is the first deleted slot, which we remember so we can insert a new entry
329 * here if we don't find the desired entry, and allocate_if_missing != 0
330 */
331 keyDeleted = keyInfo;
332 }
333 } else if ( keyInfo->keyHash == keyHash && keyInfo->keyLen == keyLen && !memcmp(key, keyInfo->key, keyLen) ) {
334 return (void*)keyInfo;
335 }
336 }
337 ndx++;
338 if ( ndx >= tbl->size ) ndx = 0;
339 }
340 return (void*)NULL;
341 }
342
343 /*
344 * Remove a node from the hash table. Node must be a valid node returned by bpc_hashtable_find!
345 * Node gets cleared.
346 */
bpc_hashtable_nodeDelete(bpc_hashtable * tbl,void * node)347 void bpc_hashtable_nodeDelete(bpc_hashtable *tbl, void *node)
348 {
349 bpc_hashtable_key *keyInfo = (bpc_hashtable_key*)node;
350
351 memset(node, 0, tbl->nodeSize);
352 /*
353 * special delete flag (key is NULL, keyLen is 1), so that the linear hash table continues
354 * finding entries past this point.
355 * TODO optimization: if the next entry is empty, then we can make this empty too.
356 */
357 keyInfo->keyLen = 1;
358 tbl->entries--;
359 tbl->entriesDel++;
360
361 /* bpc_hashttable_check(tbl, "delete"); */
362 }
363
364 /*
365 * Iterate over all the entries in the hash table, calling a callback for each valid entry
366 *
367 * Note: this function won't work if the callback adds new entries to the hash table while
368 * iterating over the entries. You can update or delete entries, but adding an entry might
369 * cause the * hash table size to be bumped, which breaks the indexing. So don't add new
370 * entries while iterating over the table.
371 */
bpc_hashtable_iterate(bpc_hashtable * tbl,void (* callback)(void *,void *),void * arg1)372 void bpc_hashtable_iterate(bpc_hashtable *tbl, void (*callback)(void*, void*), void *arg1)
373 {
374 uint i, entries = 0, entriesDel = 0;
375
376 /* bpc_hashttable_check(tbl, "iterate"); */
377
378 for ( i = 0 ; i < tbl->size ; i++ ) {
379 bpc_hashtable_key *keyInfo = tbl->nodes[i];
380
381 if ( !keyInfo ) continue;
382 if ( !keyInfo->key ) {
383 if ( keyInfo->keyLen == 1 ) entriesDel++;
384 continue;
385 }
386 (*callback)((void*)keyInfo, arg1);
387 if ( !keyInfo->key ) {
388 if ( keyInfo->keyLen == 1 ) entriesDel++;
389 continue;
390 } else {
391 entries++;
392 }
393 }
394 if ( entries != tbl->entries ) {
395 bpc_logErrf("bpc_hashtable_iterate: botch on HT (%u,%u): got %u entries vs %u expected\n",
396 tbl->size, tbl->nodeSize, entries, tbl->entries);
397 tbl->entries = entries;
398 }
399 if ( entriesDel != tbl->entriesDel ) {
400 bpc_logErrf("bpc_hashtable_iterate: botch on HT (%u,%u): got %u entriesDel vs %u expected\n",
401 tbl->size, tbl->nodeSize, entriesDel, tbl->entriesDel);
402 tbl->entriesDel = entriesDel;
403 }
404 }
405
406 /*
407 * An alternative way to iterate over all the hash table entries. Initially index should
408 * be zero, and is updated on each call. A pointer to each entry is returned. After
409 * the last entry, NULL is returned, and idx is set back to zero.
410 *
411 * Note: this function won't work if you add new entries to the hash table while iterating
412 * over the entries. You can update or delete entries, but adding an entry might cause the
413 * hash table size to be bumped, which breaks the indexing. So don't add new entries while
414 * iterating over the table.
415 */
bpc_hashtable_nextEntry(bpc_hashtable * tbl,uint * idx)416 void *bpc_hashtable_nextEntry(bpc_hashtable *tbl, uint *idx)
417 {
418 uint i = *idx;
419
420 /* bpc_hashttable_check(tbl, "next entry"); */
421
422 for ( ; i < (uint)tbl->size ; i++ ) {
423 bpc_hashtable_key *keyInfo = tbl->nodes[i];
424 if ( !keyInfo || !keyInfo->key ) continue;
425 *idx = i + 1;
426 return (void*)keyInfo;
427 }
428 *idx = 0;
429 return NULL;
430 }
431
432 /*
433 * Return the number of entries in the hash table
434 */
bpc_hashtable_entryCount(bpc_hashtable * tbl)435 int bpc_hashtable_entryCount(bpc_hashtable *tbl)
436 {
437 /* bpc_hashttable_check(tbl, "entryCount"); */
438 return tbl->entries;
439 }
440