1 /*-------------------------------------------------------------------------
2  *
3  * dshash.c
4  *	  Concurrent hash tables backed by dynamic shared memory areas.
5  *
6  * This is an open hashing hash table, with a linked list at each table
7  * entry.  It supports dynamic resizing, as required to prevent the linked
8  * lists from growing too long on average.  Currently, only growing is
9  * supported: the hash table never becomes smaller.
10  *
11  * To deal with concurrency, it has a fixed size set of partitions, each of
12  * which is independently locked.  Each bucket maps to a partition; so insert,
13  * find and iterate operations normally only acquire one lock.  Therefore,
14  * good concurrency is achieved whenever such operations don't collide at the
15  * lock partition level.  However, when a resize operation begins, all
16  * partition locks must be acquired simultaneously for a brief period.  This
17  * is only expected to happen a small number of times until a stable size is
18  * found, since growth is geometric.
19  *
20  * Future versions may support iterators and incremental resizing; for now
21  * the implementation is minimalist.
22  *
23  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
24  * Portions Copyright (c) 1994, Regents of the University of California
25  *
26  * IDENTIFICATION
27  *	  src/backend/lib/dshash.c
28  *
29  *-------------------------------------------------------------------------
30  */
31 
32 #include "postgres.h"
33 
34 #include "lib/dshash.h"
35 #include "storage/ipc.h"
36 #include "storage/lwlock.h"
37 #include "utils/dsa.h"
38 #include "utils/hsearch.h"
39 #include "utils/memutils.h"
40 
41 /*
 * An item in the hash table.  This wraps the user's entry object in an
 * envelope that holds the item's hash value and a pointer to the next item
 * in the same bucket.
45  */
46 struct dshash_table_item
47 {
48 	/* The next item in the same bucket. */
49 	dsa_pointer next;
50 	/* The hashed key, to avoid having to recompute it. */
51 	dshash_hash hash;
52 	/* The user's entry object follows here.  See ENTRY_FROM_ITEM(item). */
53 };
54 
55 /*
56  * The number of partitions for locking purposes.  This is set to match
57  * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
58  * the buffer pool must be good enough for any other purpose.  This could
59  * become a runtime parameter in future.
60  */
61 #define DSHASH_NUM_PARTITIONS_LOG2 7
62 #define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)
63 
64 /* A magic value used to identify our hash tables. */
65 #define DSHASH_MAGIC 0x75ff6a20
66 
67 /*
68  * Tracking information for each lock partition.  Initially, each partition
 * corresponds to one bucket, but each time the hash table grows, every bucket
 * in the partition is split, so the number of buckets it covers doubles.
71  *
72  * We might want to add padding here so that each partition is on a different
73  * cache line, but doing so would bloat this structure considerably.
74  */
75 typedef struct dshash_partition
76 {
77 	LWLock		lock;			/* Protects all buckets in this partition. */
78 	size_t		count;			/* # of items in this partition's buckets */
79 } dshash_partition;
80 
81 /*
82  * The head object for a hash table.  This will be stored in dynamic shared
83  * memory.
84  */
85 typedef struct dshash_table_control
86 {
87 	dshash_table_handle handle;
88 	uint32		magic;
89 	dshash_partition partitions[DSHASH_NUM_PARTITIONS];
90 	int			lwlock_tranche_id;
91 
92 	/*
	 * The following members are written to only when ALL partition locks are
94 	 * held.  They can be read when any one partition lock is held.
95 	 */
96 
	/* Number of buckets, expressed as a power of 2 (8 = 256 buckets). */
98 	size_t		size_log2;		/* log2(number of buckets) */
99 	dsa_pointer buckets;		/* current bucket array */
100 } dshash_table_control;
101 
102 /*
103  * Per-backend state for a dynamic hash table.
104  */
105 struct dshash_table
106 {
107 	dsa_area   *area;			/* Backing dynamic shared memory area. */
108 	dshash_parameters params;	/* Parameters. */
109 	void	   *arg;			/* User-supplied data pointer. */
110 	dshash_table_control *control;	/* Control object in DSM. */
111 	dsa_pointer *buckets;		/* Current bucket pointers in DSM. */
112 	size_t		size_log2;		/* log2(number of buckets) */
113 	bool		find_locked;	/* Is any partition lock held by 'find'? */
114 	bool		find_exclusively_locked;	/* ... exclusively? */
115 };
116 
117 /* Given a pointer to an item, find the entry (user data) it holds. */
118 #define ENTRY_FROM_ITEM(item) \
119 	((char *)(item) + MAXALIGN(sizeof(dshash_table_item)))
120 
121 /* Given a pointer to an entry, find the item that holds it. */
122 #define ITEM_FROM_ENTRY(entry)											\
123 	((dshash_table_item *)((char *)(entry) -							\
124 							 MAXALIGN(sizeof(dshash_table_item))))
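
/*
 * For illustration, the layout of one allocation as created by
 * insert_into_bucket() below; this follows from the two macros above and the
 * dsa_allocate() call there:
 *
 *	+---------------------------------+ <- item
 *	| dshash_table_item               |    next, hash, MAXALIGN padding
 *	+---------------------------------+ <- ENTRY_FROM_ITEM(item)
 *	| user entry (entry_size bytes)   |    key in the first key_size bytes
 *	+---------------------------------+
 *
 * The total allocation is MAXALIGN(sizeof(dshash_table_item)) + entry_size.
 */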
125 
126 /* How many resize operations (bucket splits) have there been? */
127 #define NUM_SPLITS(size_log2)					\
128 	(size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
129 
130 /* How many buckets are there in each partition at a given size? */
131 #define BUCKETS_PER_PARTITION(size_log2)		\
132 	(((size_t) 1) << NUM_SPLITS(size_log2))
133 
134 /* Max entries before we need to grow.  Half + quarter = 75% load factor. */
135 #define MAX_COUNT_PER_PARTITION(hash_table)				\
136 	(BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \
137 	 BUCKETS_PER_PARTITION(hash_table->size_log2) / 4)
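
/*
 * Worked example: at size_log2 = 10 (1024 buckets) there have been
 * NUM_SPLITS = 3 splits, so BUCKETS_PER_PARTITION = 8 and
 * MAX_COUNT_PER_PARTITION = 8/2 + 8/4 = 6.  An insertion into a partition
 * whose count already exceeds 6 triggers a resize to size_log2 = 11 (see
 * dshash_find_or_insert()).
 */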
138 
139 /* Choose partition based on the highest order bits of the hash. */
140 #define PARTITION_FOR_HASH(hash)										\
141 	(hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2))
142 
143 /*
 * Find the bucket index for a given hash and table size.  Each time the table
 * doubles in size, the bucket index for a given hash value doubles and
 * possibly adds one, depending on the newly revealed hash bit, so that all
 * buckets are split.
148  */
149 #define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2)		\
150 	(hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2)))
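
/*
 * Worked example, assuming a 32-bit dshash_hash: for hash 0xDEADBEEF the top
 * 7 bits give PARTITION_FOR_HASH = 111.  At size_log2 = 8 the top 8 bits give
 * bucket index 222; after one split (size_log2 = 9) the newly revealed ninth
 * bit is 1, so the index becomes 2 * 222 + 1 = 445.
 */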
151 
152 /* The index of the first bucket in a given partition. */
153 #define BUCKET_INDEX_FOR_PARTITION(partition, size_log2)	\
154 	((partition) << NUM_SPLITS(size_log2))
155 
156 /* The head of the active bucket for a given hash value (lvalue). */
157 #define BUCKET_FOR_HASH(hash_table, hash)								\
158 	(hash_table->buckets[												\
159 		BUCKET_INDEX_FOR_HASH_AND_SIZE(hash,							\
160 									   hash_table->size_log2)])
161 
162 static void delete_item(dshash_table *hash_table,
163 						dshash_table_item *item);
164 static void resize(dshash_table *hash_table, size_t new_size);
165 static inline void ensure_valid_bucket_pointers(dshash_table *hash_table);
166 static inline dshash_table_item *find_in_bucket(dshash_table *hash_table,
167 												const void *key,
168 												dsa_pointer item_pointer);
169 static void insert_item_into_bucket(dshash_table *hash_table,
170 									dsa_pointer item_pointer,
171 									dshash_table_item *item,
172 									dsa_pointer *bucket);
173 static dshash_table_item *insert_into_bucket(dshash_table *hash_table,
174 											 const void *key,
175 											 dsa_pointer *bucket);
176 static bool delete_key_from_bucket(dshash_table *hash_table,
177 								   const void *key,
178 								   dsa_pointer *bucket_head);
179 static bool delete_item_from_bucket(dshash_table *hash_table,
180 									dshash_table_item *item,
181 									dsa_pointer *bucket_head);
182 static inline dshash_hash hash_key(dshash_table *hash_table, const void *key);
183 static inline bool equal_keys(dshash_table *hash_table,
184 							  const void *a, const void *b);
185 
186 #define PARTITION_LOCK(hash_table, i)			\
187 	(&(hash_table)->control->partitions[(i)].lock)
188 
189 /*
190  * Create a new hash table backed by the given dynamic shared area, with the
191  * given parameters.  The returned object is allocated in backend-local memory
192  * using the current MemoryContext.  'arg' will be passed through to the
193  * compare and hash functions.
194  */
195 dshash_table *
196 dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
197 {
198 	dshash_table *hash_table;
199 	dsa_pointer control;
200 
201 	/* Allocate the backend-local object representing the hash table. */
202 	hash_table = palloc(sizeof(dshash_table));
203 
204 	/* Allocate the control object in shared memory. */
205 	control = dsa_allocate(area, sizeof(dshash_table_control));
206 
207 	/* Set up the local and shared hash table structs. */
208 	hash_table->area = area;
209 	hash_table->params = *params;
210 	hash_table->arg = arg;
211 	hash_table->control = dsa_get_address(area, control);
212 	hash_table->control->handle = control;
213 	hash_table->control->magic = DSHASH_MAGIC;
214 	hash_table->control->lwlock_tranche_id = params->tranche_id;
215 
216 	/* Set up the array of lock partitions. */
217 	{
218 		dshash_partition *partitions = hash_table->control->partitions;
219 		int			tranche_id = hash_table->control->lwlock_tranche_id;
220 		int			i;
221 
222 		for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
223 		{
224 			LWLockInitialize(&partitions[i].lock, tranche_id);
225 			partitions[i].count = 0;
226 		}
227 	}
228 
229 	hash_table->find_locked = false;
230 	hash_table->find_exclusively_locked = false;
231 
232 	/*
233 	 * Set up the initial array of buckets.  Our initial size is the same as
234 	 * the number of partitions.
235 	 */
236 	hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2;
237 	hash_table->control->buckets =
238 		dsa_allocate_extended(area,
239 							  sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS,
240 							  DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
241 	if (!DsaPointerIsValid(hash_table->control->buckets))
242 	{
243 		dsa_free(area, control);
244 		ereport(ERROR,
245 				(errcode(ERRCODE_OUT_OF_MEMORY),
246 				 errmsg("out of memory"),
247 				 errdetail("Failed on DSA request of size %zu.",
248 						   sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS)));
249 	}
250 	hash_table->buckets = dsa_get_address(area,
251 										  hash_table->control->buckets);
252 	hash_table->size_log2 = hash_table->control->size_log2;
253 
254 	return hash_table;
255 }
256 
257 /*
258  * Attach to an existing hash table using a handle.  The returned object is
259  * allocated in backend-local memory using the current MemoryContext.  'arg'
260  * will be passed through to the compare and hash functions.
261  */
262 dshash_table *
263 dshash_attach(dsa_area *area, const dshash_parameters *params,
264 			  dshash_table_handle handle, void *arg)
265 {
266 	dshash_table *hash_table;
267 	dsa_pointer control;
268 
269 	/* Allocate the backend-local object representing the hash table. */
270 	hash_table = palloc(sizeof(dshash_table));
271 
272 	/* Find the control object in shared memory. */
273 	control = handle;
274 
275 	/* Set up the local hash table struct. */
276 	hash_table->area = area;
277 	hash_table->params = *params;
278 	hash_table->arg = arg;
279 	hash_table->control = dsa_get_address(area, control);
280 	hash_table->find_locked = false;
281 	hash_table->find_exclusively_locked = false;
282 	Assert(hash_table->control->magic == DSHASH_MAGIC);
283 
284 	/*
285 	 * These will later be set to the correct values by
286 	 * ensure_valid_bucket_pointers(), at which time we'll be holding a
287 	 * partition lock for interlocking against concurrent resizing.
288 	 */
289 	hash_table->buckets = NULL;
290 	hash_table->size_log2 = 0;
291 
292 	return hash_table;
293 }
294 
295 /*
296  * Detach from a hash table.  This frees backend-local resources associated
297  * with the hash table, but the hash table will continue to exist until it is
298  * either explicitly destroyed (by a backend that is still attached to it), or
299  * the area that backs it is returned to the operating system.
300  */
301 void
302 dshash_detach(dshash_table *hash_table)
303 {
304 	Assert(!hash_table->find_locked);
305 
306 	/* The hash table may have been destroyed.  Just free local memory. */
307 	pfree(hash_table);
308 }
309 
310 /*
311  * Destroy a hash table, returning all memory to the area.  The caller must be
312  * certain that no other backend will attempt to access the hash table before
 * calling this function.  Other backends must explicitly call dshash_detach to
314  * free up backend-local memory associated with the hash table.  The backend
315  * that calls dshash_destroy must not call dshash_detach.
316  */
317 void
318 dshash_destroy(dshash_table *hash_table)
319 {
320 	size_t		size;
321 	size_t		i;
322 
323 	Assert(hash_table->control->magic == DSHASH_MAGIC);
324 	ensure_valid_bucket_pointers(hash_table);
325 
326 	/* Free all the entries. */
327 	size = ((size_t) 1) << hash_table->size_log2;
328 	for (i = 0; i < size; ++i)
329 	{
330 		dsa_pointer item_pointer = hash_table->buckets[i];
331 
332 		while (DsaPointerIsValid(item_pointer))
333 		{
334 			dshash_table_item *item;
335 			dsa_pointer next_item_pointer;
336 
337 			item = dsa_get_address(hash_table->area, item_pointer);
338 			next_item_pointer = item->next;
339 			dsa_free(hash_table->area, item_pointer);
340 			item_pointer = next_item_pointer;
341 		}
342 	}
343 
344 	/*
345 	 * Vandalize the control block to help catch programming errors where
346 	 * other backends access the memory formerly occupied by this hash table.
347 	 */
348 	hash_table->control->magic = 0;
349 
350 	/* Free the active table and control object. */
351 	dsa_free(hash_table->area, hash_table->control->buckets);
352 	dsa_free(hash_table->area, hash_table->control->handle);
353 
354 	pfree(hash_table);
355 }
356 
357 /*
358  * Get a handle that can be used by other processes to attach to this hash
359  * table.
360  */
361 dshash_table_handle
362 dshash_get_hash_table_handle(dshash_table *hash_table)
363 {
364 	Assert(hash_table->control->magic == DSHASH_MAGIC);
365 
366 	return hash_table->control->handle;
367 }
368 
369 /*
370  * Look up an entry, given a key.  Returns a pointer to an entry if one can be
371  * found with the given key.  Returns NULL if the key is not found.  If a
372  * non-NULL value is returned, the entry is locked and must be released by
373  * calling dshash_release_lock.  If an error is raised before
374  * dshash_release_lock is called, the lock will be released automatically, but
375  * the caller must take care to ensure that the entry is not left corrupted.
376  * The lock mode is either shared or exclusive depending on 'exclusive'.
377  *
378  * The caller must not hold a lock already.
379  *
380  * Note that the lock held is in fact an LWLock, so interrupts will be held on
381  * return from this function, and not resumed until dshash_release_lock is
382  * called.  It is a very good idea for the caller to release the lock quickly.
383  */
384 void *
385 dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
386 {
387 	dshash_hash hash;
388 	size_t		partition;
389 	dshash_table_item *item;
390 
391 	hash = hash_key(hash_table, key);
392 	partition = PARTITION_FOR_HASH(hash);
393 
394 	Assert(hash_table->control->magic == DSHASH_MAGIC);
395 	Assert(!hash_table->find_locked);
396 
397 	LWLockAcquire(PARTITION_LOCK(hash_table, partition),
398 				  exclusive ? LW_EXCLUSIVE : LW_SHARED);
399 	ensure_valid_bucket_pointers(hash_table);
400 
401 	/* Search the active bucket. */
402 	item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
403 
404 	if (!item)
405 	{
406 		/* Not found. */
407 		LWLockRelease(PARTITION_LOCK(hash_table, partition));
408 		return NULL;
409 	}
410 	else
411 	{
		/* The caller will free the lock by calling dshash_release_lock. */
413 		hash_table->find_locked = true;
414 		hash_table->find_exclusively_locked = exclusive;
415 		return ENTRY_FROM_ITEM(item);
416 	}
417 }
418 
419 /*
420  * Returns a pointer to an exclusively locked item which must be released with
421  * dshash_release_lock.  If the key is found in the hash table, 'found' is set
422  * to true and a pointer to the existing entry is returned.  If the key is not
423  * found, 'found' is set to false, and a pointer to a newly created entry is
424  * returned.
425  *
 * The notes above dshash_find() regarding locking and error handling apply
 * here as well.
428  */
429 void *
430 dshash_find_or_insert(dshash_table *hash_table,
431 					  const void *key,
432 					  bool *found)
433 {
434 	dshash_hash hash;
435 	size_t		partition_index;
436 	dshash_partition *partition;
437 	dshash_table_item *item;
438 
439 	hash = hash_key(hash_table, key);
440 	partition_index = PARTITION_FOR_HASH(hash);
441 	partition = &hash_table->control->partitions[partition_index];
442 
443 	Assert(hash_table->control->magic == DSHASH_MAGIC);
444 	Assert(!hash_table->find_locked);
445 
446 restart:
447 	LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
448 				  LW_EXCLUSIVE);
449 	ensure_valid_bucket_pointers(hash_table);
450 
451 	/* Search the active bucket. */
452 	item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
453 
454 	if (item)
455 		*found = true;
456 	else
457 	{
458 		*found = false;
459 
460 		/* Check if we are getting too full. */
461 		if (partition->count > MAX_COUNT_PER_PARTITION(hash_table))
462 		{
463 			/*
464 			 * The load factor (= keys / buckets) for all buckets protected by
465 			 * this partition is > 0.75.  Presumably the same applies
466 			 * generally across the whole hash table (though we don't attempt
467 			 * to track that directly to avoid contention on some kind of
468 			 * central counter; we just assume that this partition is
469 			 * representative).  This is a good time to resize.
470 			 *
471 			 * Give up our existing lock first, because resizing needs to
472 			 * reacquire all the locks in the right order to avoid deadlocks.
473 			 */
474 			LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
475 			resize(hash_table, hash_table->size_log2 + 1);
476 
477 			goto restart;
478 		}
479 
480 		/* Finally we can try to insert the new item. */
481 		item = insert_into_bucket(hash_table, key,
482 								  &BUCKET_FOR_HASH(hash_table, hash));
483 		item->hash = hash;
484 		/* Adjust per-lock-partition counter for load factor knowledge. */
485 		++partition->count;
486 	}
487 
488 	/* The caller must release the lock with dshash_release_lock. */
489 	hash_table->find_locked = true;
490 	hash_table->find_exclusively_locked = true;
491 	return ENTRY_FROM_ITEM(item);
492 }
493 
494 /*
495  * Remove an entry by key.  Returns true if the key was found and the
496  * corresponding entry was removed.
497  *
498  * To delete an entry that you already have a pointer to, see
499  * dshash_delete_entry.
500  */
501 bool
502 dshash_delete_key(dshash_table *hash_table, const void *key)
503 {
504 	dshash_hash hash;
505 	size_t		partition;
506 	bool		found;
507 
508 	Assert(hash_table->control->magic == DSHASH_MAGIC);
509 	Assert(!hash_table->find_locked);
510 
511 	hash = hash_key(hash_table, key);
512 	partition = PARTITION_FOR_HASH(hash);
513 
514 	LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE);
515 	ensure_valid_bucket_pointers(hash_table);
516 
517 	if (delete_key_from_bucket(hash_table, key,
518 							   &BUCKET_FOR_HASH(hash_table, hash)))
519 	{
520 		Assert(hash_table->control->partitions[partition].count > 0);
521 		found = true;
522 		--hash_table->control->partitions[partition].count;
523 	}
524 	else
525 		found = false;
526 
527 	LWLockRelease(PARTITION_LOCK(hash_table, partition));
528 
529 	return found;
530 }
531 
532 /*
533  * Remove an entry.  The entry must already be exclusively locked, and must
534  * have been obtained by dshash_find or dshash_find_or_insert.  Note that this
535  * function releases the lock just like dshash_release_lock.
536  *
537  * To delete an entry by key, see dshash_delete_key.
538  */
539 void
540 dshash_delete_entry(dshash_table *hash_table, void *entry)
541 {
542 	dshash_table_item *item = ITEM_FROM_ENTRY(entry);
543 	size_t		partition = PARTITION_FOR_HASH(item->hash);
544 
545 	Assert(hash_table->control->magic == DSHASH_MAGIC);
546 	Assert(hash_table->find_locked);
547 	Assert(hash_table->find_exclusively_locked);
548 	Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
549 								LW_EXCLUSIVE));
550 
551 	delete_item(hash_table, item);
552 	hash_table->find_locked = false;
553 	hash_table->find_exclusively_locked = false;
554 	LWLockRelease(PARTITION_LOCK(hash_table, partition));
555 }
556 
557 /*
558  * Unlock an entry which was locked by dshash_find or dshash_find_or_insert.
559  */
560 void
561 dshash_release_lock(dshash_table *hash_table, void *entry)
562 {
563 	dshash_table_item *item = ITEM_FROM_ENTRY(entry);
564 	size_t		partition_index = PARTITION_FOR_HASH(item->hash);
565 
566 	Assert(hash_table->control->magic == DSHASH_MAGIC);
567 	Assert(hash_table->find_locked);
568 	Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition_index),
569 								hash_table->find_exclusively_locked
570 								? LW_EXCLUSIVE : LW_SHARED));
571 
572 	hash_table->find_locked = false;
573 	hash_table->find_exclusively_locked = false;
574 	LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
575 }
576 
577 /*
578  * A compare function that forwards to memcmp.
579  */
580 int
581 dshash_memcmp(const void *a, const void *b, size_t size, void *arg)
582 {
583 	return memcmp(a, b, size);
584 }
585 
586 /*
587  * A hash function that forwards to tag_hash.
588  */
589 dshash_hash
590 dshash_memhash(const void *v, size_t size, void *arg)
591 {
592 	return tag_hash(v, size);
593 }
594 
595 /*
596  * Print debugging information about the internal state of the hash table to
597  * stderr.  The caller must hold no partition locks.
598  */
599 void
600 dshash_dump(dshash_table *hash_table)
601 {
602 	size_t		i;
603 	size_t		j;
604 
605 	Assert(hash_table->control->magic == DSHASH_MAGIC);
606 	Assert(!hash_table->find_locked);
607 
608 	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
609 	{
610 		Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
611 		LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
612 	}
613 
614 	ensure_valid_bucket_pointers(hash_table);
615 
616 	fprintf(stderr,
617 			"hash table size = %zu\n", (size_t) 1 << hash_table->size_log2);
618 	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
619 	{
620 		dshash_partition *partition = &hash_table->control->partitions[i];
621 		size_t		begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2);
622 		size_t		end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2);
623 
624 		fprintf(stderr, "  partition %zu\n", i);
625 		fprintf(stderr,
626 				"    active buckets (key count = %zu)\n", partition->count);
627 
628 		for (j = begin; j < end; ++j)
629 		{
630 			size_t		count = 0;
631 			dsa_pointer bucket = hash_table->buckets[j];
632 
633 			while (DsaPointerIsValid(bucket))
634 			{
635 				dshash_table_item *item;
636 
637 				item = dsa_get_address(hash_table->area, bucket);
638 
639 				bucket = item->next;
640 				++count;
641 			}
642 			fprintf(stderr, "      bucket %zu (key count = %zu)\n", j, count);
643 		}
644 	}
645 
646 	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
647 		LWLockRelease(PARTITION_LOCK(hash_table, i));
648 }
649 
650 /*
651  * Delete a locked item to which we have a pointer.
652  */
653 static void
654 delete_item(dshash_table *hash_table, dshash_table_item *item)
655 {
656 	size_t		hash = item->hash;
657 	size_t		partition = PARTITION_FOR_HASH(hash);
658 
659 	Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition)));
660 
661 	if (delete_item_from_bucket(hash_table, item,
662 								&BUCKET_FOR_HASH(hash_table, hash)))
663 	{
664 		Assert(hash_table->control->partitions[partition].count > 0);
665 		--hash_table->control->partitions[partition].count;
666 	}
667 	else
668 	{
669 		Assert(false);
670 	}
671 }
672 
673 /*
674  * Grow the hash table if necessary to the requested number of buckets.  The
675  * requested size must be double some previously observed size.
676  *
677  * Must be called without any partition lock held.
678  */
679 static void
680 resize(dshash_table *hash_table, size_t new_size_log2)
681 {
682 	dsa_pointer old_buckets;
683 	dsa_pointer new_buckets_shared;
684 	dsa_pointer *new_buckets;
685 	size_t		size;
686 	size_t		new_size = ((size_t) 1) << new_size_log2;
687 	size_t		i;
688 
689 	/*
	 * Acquire the locks for all lock partitions, always in ascending
	 * partition-index order so that concurrent resizers cannot deadlock.
	 * This is expensive, but we shouldn't have to do it many times.
692 	 */
693 	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
694 	{
695 		Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
696 
697 		LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE);
698 		if (i == 0 && hash_table->control->size_log2 >= new_size_log2)
699 		{
700 			/*
701 			 * Another backend has already increased the size; we can avoid
702 			 * obtaining all the locks and return early.
703 			 */
704 			LWLockRelease(PARTITION_LOCK(hash_table, 0));
705 			return;
706 		}
707 	}
708 
709 	Assert(new_size_log2 == hash_table->control->size_log2 + 1);
710 
711 	/* Allocate the space for the new table. */
712 	new_buckets_shared = dsa_allocate0(hash_table->area,
713 									   sizeof(dsa_pointer) * new_size);
714 	new_buckets = dsa_get_address(hash_table->area, new_buckets_shared);
715 
716 	/*
717 	 * We've allocated the new bucket array; all that remains to do now is to
718 	 * reinsert all items, which amounts to adjusting all the pointers.
719 	 */
720 	size = ((size_t) 1) << hash_table->control->size_log2;
721 	for (i = 0; i < size; ++i)
722 	{
723 		dsa_pointer item_pointer = hash_table->buckets[i];
724 
725 		while (DsaPointerIsValid(item_pointer))
726 		{
727 			dshash_table_item *item;
728 			dsa_pointer next_item_pointer;
729 
730 			item = dsa_get_address(hash_table->area, item_pointer);
731 			next_item_pointer = item->next;
732 			insert_item_into_bucket(hash_table, item_pointer, item,
733 									&new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash,
734 																				new_size_log2)]);
735 			item_pointer = next_item_pointer;
736 		}
737 	}
738 
739 	/* Swap the hash table into place and free the old one. */
740 	old_buckets = hash_table->control->buckets;
741 	hash_table->control->buckets = new_buckets_shared;
742 	hash_table->control->size_log2 = new_size_log2;
743 	hash_table->buckets = new_buckets;
744 	dsa_free(hash_table->area, old_buckets);
745 
746 	/* Release all the locks. */
747 	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
748 		LWLockRelease(PARTITION_LOCK(hash_table, i));
749 }
750 
751 /*
752  * Make sure that our backend-local bucket pointers are up to date.  The
753  * caller must have locked one lock partition, which prevents resize() from
754  * running concurrently.
755  */
756 static inline void
757 ensure_valid_bucket_pointers(dshash_table *hash_table)
758 {
759 	if (hash_table->size_log2 != hash_table->control->size_log2)
760 	{
761 		hash_table->buckets = dsa_get_address(hash_table->area,
762 											  hash_table->control->buckets);
763 		hash_table->size_log2 = hash_table->control->size_log2;
764 	}
765 }
766 
767 /*
768  * Scan a locked bucket for a match, using the provided compare function.
769  */
770 static inline dshash_table_item *
771 find_in_bucket(dshash_table *hash_table, const void *key,
772 			   dsa_pointer item_pointer)
773 {
774 	while (DsaPointerIsValid(item_pointer))
775 	{
776 		dshash_table_item *item;
777 
778 		item = dsa_get_address(hash_table->area, item_pointer);
779 		if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
780 			return item;
781 		item_pointer = item->next;
782 	}
783 	return NULL;
784 }
785 
786 /*
787  * Insert an already-allocated item into a bucket.
788  */
789 static void
790 insert_item_into_bucket(dshash_table *hash_table,
791 						dsa_pointer item_pointer,
792 						dshash_table_item *item,
793 						dsa_pointer *bucket)
794 {
795 	Assert(item == dsa_get_address(hash_table->area, item_pointer));
796 
797 	item->next = *bucket;
798 	*bucket = item_pointer;
799 }
800 
801 /*
802  * Allocate space for an entry with the given key and insert it into the
803  * provided bucket.
804  */
805 static dshash_table_item *
806 insert_into_bucket(dshash_table *hash_table,
807 				   const void *key,
808 				   dsa_pointer *bucket)
809 {
810 	dsa_pointer item_pointer;
811 	dshash_table_item *item;
812 
813 	item_pointer = dsa_allocate(hash_table->area,
814 								hash_table->params.entry_size +
815 								MAXALIGN(sizeof(dshash_table_item)));
816 	item = dsa_get_address(hash_table->area, item_pointer);
817 	memcpy(ENTRY_FROM_ITEM(item), key, hash_table->params.key_size);
818 	insert_item_into_bucket(hash_table, item_pointer, item, bucket);
819 	return item;
820 }
821 
822 /*
823  * Search a bucket for a matching key and delete it.
824  */
825 static bool
826 delete_key_from_bucket(dshash_table *hash_table,
827 					   const void *key,
828 					   dsa_pointer *bucket_head)
829 {
830 	while (DsaPointerIsValid(*bucket_head))
831 	{
832 		dshash_table_item *item;
833 
834 		item = dsa_get_address(hash_table->area, *bucket_head);
835 
836 		if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
837 		{
838 			dsa_pointer next;
839 
840 			next = item->next;
841 			dsa_free(hash_table->area, *bucket_head);
842 			*bucket_head = next;
843 
844 			return true;
845 		}
846 		bucket_head = &item->next;
847 	}
848 	return false;
849 }
850 
851 /*
852  * Delete the specified item from the bucket.
853  */
854 static bool
855 delete_item_from_bucket(dshash_table *hash_table,
856 						dshash_table_item *item,
857 						dsa_pointer *bucket_head)
858 {
859 	while (DsaPointerIsValid(*bucket_head))
860 	{
861 		dshash_table_item *bucket_item;
862 
863 		bucket_item = dsa_get_address(hash_table->area, *bucket_head);
864 
865 		if (bucket_item == item)
866 		{
867 			dsa_pointer next;
868 
869 			next = item->next;
870 			dsa_free(hash_table->area, *bucket_head);
871 			*bucket_head = next;
872 			return true;
873 		}
874 		bucket_head = &bucket_item->next;
875 	}
876 	return false;
877 }
878 
879 /*
880  * Compute the hash value for a key.
881  */
882 static inline dshash_hash
883 hash_key(dshash_table *hash_table, const void *key)
884 {
885 	return hash_table->params.hash_function(key,
886 											hash_table->params.key_size,
887 											hash_table->arg);
888 }
889 
890 /*
891  * Check whether two keys compare equal.
892  */
893 static inline bool
894 equal_keys(dshash_table *hash_table, const void *a, const void *b)
895 {
896 	return hash_table->params.compare_function(a, b,
897 											   hash_table->params.key_size,
898 											   hash_table->arg) == 0;
899 }
900