1 /*-------------------------------------------------------------------------
2  *
3  * dynahash.c
4  *	  dynamic hash tables
5  *
6  * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7  * shared memory.  For shared hash tables, it is the caller's responsibility
8  * to provide appropriate access interlocking.  The simplest convention is
9  * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
10  * hash_seq_search) need only shared lock, but any update requires exclusive
11  * lock.  For heavily-used shared tables, the single-lock approach creates a
12  * concurrency bottleneck, so we also support "partitioned" locking wherein
13  * there are multiple LWLocks guarding distinct subsets of the table.  To use
14  * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
16  * Therefore, each hash bucket chain operates independently, and no fields
17  * of the hash header change after init except nentries and freeList.
18  * (A partitioned table uses multiple copies of those fields, guarded by
19  * spinlocks, for additional concurrency.)
20  * This lets any subset of the hash buckets be treated as a separately
21  * lockable partition.  We expect callers to use the low-order bits of a
22  * lookup key's hash value as a partition number --- this will work because
23  * of the way calc_bucket() maps hash values to bucket numbers.
24  *
25  * For hash tables in shared memory, the memory allocator function should
26  * match malloc's semantics of returning NULL on failure.  For hash tables
27  * in local memory, we typically use palloc() which will throw error on
28  * failure.  The code in this file has to cope with both cases.
29  *
30  * dynahash.c provides support for these types of lookup keys:
31  *
32  * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
33  * compared as though by strcmp().  This is the default behavior.
34  *
35  * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
36  * (Caller must ensure there are no undefined padding bits in the keys!)
37  * This is selected by specifying HASH_BLOBS flag to hash_create.
38  *
39  * 3. More complex key behavior can be selected by specifying user-supplied
40  * hashing, comparison, and/or key-copying functions.  At least a hashing
41  * function must be supplied; comparison defaults to memcmp() and key copying
42  * to memcpy() when a user-defined hashing function is selected.
43  *
44  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
45  * Portions Copyright (c) 1994, Regents of the University of California
46  *
47  *
48  * IDENTIFICATION
49  *	  src/backend/utils/hash/dynahash.c
50  *
51  *-------------------------------------------------------------------------
52  */
53 
54 /*
55  * Original comments:
56  *
57  * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
58  * Coded into C, with minor code improvements, and with hsearch(3) interface,
59  * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
60  * also, hcreate/hdestroy routines added to simulate hsearch(3).
61  *
62  * These routines simulate hsearch(3) and family, with the important
63  * difference that the hash table is dynamic - can grow indefinitely
64  * beyond its original size (as supplied to hcreate()).
65  *
66  * Performance appears to be comparable to that of hsearch(3).
67  * The 'source-code' options referred to in hsearch(3)'s 'man' page
68  * are not implemented; otherwise functionality is identical.
69  *
70  * Compilation controls:
71  * HASH_DEBUG controls some informative traces, mainly for debugging.
72  * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
73  * when combined with HASH_DEBUG, these are displayed by hdestroy().
74  *
75  * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
76  * concatenation property, in probably unnecessary code 'optimization'.
77  *
78  * Modified margo@postgres.berkeley.edu February 1990
79  *		added multiple table interface
80  * Modified by sullivan@postgres.berkeley.edu April 1990
81  *		changed ctl structure for shared memory
82  */
83 
84 #include "postgres.h"
85 
86 #include <limits.h>
87 
88 #include "access/xact.h"
89 #include "storage/shmem.h"
90 #include "storage/spin.h"
91 #include "utils/dynahash.h"
92 #include "utils/memutils.h"
93 
94 
95 /*
96  * Constants
97  *
98  * A hash table has a top-level "directory", each of whose entries points
99  * to a "segment" of ssize bucket headers.  The maximum number of hash
100  * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
101  * the number of records in the table can be larger, but we don't want a
102  * whole lot of records per bucket or performance goes down.
103  *
104  * In a hash table allocated in shared memory, the directory cannot be
105  * expanded because it must stay at a fixed address.  The directory size
106  * should be selected using hash_select_dirsize (and you'd better have
107  * a good idea of the maximum number of entries!).  For non-shared hash
108  * tables, the initial directory size can be left at the default.
109  */
110 #define DEF_SEGSIZE			   256
111 #define DEF_SEGSIZE_SHIFT	   8	/* must be log2(DEF_SEGSIZE) */
112 #define DEF_DIRSIZE			   256
113 #define DEF_FFACTOR			   1	/* default fill factor */
114 
115 /* Number of freelists to be used for a partitioned hash table. */
116 #define NUM_FREELISTS			32
117 
118 /* A hash bucket is a linked list of HASHELEMENTs */
119 typedef HASHELEMENT *HASHBUCKET;
120 
121 /* A hash segment is an array of bucket headers */
122 typedef HASHBUCKET *HASHSEGMENT;
123 
124 /*
125  * Per-freelist data.
126  *
127  * In a partitioned hash table, each freelist is associated with a specific
128  * set of hashcodes, as determined by the FREELIST_IDX() macro below.
129  * nentries tracks the number of live hashtable entries having those hashcodes
130  * (NOT the number of entries in the freelist, as you might expect).
131  *
132  * The coverage of a freelist might be more or less than one partition, so it
133  * needs its own lock rather than relying on caller locking.  Relying on that
134  * wouldn't work even if the coverage was the same, because of the occasional
135  * need to "borrow" entries from another freelist; see get_hash_entry().
136  *
137  * Using an array of FreeListData instead of separate arrays of mutexes,
138  * nentries and freeLists helps to reduce sharing of cache lines between
139  * different mutexes.
140  */
141 typedef struct
142 {
143 	slock_t		mutex;			/* spinlock for this freelist */
144 	long		nentries;		/* number of entries in associated buckets */
145 	HASHELEMENT *freeList;		/* chain of free elements */
146 } FreeListData;
147 
148 /*
149  * Header structure for a hash table --- contains all changeable info
150  *
151  * In a shared-memory hash table, the HASHHDR is in shared memory, while
152  * each backend has a local HTAB struct.  For a non-shared table, there isn't
153  * any functional difference between HASHHDR and HTAB, but we separate them
154  * anyway to share code between shared and non-shared tables.
155  */
156 struct HASHHDR
157 {
158 	/*
159 	 * The freelist can become a point of contention in high-concurrency hash
160 	 * tables, so we use an array of freelists, each with its own mutex and
161 	 * nentries count, instead of just a single one.  Although the freelists
162 	 * normally operate independently, we will scavenge entries from freelists
163 	 * other than a hashcode's default freelist when necessary.
164 	 *
165 	 * If the hash table is not partitioned, only freeList[0] is used and its
166 	 * spinlock is not used at all; callers' locking is assumed sufficient.
167 	 */
168 	FreeListData freeList[NUM_FREELISTS];
169 
170 	/* These fields can change, but not in a partitioned table */
171 	/* Also, dsize can't change in a shared table, even if unpartitioned */
172 	long		dsize;			/* directory size */
173 	long		nsegs;			/* number of allocated segments (<= dsize) */
174 	uint32		max_bucket;		/* ID of maximum bucket in use */
175 	uint32		high_mask;		/* mask to modulo into entire table */
176 	uint32		low_mask;		/* mask to modulo into lower half of table */
177 
178 	/* These fields are fixed at hashtable creation */
179 	Size		keysize;		/* hash key length in bytes */
180 	Size		entrysize;		/* total user element size in bytes */
181 	long		num_partitions; /* # partitions (must be power of 2), or 0 */
182 	long		ffactor;		/* target fill factor */
183 	long		max_dsize;		/* 'dsize' limit if directory is fixed size */
184 	long		ssize;			/* segment size --- must be power of 2 */
185 	int			sshift;			/* segment shift = log2(ssize) */
186 	int			nelem_alloc;	/* number of entries to allocate at once */
187 
188 #ifdef HASH_STATISTICS
189 
190 	/*
191 	 * Count statistics here.  NB: stats code doesn't bother with mutex, so
192 	 * counts could be corrupted a bit in a partitioned table.
193 	 */
194 	long		accesses;
195 	long		collisions;
196 #endif
197 };
198 
199 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
200 
201 #define FREELIST_IDX(hctl, hashcode) \
202 	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
203 
204 /*
205  * Top control structure for a hashtable --- in a shared table, each backend
206  * has its own copy (OK since no fields change at runtime)
207  */
208 struct HTAB
209 {
210 	HASHHDR    *hctl;			/* => shared control information */
211 	HASHSEGMENT *dir;			/* directory of segment starts */
212 	HashValueFunc hash;			/* hash function */
213 	HashCompareFunc match;		/* key comparison function */
214 	HashCopyFunc keycopy;		/* key copying function */
215 	HashAllocFunc alloc;		/* memory allocator */
216 	MemoryContext hcxt;			/* memory context if default allocator used */
217 	char	   *tabname;		/* table name (for error messages) */
218 	bool		isshared;		/* true if table is in shared memory */
219 	bool		isfixed;		/* if true, don't enlarge */
220 
221 	/* freezing a shared table isn't allowed, so we can keep state here */
222 	bool		frozen;			/* true = no more inserts allowed */
223 
224 	/* We keep local copies of these fixed values to reduce contention */
225 	Size		keysize;		/* hash key length in bytes */
226 	long		ssize;			/* segment size --- must be power of 2 */
227 	int			sshift;			/* segment shift = log2(ssize) */
228 };
229 
230 /*
231  * Key (also entry) part of a HASHELEMENT
232  */
233 #define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
234 
235 /*
236  * Obtain element pointer given pointer to key
237  */
238 #define ELEMENT_FROM_KEY(key)  \
239 	((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
240 
241 /*
242  * Fast MOD arithmetic, assuming that y is a power of 2 !
243  */
244 #define MOD(x,y)			   ((x) & ((y)-1))
245 
246 #if HASH_STATISTICS
247 static long hash_accesses,
248 			hash_collisions,
249 			hash_expansions;
250 #endif
251 
252 /*
253  * Private function prototypes
254  */
255 static void *DynaHashAlloc(Size size);
256 static HASHSEGMENT seg_alloc(HTAB *hashp);
257 static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
258 static bool dir_realloc(HTAB *hashp);
259 static bool expand_table(HTAB *hashp);
260 static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
261 static void hdefault(HTAB *hashp);
262 static int	choose_nelem_alloc(Size entrysize);
263 static bool init_htab(HTAB *hashp, long nelem);
264 static void hash_corrupted(HTAB *hashp);
265 static long next_pow2_long(long num);
266 static int	next_pow2_int(long num);
267 static void register_seq_scan(HTAB *hashp);
268 static void deregister_seq_scan(HTAB *hashp);
269 static bool has_seq_scans(HTAB *hashp);
270 
271 
272 /*
273  * memory allocation support
274  */
275 static MemoryContext CurrentDynaHashCxt = NULL;
276 
277 static void *
278 DynaHashAlloc(Size size)
279 {
280 	Assert(MemoryContextIsValid(CurrentDynaHashCxt));
281 	return MemoryContextAlloc(CurrentDynaHashCxt, size);
282 }
283 
284 
285 /*
286  * HashCompareFunc for string keys
287  *
288  * Because we copy keys with strlcpy(), they will be truncated at keysize-1
289  * bytes, so we can only compare that many ... hence strncmp is almost but
290  * not quite the right thing.
291  */
292 static int
293 string_compare(const char *key1, const char *key2, Size keysize)
294 {
295 	return strncmp(key1, key2, keysize - 1);
296 }
297 
298 
299 /************************** CREATE ROUTINES **********************/
300 
301 /*
302  * hash_create -- create a new dynamic hash table
303  *
304  *	tabname: a name for the table (for debugging purposes)
305  *	nelem: maximum number of elements expected
306  *	*info: additional table parameters, as indicated by flags
307  *	flags: bitmask indicating which parameters to take from *info
308  *
309  * Note: for a shared-memory hashtable, nelem needs to be a pretty good
310  * estimate, since we can't expand the table on the fly.  But an unshared
311  * hashtable can be expanded on-the-fly, so it's better for nelem to be
312  * on the small side and let the table grow if it's exceeded.  An overly
313  * large nelem will penalize hash_seq_search speed without buying much.
314  */
315 HTAB *
316 hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
317 {
318 	HTAB	   *hashp;
319 	HASHHDR    *hctl;
320 
321 	/*
322 	 * For shared hash tables, we have a local hash header (HTAB struct) that
323 	 * we allocate in TopMemoryContext; all else is in shared memory.
324 	 *
325 	 * For non-shared hash tables, everything including the hash header is in
326 	 * a memory context created specially for the hash table --- this makes
327 	 * hash_destroy very simple.  The memory context is made a child of either
328 	 * a context specified by the caller, or TopMemoryContext if nothing is
329 	 * specified.
330 	 */
331 	if (flags & HASH_SHARED_MEM)
332 	{
333 		/* Set up to allocate the hash header */
334 		CurrentDynaHashCxt = TopMemoryContext;
335 	}
336 	else
337 	{
338 		/* Create the hash table's private memory context */
339 		if (flags & HASH_CONTEXT)
340 			CurrentDynaHashCxt = info->hcxt;
341 		else
342 			CurrentDynaHashCxt = TopMemoryContext;
343 		CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
344 												   tabname,
345 												   ALLOCSET_DEFAULT_SIZES);
346 	}
347 
348 	/* Initialize the hash header, plus a copy of the table name */
349 	hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
350 	MemSet(hashp, 0, sizeof(HTAB));
351 
352 	hashp->tabname = (char *) (hashp + 1);
353 	strcpy(hashp->tabname, tabname);
354 
355 	/*
356 	 * Select the appropriate hash function (see comments at head of file).
357 	 */
358 	if (flags & HASH_FUNCTION)
359 		hashp->hash = info->hash;
360 	else if (flags & HASH_BLOBS)
361 	{
362 		/* We can optimize hashing for common key sizes */
363 		Assert(flags & HASH_ELEM);
364 		if (info->keysize == sizeof(uint32))
365 			hashp->hash = uint32_hash;
366 		else
367 			hashp->hash = tag_hash;
368 	}
369 	else
370 		hashp->hash = string_hash;	/* default hash function */
371 
372 	/*
373 	 * If you don't specify a match function, it defaults to string_compare if
374 	 * you used string_hash (either explicitly or by default) and to memcmp
375 	 * otherwise.
376 	 *
377 	 * Note: explicitly specifying string_hash is deprecated, because this
378 	 * might not work for callers in loadable modules on some platforms due to
379 	 * referencing a trampoline instead of the string_hash function proper.
380 	 * Just let it default, eh?
381 	 */
382 	if (flags & HASH_COMPARE)
383 		hashp->match = info->match;
384 	else if (hashp->hash == string_hash)
385 		hashp->match = (HashCompareFunc) string_compare;
386 	else
387 		hashp->match = memcmp;
388 
389 	/*
390 	 * Similarly, the key-copying function defaults to strlcpy or memcpy.
391 	 */
392 	if (flags & HASH_KEYCOPY)
393 		hashp->keycopy = info->keycopy;
394 	else if (hashp->hash == string_hash)
395 		hashp->keycopy = (HashCopyFunc) strlcpy;
396 	else
397 		hashp->keycopy = memcpy;
398 
399 	/* And select the entry allocation function, too. */
400 	if (flags & HASH_ALLOC)
401 		hashp->alloc = info->alloc;
402 	else
403 		hashp->alloc = DynaHashAlloc;
404 
405 	if (flags & HASH_SHARED_MEM)
406 	{
407 		/*
408 		 * ctl structure and directory are preallocated for shared memory
409 		 * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
410 		 * well.
411 		 */
412 		hashp->hctl = info->hctl;
413 		hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
414 		hashp->hcxt = NULL;
415 		hashp->isshared = true;
416 
417 		/* hash table already exists, we're just attaching to it */
418 		if (flags & HASH_ATTACH)
419 		{
420 			/* make local copies of some heavily-used values */
421 			hctl = hashp->hctl;
422 			hashp->keysize = hctl->keysize;
423 			hashp->ssize = hctl->ssize;
424 			hashp->sshift = hctl->sshift;
425 
426 			return hashp;
427 		}
428 	}
429 	else
430 	{
431 		/* setup hash table defaults */
432 		hashp->hctl = NULL;
433 		hashp->dir = NULL;
434 		hashp->hcxt = CurrentDynaHashCxt;
435 		hashp->isshared = false;
436 	}
437 
438 	if (!hashp->hctl)
439 	{
440 		hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
441 		if (!hashp->hctl)
442 			ereport(ERROR,
443 					(errcode(ERRCODE_OUT_OF_MEMORY),
444 					 errmsg("out of memory")));
445 	}
446 
447 	hashp->frozen = false;
448 
449 	hdefault(hashp);
450 
451 	hctl = hashp->hctl;
452 
453 	if (flags & HASH_PARTITION)
454 	{
455 		/* Doesn't make sense to partition a local hash table */
456 		Assert(flags & HASH_SHARED_MEM);
457 
458 		/*
459 		 * The number of partitions had better be a power of 2. Also, it must
460 		 * be less than INT_MAX (see init_htab()), so call the int version of
461 		 * next_pow2.
462 		 */
463 		Assert(info->num_partitions == next_pow2_int(info->num_partitions));
464 
465 		hctl->num_partitions = info->num_partitions;
466 	}
467 
468 	if (flags & HASH_SEGMENT)
469 	{
470 		hctl->ssize = info->ssize;
471 		hctl->sshift = my_log2(info->ssize);
472 		/* ssize had better be a power of 2 */
473 		Assert(hctl->ssize == (1L << hctl->sshift));
474 	}
475 	if (flags & HASH_FFACTOR)
476 		hctl->ffactor = info->ffactor;
477 
478 	/*
479 	 * SHM hash tables have fixed directory size passed by the caller.
480 	 */
481 	if (flags & HASH_DIRSIZE)
482 	{
483 		hctl->max_dsize = info->max_dsize;
484 		hctl->dsize = info->dsize;
485 	}
486 
487 	/*
488 	 * The hash table allocates space for the key and data, but the caller
489 	 * has to say how much space to allocate.
490 	 */
491 	if (flags & HASH_ELEM)
492 	{
493 		Assert(info->entrysize >= info->keysize);
494 		hctl->keysize = info->keysize;
495 		hctl->entrysize = info->entrysize;
496 	}
497 
498 	/* make local copies of heavily-used constant fields */
499 	hashp->keysize = hctl->keysize;
500 	hashp->ssize = hctl->ssize;
501 	hashp->sshift = hctl->sshift;
502 
503 	/* Build the hash directory structure */
504 	if (!init_htab(hashp, nelem))
505 		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
506 
507 	/*
508 	 * For a shared hash table, preallocate the requested number of elements.
509 	 * This reduces problems with run-time out-of-shared-memory conditions.
510 	 *
511 	 * For a non-shared hash table, preallocate the requested number of
512 	 * elements if it's less than our chosen nelem_alloc.  This avoids wasting
513 	 * space if the caller correctly estimates a small table size.
514 	 */
515 	if ((flags & HASH_SHARED_MEM) ||
516 		nelem < hctl->nelem_alloc)
517 	{
518 		int			i,
519 					freelist_partitions,
520 					nelem_alloc,
521 					nelem_alloc_first;
522 
523 		/*
524 		 * If hash table is partitioned, give each freelist an equal share of
525 		 * the initial allocation.  Otherwise only freeList[0] is used.
526 		 */
527 		if (IS_PARTITIONED(hashp->hctl))
528 			freelist_partitions = NUM_FREELISTS;
529 		else
530 			freelist_partitions = 1;
531 
532 		nelem_alloc = nelem / freelist_partitions;
533 		if (nelem_alloc <= 0)
534 			nelem_alloc = 1;
535 
536 		/*
537 		 * Make sure we'll allocate all the requested elements; freeList[0]
538 		 * gets the excess if the request isn't divisible by NUM_FREELISTS.
539 		 */
540 		if (nelem_alloc * freelist_partitions < nelem)
541 			nelem_alloc_first =
542 				nelem - nelem_alloc * (freelist_partitions - 1);
543 		else
544 			nelem_alloc_first = nelem_alloc;
545 
546 		for (i = 0; i < freelist_partitions; i++)
547 		{
548 			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
549 
550 			if (!element_alloc(hashp, temp, i))
551 				ereport(ERROR,
552 						(errcode(ERRCODE_OUT_OF_MEMORY),
553 						 errmsg("out of memory")));
554 		}
555 	}
556 
557 	if (flags & HASH_FIXED_SIZE)
558 		hashp->isfixed = true;
559 	return hashp;
560 }
561 
562 /*
563  * Set default HASHHDR parameters.
564  */
565 static void
566 hdefault(HTAB *hashp)
567 {
568 	HASHHDR    *hctl = hashp->hctl;
569 
570 	MemSet(hctl, 0, sizeof(HASHHDR));
571 
572 	hctl->dsize = DEF_DIRSIZE;
573 	hctl->nsegs = 0;
574 
575 	/* rather pointless defaults for key & entry size */
576 	hctl->keysize = sizeof(char *);
577 	hctl->entrysize = 2 * sizeof(char *);
578 
579 	hctl->num_partitions = 0;	/* not partitioned */
580 
581 	hctl->ffactor = DEF_FFACTOR;
582 
583 	/* table has no fixed maximum size */
584 	hctl->max_dsize = NO_MAX_DSIZE;
585 
586 	hctl->ssize = DEF_SEGSIZE;
587 	hctl->sshift = DEF_SEGSIZE_SHIFT;
588 
589 #ifdef HASH_STATISTICS
590 	hctl->accesses = hctl->collisions = 0;
591 #endif
592 }
593 
594 /*
595  * Given the user-specified entry size, choose nelem_alloc, ie, how many
596  * elements to add to the hash table when we need more.
597  */
598 static int
599 choose_nelem_alloc(Size entrysize)
600 {
601 	int			nelem_alloc;
602 	Size		elementSize;
603 	Size		allocSize;
604 
605 	/* Each element has a HASHELEMENT header plus user data. */
606 	/* NB: this had better match element_alloc() */
607 	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
608 
609 	/*
610 	 * The idea here is to choose nelem_alloc at least 32, but round up so
611 	 * that the allocation request will be a power of 2 or just less. This
612 	 * makes little difference for hash tables in shared memory, but for hash
613 	 * tables managed by palloc, the allocation request will be rounded up to
614 	 * a power of 2 anyway.  If we fail to take this into account, we'll waste
615 	 * as much as half the allocated space.
616 	 */
617 	allocSize = 32 * 4;			/* assume elementSize at least 8 */
618 	do
619 	{
620 		allocSize <<= 1;
621 		nelem_alloc = allocSize / elementSize;
622 	} while (nelem_alloc < 32);
623 
624 	return nelem_alloc;
625 }
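/*
 * Worked example (assuming 64-bit alignment, i.e. MAXALIGN rounds up to a
 * multiple of 8): with entrysize = 56, elementSize =
 * MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(56) = 16 + 56 = 72.  The loop
 * doubles allocSize from 128 up to 4096, where nelem_alloc = 4096 / 72 = 56
 * >= 32, so each element_alloc() call requests 56 * 72 = 4032 bytes, just
 * under the 4096-byte power of 2.
 */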
626 
627 /*
628  * Compute derived fields of hctl and build the initial directory/segment
629  * arrays
630  */
631 static bool
632 init_htab(HTAB *hashp, long nelem)
633 {
634 	HASHHDR    *hctl = hashp->hctl;
635 	HASHSEGMENT *segp;
636 	int			nbuckets;
637 	int			nsegs;
638 	int			i;
639 
640 	/*
641 	 * initialize mutexes if it's a partitioned table
642 	 */
643 	if (IS_PARTITIONED(hctl))
644 		for (i = 0; i < NUM_FREELISTS; i++)
645 			SpinLockInit(&(hctl->freeList[i].mutex));
646 
647 	/*
648 	 * Divide number of elements by the fill factor to determine a desired
649 	 * number of buckets.  Allocate space for the next greater power of two
650 	 * number of buckets
651 	 */
652 	nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1);
653 
654 	/*
655 	 * In a partitioned table, nbuckets must be at least equal to
656 	 * num_partitions; were it less, keys with apparently different partition
657 	 * numbers would map to the same bucket, breaking partition independence.
658 	 * (Normally nbuckets will be much bigger; this is just a safety check.)
659 	 */
660 	while (nbuckets < hctl->num_partitions)
661 		nbuckets <<= 1;
662 
663 	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
664 	hctl->high_mask = (nbuckets << 1) - 1;
665 
666 	/*
667 	 * Figure number of directory segments needed, round up to a power of 2
668 	 */
669 	nsegs = (nbuckets - 1) / hctl->ssize + 1;
670 	nsegs = next_pow2_int(nsegs);
671 
672 	/*
673 	 * Make sure directory is big enough. If pre-allocated directory is too
674 	 * small, choke (caller screwed up).
675 	 */
676 	if (nsegs > hctl->dsize)
677 	{
678 		if (!(hashp->dir))
679 			hctl->dsize = nsegs;
680 		else
681 			return false;
682 	}
683 
684 	/* Allocate a directory */
685 	if (!(hashp->dir))
686 	{
687 		CurrentDynaHashCxt = hashp->hcxt;
688 		hashp->dir = (HASHSEGMENT *)
689 			hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
690 		if (!hashp->dir)
691 			return false;
692 	}
693 
694 	/* Allocate initial segments */
695 	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
696 	{
697 		*segp = seg_alloc(hashp);
698 		if (*segp == NULL)
699 			return false;
700 	}
701 
702 	/* Choose number of entries to allocate at a time */
703 	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
704 
705 #if HASH_DEBUG
706 	fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
707 			"TABLE POINTER   ", hashp,
708 			"DIRECTORY SIZE  ", hctl->dsize,
709 			"SEGMENT SIZE    ", hctl->ssize,
710 			"SEGMENT SHIFT   ", hctl->sshift,
711 			"FILL FACTOR     ", hctl->ffactor,
712 			"MAX BUCKET      ", hctl->max_bucket,
713 			"HIGH MASK       ", hctl->high_mask,
714 			"LOW  MASK       ", hctl->low_mask,
715 			"NSEGS           ", hctl->nsegs);
716 #endif
717 	return true;
718 }
719 
720 /*
721  * Estimate the space needed for a hashtable containing the given number
722  * of entries of given size.
723  * NOTE: this is used to estimate the footprint of hashtables in shared
724  * memory; therefore it does not count HTAB which is in local memory.
725  * NB: assumes that all hash structure parameters have default values!
726  */
727 Size
728 hash_estimate_size(long num_entries, Size entrysize)
729 {
730 	Size		size;
731 	long		nBuckets,
732 				nSegments,
733 				nDirEntries,
734 				nElementAllocs,
735 				elementSize,
736 				elementAllocCnt;
737 
738 	/* estimate number of buckets wanted */
739 	nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
740 	/* # of segments needed for nBuckets */
741 	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
742 	/* directory entries */
743 	nDirEntries = DEF_DIRSIZE;
744 	while (nDirEntries < nSegments)
745 		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */
746 
747 	/* fixed control info */
748 	size = MAXALIGN(sizeof(HASHHDR));	/* but not HTAB, per above */
749 	/* directory */
750 	size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
751 	/* segments */
752 	size = add_size(size, mul_size(nSegments,
753 								   MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
754 	/* elements --- allocated in groups of choose_nelem_alloc() entries */
755 	elementAllocCnt = choose_nelem_alloc(entrysize);
756 	nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
757 	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
758 	size = add_size(size,
759 					mul_size(nElementAllocs,
760 							 mul_size(elementAllocCnt, elementSize)));
761 
762 	return size;
763 }
764 
765 /*
766  * Select an appropriate directory size for a hashtable with the given
767  * maximum number of entries.
768  * This is only needed for hashtables in shared memory, whose directories
769  * cannot be expanded dynamically.
770  * NB: assumes that all hash structure parameters have default values!
771  *
772  * XXX this had better agree with the behavior of init_htab()...
773  */
774 long
775 hash_select_dirsize(long num_entries)
776 {
777 	long		nBuckets,
778 				nSegments,
779 				nDirEntries;
780 
781 	/* estimate number of buckets wanted */
782 	nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
783 	/* # of segments needed for nBuckets */
784 	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
785 	/* directory entries */
786 	nDirEntries = DEF_DIRSIZE;
787 	while (nDirEntries < nSegments)
788 		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */
789 
790 	return nDirEntries;
791 }
792 
793 /*
794  * Compute the required initial memory allocation for a shared-memory
795  * hashtable with the given parameters.  We need space for the HASHHDR
796  * and for the (non expansible) directory.
797  */
798 Size
799 hash_get_shared_size(HASHCTL *info, int flags)
800 {
801 	Assert(flags & HASH_DIRSIZE);
802 	Assert(info->dsize == info->max_dsize);
803 	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
804 }
805 
806 
807 /********************** DESTROY ROUTINES ************************/
808 
809 void
810 hash_destroy(HTAB *hashp)
811 {
812 	if (hashp != NULL)
813 	{
814 		/* allocation method must be one we know how to free, too */
815 		Assert(hashp->alloc == DynaHashAlloc);
816 		/* so this hashtable must have its own context */
817 		Assert(hashp->hcxt != NULL);
818 
819 		hash_stats("destroy", hashp);
820 
821 		/*
822 		 * Free everything by destroying the hash table's memory context.
823 		 */
824 		MemoryContextDelete(hashp->hcxt);
825 	}
826 }
827 
828 void
829 hash_stats(const char *where, HTAB *hashp)
830 {
831 #if HASH_STATISTICS
832 	fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
833 			where, hashp->hctl->accesses, hashp->hctl->collisions);
834 
835 	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
836 			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
837 			hashp->hctl->max_bucket, hashp->hctl->nsegs);
838 	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
839 			where, hash_accesses, hash_collisions);
840 	fprintf(stderr, "hash_stats: total expansions %ld\n",
841 			hash_expansions);
842 #endif
843 }
844 
845 /*******************************SEARCH ROUTINES *****************************/
846 
847 
848 /*
849  * get_hash_value -- exported routine to calculate a key's hash value
850  *
851  * We export this because for partitioned tables, callers need to compute
852  * the partition number (from the low-order bits of the hash value) before
853  * searching.
854  */
855 uint32
856 get_hash_value(HTAB *hashp, const void *keyPtr)
857 {
858 	return hashp->hash(keyPtr, hashp->keysize);
859 }
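/*
 * Sketch of the expected calling pattern for a partitioned shared table
 * (MyPartitionLocks and NUM_MY_PARTITIONS are hypothetical caller-side
 * names; NUM_MY_PARTITIONS must equal the table's num_partitions and be a
 * power of 2, so the modulo just extracts the hash value's low-order bits):
 *
 *		uint32		hashcode = get_hash_value(htab, &key);
 *		int			partition = hashcode % NUM_MY_PARTITIONS;
 *
 *		LWLockAcquire(MyPartitionLocks[partition], LW_EXCLUSIVE);
 *		entry = hash_search_with_hash_value(htab, &key, hashcode,
 *											HASH_ENTER_NULL, &found);
 *		... fill in or inspect entry ...
 *		LWLockRelease(MyPartitionLocks[partition]);
 */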
860 
861 /* Convert a hash value to a bucket number */
862 static inline uint32
863 calc_bucket(HASHHDR *hctl, uint32 hash_val)
864 {
865 	uint32		bucket;
866 
867 	bucket = hash_val & hctl->high_mask;
868 	if (bucket > hctl->max_bucket)
869 		bucket = bucket & hctl->low_mask;
870 
871 	return bucket;
872 }
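/*
 * For instance, with max_bucket = 10 (low_mask = 7, high_mask = 15), a hash
 * value of 0x2C masks to 12 under high_mask; since 12 > max_bucket it is
 * reduced again under low_mask to bucket 4.  A hash value of 0x29 masks to
 * 9, which does not exceed max_bucket and is used directly.
 */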
873 
874 /*
875  * hash_search -- look up key in table and perform action
876  * hash_search_with_hash_value -- same, with key's hash value already computed
877  *
878  * action is one of:
879  *		HASH_FIND: look up key in table
880  *		HASH_ENTER: look up key in table, creating entry if not present
881  *		HASH_ENTER_NULL: same, but return NULL if out of memory
882  *		HASH_REMOVE: look up key in table, remove entry if present
883  *
884  * Return value is a pointer to the element found/entered/removed if any,
885  * or NULL if no match was found.  (NB: in the case of the REMOVE action,
886  * the result is a dangling pointer that shouldn't be dereferenced!)
887  *
888  * HASH_ENTER will normally ereport a generic "out of memory" error if
889  * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
890  * the same except it will return NULL if out of memory.  Note that
891  * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
892  * since palloc internally ereports on out-of-memory.
893  *
894  * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
895  * existing entry in the table, FALSE otherwise.  This is needed in the
896  * HASH_ENTER case, but is redundant with the return value otherwise.
897  *
898  * For hash_search_with_hash_value, the hashvalue parameter must have been
899  * calculated with get_hash_value().
900  */
901 void *
902 hash_search(HTAB *hashp,
903 			const void *keyPtr,
904 			HASHACTION action,
905 			bool *foundPtr)
906 {
907 	return hash_search_with_hash_value(hashp,
908 									   keyPtr,
909 									   hashp->hash(keyPtr, hashp->keysize),
910 									   action,
911 									   foundPtr);
912 }
913 
914 void *
915 hash_search_with_hash_value(HTAB *hashp,
916 							const void *keyPtr,
917 							uint32 hashvalue,
918 							HASHACTION action,
919 							bool *foundPtr)
920 {
921 	HASHHDR    *hctl = hashp->hctl;
922 	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
923 	Size		keysize;
924 	uint32		bucket;
925 	long		segment_num;
926 	long		segment_ndx;
927 	HASHSEGMENT segp;
928 	HASHBUCKET	currBucket;
929 	HASHBUCKET *prevBucketPtr;
930 	HashCompareFunc match;
931 
932 #if HASH_STATISTICS
933 	hash_accesses++;
934 	hctl->accesses++;
935 #endif
936 
937 	/*
938 	 * If inserting, check if it is time to split a bucket.
939 	 *
940 	 * NOTE: failure to expand table is not a fatal error, it just means we
941 	 * have to run at higher fill factor than we wanted.  However, if we're
942 	 * using the palloc allocator then it will throw error anyway on
943 	 * out-of-memory, so we must do this before modifying the table.
944 	 */
945 	if (action == HASH_ENTER || action == HASH_ENTER_NULL)
946 	{
947 		/*
948 		 * Can't split if running in partitioned mode, nor if frozen, nor if
949 		 * table is the subject of any active hash_seq_search scans.  Strange
950 		 * order of these tests is to try to check cheaper conditions first.
951 		 */
952 		if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
953 			hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
954 			!has_seq_scans(hashp))
955 			(void) expand_table(hashp);
956 	}
957 
958 	/*
959 	 * Do the initial lookup
960 	 */
961 	bucket = calc_bucket(hctl, hashvalue);
962 
963 	segment_num = bucket >> hashp->sshift;
964 	segment_ndx = MOD(bucket, hashp->ssize);
965 
966 	segp = hashp->dir[segment_num];
967 
968 	if (segp == NULL)
969 		hash_corrupted(hashp);
970 
971 	prevBucketPtr = &segp[segment_ndx];
972 	currBucket = *prevBucketPtr;
973 
974 	/*
975 	 * Follow collision chain looking for matching key
976 	 */
977 	match = hashp->match;		/* save one fetch in inner loop */
978 	keysize = hashp->keysize;	/* ditto */
979 
980 	while (currBucket != NULL)
981 	{
982 		if (currBucket->hashvalue == hashvalue &&
983 			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
984 			break;
985 		prevBucketPtr = &(currBucket->link);
986 		currBucket = *prevBucketPtr;
987 #if HASH_STATISTICS
988 		hash_collisions++;
989 		hctl->collisions++;
990 #endif
991 	}
992 
993 	if (foundPtr)
994 		*foundPtr = (bool) (currBucket != NULL);
995 
996 	/*
997 	 * OK, now what?
998 	 */
999 	switch (action)
1000 	{
1001 		case HASH_FIND:
1002 			if (currBucket != NULL)
1003 				return (void *) ELEMENTKEY(currBucket);
1004 			return NULL;
1005 
1006 		case HASH_REMOVE:
1007 			if (currBucket != NULL)
1008 			{
1009 				/* if partitioned, must lock to touch nentries and freeList */
1010 				if (IS_PARTITIONED(hctl))
1011 					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
1012 
1013 				/* delete the record from the appropriate nentries counter. */
1014 				Assert(hctl->freeList[freelist_idx].nentries > 0);
1015 				hctl->freeList[freelist_idx].nentries--;
1016 
1017 				/* remove record from hash bucket's chain. */
1018 				*prevBucketPtr = currBucket->link;
1019 
1020 				/* add the record to the appropriate freelist. */
1021 				currBucket->link = hctl->freeList[freelist_idx].freeList;
1022 				hctl->freeList[freelist_idx].freeList = currBucket;
1023 
1024 				if (IS_PARTITIONED(hctl))
1025 					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1026 
1027 				/*
1028 				 * better hope the caller is synchronizing access to this
1029 				 * element, because someone else is going to reuse it the next
1030 				 * time something is added to the table
1031 				 */
1032 				return (void *) ELEMENTKEY(currBucket);
1033 			}
1034 			return NULL;
1035 
1036 		case HASH_ENTER_NULL:
1037 			/* ENTER_NULL does not work with palloc-based allocator */
1038 			Assert(hashp->alloc != DynaHashAlloc);
1039 			/* FALL THRU */
1040 
1041 		case HASH_ENTER:
1042 			/* Return existing element if found, else create one */
1043 			if (currBucket != NULL)
1044 				return (void *) ELEMENTKEY(currBucket);
1045 
1046 			/* disallow inserts if frozen */
1047 			if (hashp->frozen)
1048 				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
1049 					 hashp->tabname);
1050 
1051 			currBucket = get_hash_entry(hashp, freelist_idx);
1052 			if (currBucket == NULL)
1053 			{
1054 				/* out of memory */
1055 				if (action == HASH_ENTER_NULL)
1056 					return NULL;
1057 				/* report a generic message */
1058 				if (hashp->isshared)
1059 					ereport(ERROR,
1060 							(errcode(ERRCODE_OUT_OF_MEMORY),
1061 							 errmsg("out of shared memory")));
1062 				else
1063 					ereport(ERROR,
1064 							(errcode(ERRCODE_OUT_OF_MEMORY),
1065 							 errmsg("out of memory")));
1066 			}
1067 
1068 			/* link into hashbucket chain */
1069 			*prevBucketPtr = currBucket;
1070 			currBucket->link = NULL;
1071 
1072 			/* copy key into record */
1073 			currBucket->hashvalue = hashvalue;
1074 			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
1075 
1076 			/*
1077 			 * Caller is expected to fill the data field on return.  DO NOT
1078 			 * insert any code that could possibly throw error here, as doing
1079 			 * so would leave the table entry incomplete and hence corrupt the
1080 			 * caller's data structure.
1081 			 */
1082 
1083 			return (void *) ELEMENTKEY(currBucket);
1084 	}
1085 
1086 	elog(ERROR, "unrecognized hash action code: %d", (int) action);
1087 
1088 	return NULL;				/* keep compiler quiet */
1089 }
1090 
1091 /*
1092  * hash_update_hash_key -- change the hash key of an existing table entry
1093  *
1094  * This is equivalent to removing the entry, making a new entry, and copying
1095  * over its data, except that the entry never goes to the table's freelist.
1096  * Therefore this cannot suffer an out-of-memory failure, even if there are
1097  * other processes operating in other partitions of the hashtable.
1098  *
1099  * Returns TRUE if successful, FALSE if the requested new hash key is already
1100  * present.  Throws error if the specified entry pointer isn't actually a
1101  * table member.
1102  *
1103  * NB: currently, there is no special case for old and new hash keys being
1104  * identical, which means we'll report FALSE for that situation.  This is
1105  * preferable for existing uses.
1106  *
1107  * NB: for a partitioned hashtable, caller must hold lock on both relevant
1108  * partitions, if the new hash key would belong to a different partition.
1109  */
1110 bool
1111 hash_update_hash_key(HTAB *hashp,
1112 					 void *existingEntry,
1113 					 const void *newKeyPtr)
1114 {
1115 	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
1116 	HASHHDR    *hctl = hashp->hctl;
1117 	uint32		newhashvalue;
1118 	Size		keysize;
1119 	uint32		bucket;
1120 	uint32		newbucket;
1121 	long		segment_num;
1122 	long		segment_ndx;
1123 	HASHSEGMENT segp;
1124 	HASHBUCKET	currBucket;
1125 	HASHBUCKET *prevBucketPtr;
1126 	HASHBUCKET *oldPrevPtr;
1127 	HashCompareFunc match;
1128 
1129 #if HASH_STATISTICS
1130 	hash_accesses++;
1131 	hctl->accesses++;
1132 #endif
1133 
1134 	/* disallow updates if frozen */
1135 	if (hashp->frozen)
1136 		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
1137 			 hashp->tabname);
1138 
1139 	/*
1140 	 * Lookup the existing element using its saved hash value.  We need to do
1141 	 * this to be able to unlink it from its hash chain, but as a side benefit
1142 	 * we can verify the validity of the passed existingEntry pointer.
1143 	 */
1144 	bucket = calc_bucket(hctl, existingElement->hashvalue);
1145 
1146 	segment_num = bucket >> hashp->sshift;
1147 	segment_ndx = MOD(bucket, hashp->ssize);
1148 
1149 	segp = hashp->dir[segment_num];
1150 
1151 	if (segp == NULL)
1152 		hash_corrupted(hashp);
1153 
1154 	prevBucketPtr = &segp[segment_ndx];
1155 	currBucket = *prevBucketPtr;
1156 
1157 	while (currBucket != NULL)
1158 	{
1159 		if (currBucket == existingElement)
1160 			break;
1161 		prevBucketPtr = &(currBucket->link);
1162 		currBucket = *prevBucketPtr;
1163 	}
1164 
1165 	if (currBucket == NULL)
1166 		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
1167 			 hashp->tabname);
1168 
1169 	oldPrevPtr = prevBucketPtr;
1170 
1171 	/*
1172 	 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
1173 	 * chain we want to put the entry into.
1174 	 */
1175 	newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
1176 
1177 	newbucket = calc_bucket(hctl, newhashvalue);
1178 
1179 	segment_num = newbucket >> hashp->sshift;
1180 	segment_ndx = MOD(newbucket, hashp->ssize);
1181 
1182 	segp = hashp->dir[segment_num];
1183 
1184 	if (segp == NULL)
1185 		hash_corrupted(hashp);
1186 
1187 	prevBucketPtr = &segp[segment_ndx];
1188 	currBucket = *prevBucketPtr;
1189 
1190 	/*
1191 	 * Follow collision chain looking for matching key
1192 	 */
1193 	match = hashp->match;		/* save one fetch in inner loop */
1194 	keysize = hashp->keysize;	/* ditto */
1195 
1196 	while (currBucket != NULL)
1197 	{
1198 		if (currBucket->hashvalue == newhashvalue &&
1199 			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
1200 			break;
1201 		prevBucketPtr = &(currBucket->link);
1202 		currBucket = *prevBucketPtr;
1203 #if HASH_STATISTICS
1204 		hash_collisions++;
1205 		hctl->collisions++;
1206 #endif
1207 	}
1208 
1209 	if (currBucket != NULL)
1210 		return false;			/* collision with an existing entry */
1211 
1212 	currBucket = existingElement;
1213 
1214 	/*
1215 	 * If old and new hash values belong to the same bucket, we need not
1216 	 * change any chain links, and indeed should not since this simplistic
1217 	 * update will corrupt the list if currBucket is the last element.  (We
1218 	 * cannot fall out earlier, however, since we need to scan the bucket to
1219 	 * check for duplicate keys.)
1220 	 */
1221 	if (bucket != newbucket)
1222 	{
1223 		/* OK to remove record from old hash bucket's chain. */
1224 		*oldPrevPtr = currBucket->link;
1225 
1226 		/* link into new hashbucket chain */
1227 		*prevBucketPtr = currBucket;
1228 		currBucket->link = NULL;
1229 	}
1230 
1231 	/* copy new key into record */
1232 	currBucket->hashvalue = newhashvalue;
1233 	hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
1234 
1235 	/* rest of record is untouched */
1236 
1237 	return true;
1238 }
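/*
 * Illustrative call (entry and newkey are hypothetical; for a partitioned
 * table, both affected partition locks must already be held):
 *
 *		if (!hash_update_hash_key(htab, (void *) entry, &newkey))
 *			elog(ERROR, "new key already present in hashtable");
 */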
1239 
1240 /*
1241  * Allocate a new hashtable entry if possible; return NULL if out of memory.
1242  * (Or, if the underlying space allocator throws error for out-of-memory,
1243  * we won't return at all.)
1244  */
1245 static HASHBUCKET
1246 get_hash_entry(HTAB *hashp, int freelist_idx)
1247 {
1248 	HASHHDR    *hctl = hashp->hctl;
1249 	HASHBUCKET	newElement;
1250 
1251 	for (;;)
1252 	{
1253 		/* if partitioned, must lock to touch nentries and freeList */
1254 		if (IS_PARTITIONED(hctl))
1255 			SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1256 
1257 		/* try to get an entry from the freelist */
1258 		newElement = hctl->freeList[freelist_idx].freeList;
1259 
1260 		if (newElement != NULL)
1261 			break;
1262 
1263 		if (IS_PARTITIONED(hctl))
1264 			SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1265 
1266 		/*
1267 		 * No free elements in this freelist.  In a partitioned table, there
1268 		 * might be entries in other freelists, but to reduce contention we
1269 		 * prefer to first try to get another chunk of buckets from the main
1270 		 * shmem allocator.  If that fails, though, we *MUST* root through all
1271 		 * the other freelists before giving up.  There are multiple callers
1272 		 * that assume that they can allocate every element in the initially
1273 		 * requested table size, or that deleting an element guarantees they
1274 		 * can insert a new element, even if shared memory is entirely full.
1275 		 * Failing because the needed element is in a different freelist is
1276 		 * not acceptable.
1277 		 */
1278 		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
1279 		{
1280 			int			borrow_from_idx;
1281 
1282 			if (!IS_PARTITIONED(hctl))
1283 				return NULL;	/* out of memory */
1284 
1285 			/* try to borrow element from another freelist */
1286 			borrow_from_idx = freelist_idx;
1287 			for (;;)
1288 			{
1289 				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
1290 				if (borrow_from_idx == freelist_idx)
1291 					break;		/* examined all freelists, fail */
1292 
1293 				SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
1294 				newElement = hctl->freeList[borrow_from_idx].freeList;
1295 
1296 				if (newElement != NULL)
1297 				{
1298 					hctl->freeList[borrow_from_idx].freeList = newElement->link;
1299 					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1300 
1301 					/* careful: count the new element in its proper freelist */
1302 					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1303 					hctl->freeList[freelist_idx].nentries++;
1304 					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1305 
1306 					return newElement;
1307 				}
1308 
1309 				SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1310 			}
1311 
1312 			/* no elements available to borrow either, so out of memory */
1313 			return NULL;
1314 		}
1315 	}
1316 
1317 	/* remove entry from freelist, bump nentries */
1318 	hctl->freeList[freelist_idx].freeList = newElement->link;
1319 	hctl->freeList[freelist_idx].nentries++;
1320 
1321 	if (IS_PARTITIONED(hctl))
1322 		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1323 
1324 	return newElement;
1325 }
1326 
1327 /*
1328  * hash_get_num_entries -- get the number of entries in a hashtable
1329  */
1330 long
1331 hash_get_num_entries(HTAB *hashp)
1332 {
1333 	int			i;
1334 	long		sum = hashp->hctl->freeList[0].nentries;
1335 
1336 	/*
1337 	 * We currently don't bother with acquiring the mutexes; it's only
1338 	 * sensible to call this function if you've got lock on all partitions of
1339 	 * the table.
1340 	 */
1341 	if (IS_PARTITIONED(hashp->hctl))
1342 	{
1343 		for (i = 1; i < NUM_FREELISTS; i++)
1344 			sum += hashp->hctl->freeList[i].nentries;
1345 	}
1346 
1347 	return sum;
1348 }
1349 
1350 /*
1351  * hash_seq_init/_search/_term
1352  *			Sequentially search through hash table and return
1353  *			all the elements one by one, return NULL when no more.
1354  *
1355  * hash_seq_term should be called if and only if the scan is abandoned before
1356  * completion; if hash_seq_search returns NULL then it has already done the
1357  * end-of-scan cleanup.
1358  *
1359  * NOTE: caller may delete the returned element before continuing the scan.
1360  * However, deleting any other element while the scan is in progress is
1361  * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
1362  * if elements are added to the table while the scan is in progress, it is
1363  * unspecified whether they will be visited by the scan or not.
1364  *
1365  * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1366  * worry about hash_seq_term cleanup, if the hashtable is first locked against
1367  * further insertions by calling hash_freeze.
1368  *
1369  * NOTE: to use this with a partitioned hashtable, caller had better hold
1370  * at least shared lock on all partitions of the table throughout the scan!
1371  * We can cope with insertions or deletions by our own backend, but *not*
1372  * with concurrent insertions or deletions by another.
1373  */
1374 void
1375 hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1376 {
1377 	status->hashp = hashp;
1378 	status->curBucket = 0;
1379 	status->curEntry = NULL;
1380 	if (!hashp->frozen)
1381 		register_seq_scan(hashp);
1382 }
1383 
1384 void *
1385 hash_seq_search(HASH_SEQ_STATUS *status)
1386 {
1387 	HTAB	   *hashp;
1388 	HASHHDR    *hctl;
1389 	uint32		max_bucket;
1390 	long		ssize;
1391 	long		segment_num;
1392 	long		segment_ndx;
1393 	HASHSEGMENT segp;
1394 	uint32		curBucket;
1395 	HASHELEMENT *curElem;
1396 
1397 	if ((curElem = status->curEntry) != NULL)
1398 	{
1399 		/* Continuing scan of curBucket... */
1400 		status->curEntry = curElem->link;
1401 		if (status->curEntry == NULL)	/* end of this bucket */
1402 			++status->curBucket;
1403 		return (void *) ELEMENTKEY(curElem);
1404 	}
1405 
1406 	/*
1407 	 * Search for next nonempty bucket starting at curBucket.
1408 	 */
1409 	curBucket = status->curBucket;
1410 	hashp = status->hashp;
1411 	hctl = hashp->hctl;
1412 	ssize = hashp->ssize;
1413 	max_bucket = hctl->max_bucket;
1414 
1415 	if (curBucket > max_bucket)
1416 	{
1417 		hash_seq_term(status);
1418 		return NULL;			/* search is done */
1419 	}
1420 
1421 	/*
1422 	 * first find the right segment in the table directory.
1423 	 */
1424 	segment_num = curBucket >> hashp->sshift;
1425 	segment_ndx = MOD(curBucket, ssize);
1426 
1427 	segp = hashp->dir[segment_num];
1428 
1429 	/*
1430 	 * Pick up the first item in this bucket's chain.  If chain is not empty
1431 	 * we can begin searching it.  Otherwise we have to advance to find the
1432 	 * next nonempty bucket.  We try to optimize that case since searching a
1433 	 * near-empty hashtable has to iterate this loop a lot.
1434 	 */
1435 	while ((curElem = segp[segment_ndx]) == NULL)
1436 	{
1437 		/* empty bucket, advance to next */
1438 		if (++curBucket > max_bucket)
1439 		{
1440 			status->curBucket = curBucket;
1441 			hash_seq_term(status);
1442 			return NULL;		/* search is done */
1443 		}
1444 		if (++segment_ndx >= ssize)
1445 		{
1446 			segment_num++;
1447 			segment_ndx = 0;
1448 			segp = hashp->dir[segment_num];
1449 		}
1450 	}
1451 
1452 	/* Begin scan of curBucket... */
1453 	status->curEntry = curElem->link;
1454 	if (status->curEntry == NULL)	/* end of this bucket */
1455 		++curBucket;
1456 	status->curBucket = curBucket;
1457 	return (void *) ELEMENTKEY(curElem);
1458 }
1459 
1460 void
1461 hash_seq_term(HASH_SEQ_STATUS *status)
1462 {
1463 	if (!status->hashp->frozen)
1464 		deregister_seq_scan(status->hashp);
1465 }
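/*
 * Typical scan loop (a sketch; MyEntry is a hypothetical entry type whose
 * key field, relid, is its first member):
 *
 *		HASH_SEQ_STATUS status;
 *		MyEntry    *entry;
 *
 *		hash_seq_init(&status, htab);
 *		while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
 *		{
 *			if (entry->refcount == 0)
 *				hash_search(htab, &entry->relid, HASH_REMOVE, NULL);
 *		}
 *
 * Deleting the just-returned element, as above, is allowed.  If the loop is
 * abandoned before hash_seq_search returns NULL, hash_seq_term(&status)
 * must be called to end the scan cleanly.
 */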
1466 
1467 /*
1468  * hash_freeze
1469  *			Freeze a hashtable against future insertions (deletions are
1470  *			still allowed)
1471  *
1472  * The reason for doing this is that by preventing any more bucket splits,
1473  * we no longer need to worry about registering hash_seq_search scans,
1474  * and thus caller need not be careful about ensuring hash_seq_term gets
1475  * called at the right times.
1476  *
1477  * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1478  * with active scans (since hash_seq_term would then do the wrong thing).
1479  */
1480 void
1481 hash_freeze(HTAB *hashp)
1482 {
1483 	if (hashp->isshared)
1484 		elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
1485 	if (!hashp->frozen && has_seq_scans(hashp))
1486 		elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1487 			 hashp->tabname);
1488 	hashp->frozen = true;
1489 }
1490 
1491 
1492 /********************************* UTILITIES ************************/
1493 
1494 /*
1495  * Expand the table by adding one more hash bucket.
1496  */
1497 static bool
1498 expand_table(HTAB *hashp)
1499 {
1500 	HASHHDR    *hctl = hashp->hctl;
1501 	HASHSEGMENT old_seg,
1502 				new_seg;
1503 	long		old_bucket,
1504 				new_bucket;
1505 	long		new_segnum,
1506 				new_segndx;
1507 	long		old_segnum,
1508 				old_segndx;
1509 	HASHBUCKET *oldlink,
1510 			   *newlink;
1511 	HASHBUCKET	currElement,
1512 				nextElement;
1513 
1514 	Assert(!IS_PARTITIONED(hctl));
1515 
1516 #ifdef HASH_STATISTICS
1517 	hash_expansions++;
1518 #endif
1519 
1520 	new_bucket = hctl->max_bucket + 1;
1521 	new_segnum = new_bucket >> hashp->sshift;
1522 	new_segndx = MOD(new_bucket, hashp->ssize);
1523 
1524 	if (new_segnum >= hctl->nsegs)
1525 	{
1526 		/* Allocate new segment if necessary -- could fail if dir full */
1527 		if (new_segnum >= hctl->dsize)
1528 			if (!dir_realloc(hashp))
1529 				return false;
1530 		if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
1531 			return false;
1532 		hctl->nsegs++;
1533 	}
1534 
1535 	/* OK, we created a new bucket */
1536 	hctl->max_bucket++;
1537 
1538 	/*
1539 	 * *Before* changing masks, find old bucket corresponding to same hash
1540 	 * values; values in that bucket may need to be relocated to new bucket.
1541 	 * Note that new_bucket is certainly larger than low_mask at this point,
1542 	 * so we can skip the first step of the regular hash mask calc.
1543 	 */
1544 	old_bucket = (new_bucket & hctl->low_mask);
1545 
1546 	/*
1547 	 * If we crossed a power of 2, readjust masks.
1548 	 */
1549 	if ((uint32) new_bucket > hctl->high_mask)
1550 	{
1551 		hctl->low_mask = hctl->high_mask;
1552 		hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
1553 	}
1554 
1555 	/*
1556 	 * Relocate records to the new bucket.  NOTE: because of the way the hash
1557 	 * masking is done in calc_bucket, only one old bucket can need to be
1558 	 * split at this point.  With a different way of reducing the hash value,
1559 	 * that might not be true!
1560 	 */
1561 	old_segnum = old_bucket >> hashp->sshift;
1562 	old_segndx = MOD(old_bucket, hashp->ssize);
1563 
1564 	old_seg = hashp->dir[old_segnum];
1565 	new_seg = hashp->dir[new_segnum];
1566 
1567 	oldlink = &old_seg[old_segndx];
1568 	newlink = &new_seg[new_segndx];
1569 
1570 	for (currElement = *oldlink;
1571 		 currElement != NULL;
1572 		 currElement = nextElement)
1573 	{
1574 		nextElement = currElement->link;
1575 		if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
1576 		{
1577 			*oldlink = currElement;
1578 			oldlink = &currElement->link;
1579 		}
1580 		else
1581 		{
1582 			*newlink = currElement;
1583 			newlink = &currElement->link;
1584 		}
1585 	}
1586 	/* don't forget to terminate the rebuilt hash chains... */
1587 	*oldlink = NULL;
1588 	*newlink = NULL;
1589 
1590 	return true;
1591 }
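/*
 * Example of the mask adjustment above: with low_mask = 7 and high_mask = 15,
 * buckets 8..15 are added one at a time without changing the masks.  When
 * new_bucket reaches 16 (> high_mask), old_bucket is computed with the old
 * low_mask as 16 & 7 = 0, the entries of bucket 0 are redistributed between
 * buckets 0 and 16, and the masks become low_mask = 15, high_mask = 31.
 */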
1592 
1593 
1594 static bool
1595 dir_realloc(HTAB *hashp)
1596 {
1597 	HASHSEGMENT *p;
1598 	HASHSEGMENT *old_p;
1599 	long		new_dsize;
1600 	long		old_dirsize;
1601 	long		new_dirsize;
1602 
1603 	if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
1604 		return false;
1605 
1606 	/* Reallocate directory */
1607 	new_dsize = hashp->hctl->dsize << 1;
1608 	old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1609 	new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1610 
1611 	old_p = hashp->dir;
1612 	CurrentDynaHashCxt = hashp->hcxt;
1613 	p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
1614 
1615 	if (p != NULL)
1616 	{
1617 		memcpy(p, old_p, old_dirsize);
1618 		MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
1619 		hashp->dir = p;
1620 		hashp->hctl->dsize = new_dsize;
1621 
1622 		/* XXX assume the allocator is palloc, so we know how to free */
1623 		Assert(hashp->alloc == DynaHashAlloc);
1624 		pfree(old_p);
1625 
1626 		return true;
1627 	}
1628 
1629 	return false;
1630 }
1631 
1632 
1633 static HASHSEGMENT
1634 seg_alloc(HTAB *hashp)
1635 {
1636 	HASHSEGMENT segp;
1637 
	CurrentDynaHashCxt = hashp->hcxt;
	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

	if (!segp)
		return NULL;

	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);

	return segp;
}

/*
 * allocate some new elements and link them into the indicated free list
 */
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	Size		elementSize;
	HASHELEMENT *firstElement;
	HASHELEMENT *tmpElement;
	HASHELEMENT *prevElement;
	int			i;

	if (hashp->isfixed)
		return false;

	/* Each element has a HASHELEMENT header plus user data. */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

	CurrentDynaHashCxt = hashp->hcxt;
	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

	if (!firstElement)
		return false;

	/* prepare to link all the new entries into the freelist */
	prevElement = NULL;
	tmpElement = firstElement;
	for (i = 0; i < nelem; i++)
	{
		tmpElement->link = prevElement;
		prevElement = tmpElement;
		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
	}
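
	/*
	 * At this point prevElement points at the last element of the chunk and
	 * firstElement (the chunk's first element) has a NULL link; that link is
	 * overwritten below when we splice the chunk onto the existing freelist.
	 */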

	/* if partitioned, must lock to touch freeList */
	if (IS_PARTITIONED(hctl))
		SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

	/* freelist could be nonempty if two backends did this concurrently */
	firstElement->link = hctl->freeList[freelist_idx].freeList;
	hctl->freeList[freelist_idx].freeList = prevElement;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return true;
}

/* complain when we have detected a corrupted hashtable */
static void
hash_corrupted(HTAB *hashp)
{
	/*
	 * If the corruption is in a shared hashtable, we'd better force a
	 * systemwide restart.  Otherwise, just shut down this one backend.
	 */
	if (hashp->isshared)
		elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
	else
		elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
}

/* calculate ceil(log base 2) of num */
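/* e.g., my_log2(1) = 0, my_log2(2) = 1, my_log2(3) = 2, my_log2(8) = 3, my_log2(9) = 4 */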
int
my_log2(long num)
{
	int			i;
	long		limit;

	/* guard against too-large input, which would put us into infinite loop */
	if (num > LONG_MAX / 2)
		num = LONG_MAX / 2;

	for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
		;
	return i;
}

/* calculate first power of 2 >= num, bounded to what will fit in a long */
static long
next_pow2_long(long num)
{
	/* my_log2's internal range check is sufficient */
	return 1L << my_log2(num);
}

/* calculate first power of 2 >= num, bounded to what will fit in an int */
static int
next_pow2_int(long num)
{
	if (num > INT_MAX / 2)
		num = INT_MAX / 2;
	return 1 << my_log2(num);
}


/************************* SEQ SCAN TRACKING ************************/

/*
 * We track active hash_seq_search scans here.  The need for this mechanism
 * comes from the fact that a scan will get confused if a bucket split occurs
 * while it's in progress: it might visit entries twice, or even miss some
 * entirely (if it's partway through the same bucket that splits).  Hence
 * we want to inhibit bucket splits if there are any active scans on the
 * table being inserted into.  This is a fairly rare case in current usage,
 * so just postponing the split until the next insertion seems sufficient.
 *
 * Given present usages of the function, only a few scans are likely to be
 * open concurrently; so a finite-size stack of open scans seems sufficient,
 * and we don't worry that linear search is too slow.  Note that we do
 * allow multiple scans of the same hashtable to be open concurrently.
 *
 * This mechanism can support concurrent scan and insertion in a shared
 * hashtable if it's the same backend doing both.  It would fail otherwise,
 * but locking reasons seem to preclude any such scenario anyway, so we don't
 * worry.
 *
 * This arrangement is reasonably robust if a transient hashtable is deleted
 * without notifying us.  The absolute worst case is we might inhibit splits
 * in another table created later at exactly the same address.  We will give
 * a warning at transaction end for reference leaks, so any bugs leading to
 * lack of notification should be easy to catch.
 */
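
/*
 * For reference, a typical scan looks like this (MyEntry, my_hash_table and
 * do_something are of course just illustrative names):
 *
 *		HASH_SEQ_STATUS status;
 *		MyEntry    *entry;
 *
 *		hash_seq_init(&status, my_hash_table);
 *		while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
 *			do_something(entry);
 *
 * Running the scan to completion (hash_seq_search returning NULL) ends it
 * automatically; a scan abandoned before that point must be closed with
 * hash_seq_term(&status), else it will be reported as leaked by the
 * end-of-transaction checks below.
 */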

#define MAX_SEQ_SCANS 100

static HTAB *seq_scan_tables[MAX_SEQ_SCANS];	/* tables being scanned */
static int	seq_scan_level[MAX_SEQ_SCANS];	/* subtransaction nest level */
static int	num_seq_scans = 0;


/* Register a table as having an active hash_seq_search scan */
static void
register_seq_scan(HTAB *hashp)
{
	if (num_seq_scans >= MAX_SEQ_SCANS)
		elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
			 hashp->tabname);
	seq_scan_tables[num_seq_scans] = hashp;
	seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
	num_seq_scans++;
}

/* Deregister an active scan */
static void
deregister_seq_scan(HTAB *hashp)
{
	int			i;

	/* Search backward since it's most likely at the stack top */
	for (i = num_seq_scans - 1; i >= 0; i--)
	{
		if (seq_scan_tables[i] == hashp)
		{
			seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
			seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
			num_seq_scans--;
			return;
		}
	}
	elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
		 hashp->tabname);
}

/* Check if a table has any active scan */
static bool
has_seq_scans(HTAB *hashp)
{
	int			i;

	for (i = 0; i < num_seq_scans; i++)
	{
		if (seq_scan_tables[i] == hashp)
			return true;
	}
	return false;
}

/* Clean up any open scans at end of transaction */
void
AtEOXact_HashTables(bool isCommit)
{
	/*
	 * During abort cleanup, open scans are expected; just silently clean 'em
	 * out.  An open scan at commit means someone forgot a hash_seq_term()
	 * call, so complain.
	 *
	 * Note: it's tempting to try to print the tabname here, but refrain for
	 * fear of touching deallocated memory.  This isn't a user-facing message
	 * anyway, so it needn't be pretty.
	 */
	if (isCommit)
	{
		int			i;

		for (i = 0; i < num_seq_scans; i++)
		{
			elog(WARNING, "leaked hash_seq_search scan for hash table %p",
				 seq_scan_tables[i]);
		}
	}
	num_seq_scans = 0;
}

/* Clean up any open scans at end of subtransaction */
void
AtEOSubXact_HashTables(bool isCommit, int nestDepth)
{
	int			i;

	/*
	 * Search backward to make cleanup easy.  Note we must check all entries,
	 * not only those at the end of the array, because the deletion technique
	 * doesn't keep them in order.
	 */
	for (i = num_seq_scans - 1; i >= 0; i--)
	{
		if (seq_scan_level[i] >= nestDepth)
		{
			if (isCommit)
				elog(WARNING, "leaked hash_seq_search scan for hash table %p",
					 seq_scan_tables[i]);
			seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
			seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
			num_seq_scans--;
		}
	}
}