/*-------------------------------------------------------------------------
 *
 * dynahash.c
 *	  dynamic hash tables
 *
 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
 * shared memory. For shared hash tables, it is the caller's responsibility
 * to provide appropriate access interlocking. The simplest convention is
 * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
 * hash_seq_search) need only shared lock, but any update requires exclusive
 * lock. For heavily-used shared tables, the single-lock approach creates a
 * concurrency bottleneck, so we also support "partitioned" locking wherein
 * there are multiple LWLocks guarding distinct subsets of the table. To use
 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
 * to hash_create. This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
 * (A partitioned table uses multiple copies of those fields, guarded by
 * spinlocks, for additional concurrency.)
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition. We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
 * of the way calc_bucket() maps hash values to bucket numbers.
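 *
 * For example (a sketch, not an excerpt from any particular caller): with
 * 16 partitions, a caller might compute
 *
 *		hashcode = get_hash_value(htab, &key);
 *		partition = hashcode % 16;		(i.e., the low-order four bits)
 *
 * acquire the LWLock guarding that partition, and then call
 * hash_search_with_hash_value(htab, &key, hashcode, action, &found).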
 *
 * For hash tables in shared memory, the memory allocator function should
 * match malloc's semantics of returning NULL on failure. For hash tables
 * in local memory, we typically use palloc() which will throw error on
 * failure. The code in this file has to cope with both cases.
 *
 * dynahash.c provides support for these types of lookup keys:
 *
 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
 * compared as though by strcmp(). This is the default behavior.
 *
 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
 * (Caller must ensure there are no undefined padding bits in the keys!)
 * This is selected by specifying HASH_BLOBS flag to hash_create.
 *
 * 3. More complex key behavior can be selected by specifying user-supplied
 * hashing, comparison, and/or key-copying functions. At least a hashing
 * function must be supplied; comparison defaults to memcmp() and key copying
 * to memcpy() when a user-defined hashing function is selected.
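 *
 * For instance, a table with uint32 keys (case 2) could be created like
 * this (a minimal sketch; "MyEntry" is a hypothetical caller-defined
 * struct whose first field is the uint32 key):
 *
 *		HASHCTL		info;
 *		HTAB	   *htab;
 *
 *		info.keysize = sizeof(uint32);
 *		info.entrysize = sizeof(MyEntry);
 *		htab = hash_create("My lookup table", 128, &info,
 *						   HASH_ELEM | HASH_BLOBS);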
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/hash/dynahash.c
 *
 *-------------------------------------------------------------------------
 */

/*
 * Original comments:
 *
 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
 * Coded into C, with minor code improvements, and with hsearch(3) interface,
 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
 * also, hcreate/hdestroy routines added to simulate hsearch(3).
 *
 * These routines simulate hsearch(3) and family, with the important
 * difference that the hash table is dynamic - can grow indefinitely
 * beyond its original size (as supplied to hcreate()).
 *
 * Performance appears to be comparable to that of hsearch(3).
 * The 'source-code' options referred to in hsearch(3)'s 'man' page
 * are not implemented; otherwise functionality is identical.
 *
 * Compilation controls:
 * HASH_DEBUG controls some informative traces, mainly for debugging.
 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
 * when combined with HASH_DEBUG, these are displayed by hdestroy().
 *
 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
 * concatenation property, in probably unnecessary code 'optimization'.
 *
 * Modified margo@postgres.berkeley.edu February 1990
 *		added multiple table interface
 * Modified by sullivan@postgres.berkeley.edu April 1990
 *		changed ctl structure for shared memory
 */

#include "postgres.h"

#include <limits.h>

#include "access/xact.h"
#include "common/hashfn.h"
#include "port/pg_bitutils.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"


/*
 * Constants
 *
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers. The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address. The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!). For non-shared hash
 * tables, the initial directory size can be left at the default.
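 *
 * With the defaults below, for example, an expansible-directory table can
 * typically address up to DEF_DIRSIZE * DEF_SEGSIZE = 256 * 256 = 65536
 * buckets before dir_realloc must double dsize.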
 */
#define DEF_SEGSIZE			   256
#define DEF_SEGSIZE_SHIFT	   8	/* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE			   256
#define DEF_FFACTOR			   1	/* default fill factor */

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS			32

/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;

/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking. Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
	slock_t		mutex;			/* spinlock for this freelist */
	long		nentries;		/* number of entries in associated buckets */
	HASHELEMENT *freeList;		/* chain of free elements */
} FreeListData;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct. For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
	/*
	 * The freelist can become a point of contention in high-concurrency hash
	 * tables, so we use an array of freelists, each with its own mutex and
	 * nentries count, instead of just a single one. Although the freelists
	 * normally operate independently, we will scavenge entries from freelists
	 * other than a hashcode's default freelist when necessary.
	 *
	 * If the hash table is not partitioned, only freeList[0] is used and its
	 * spinlock is not used at all; callers' locking is assumed sufficient.
	 */
	FreeListData freeList[NUM_FREELISTS];

	/* These fields can change, but not in a partitioned table */
	/* Also, dsize can't change in a shared table, even if unpartitioned */
	long		dsize;			/* directory size */
	long		nsegs;			/* number of allocated segments (<= dsize) */
	uint32		max_bucket;		/* ID of maximum bucket in use */
	uint32		high_mask;		/* mask to modulo into entire table */
	uint32		low_mask;		/* mask to modulo into lower half of table */

	/* These fields are fixed at hashtable creation */
	Size		keysize;		/* hash key length in bytes */
	Size		entrysize;		/* total user element size in bytes */
	long		num_partitions; /* # partitions (must be power of 2), or 0 */
	long		ffactor;		/* target fill factor */
	long		max_dsize;		/* 'dsize' limit if directory is fixed size */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
	int			nelem_alloc;	/* number of entries to allocate at once */

#ifdef HASH_STATISTICS

	/*
	 * Count statistics here. NB: stats code doesn't bother with mutex, so
	 * counts could be corrupted a bit in a partitioned table.
	 */
	long		accesses;
	long		collisions;
#endif
};

#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

#define FREELIST_IDX(hctl, hashcode) \
	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)

/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 */
struct HTAB
{
	HASHHDR    *hctl;			/* => shared control information */
	HASHSEGMENT *dir;			/* directory of segment starts */
	HashValueFunc hash;			/* hash function */
	HashCompareFunc match;		/* key comparison function */
	HashCopyFunc keycopy;		/* key copying function */
	HashAllocFunc alloc;		/* memory allocator */
	MemoryContext hcxt;			/* memory context if default allocator used */
	char	   *tabname;		/* table name (for error messages) */
	bool		isshared;		/* true if table is in shared memory */
	bool		isfixed;		/* if true, don't enlarge */

	/* freezing a shared table isn't allowed, so we can keep state here */
	bool		frozen;			/* true = no more inserts allowed */

	/* We keep local copies of these fixed values to reduce contention */
	Size		keysize;		/* hash key length in bytes */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
};

/*
 * Key (also entry) part of a HASHELEMENT
 */
#define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Obtain element pointer given pointer to key
 */
#define ELEMENT_FROM_KEY(key)  \
	((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))

/*
 * Fast MOD arithmetic, assuming that y is a power of 2 !
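 * (e.g., MOD(13, 8) = 13 & 7 = 5)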
 */
#define MOD(x,y)			   ((x) & ((y)-1))

#ifdef HASH_STATISTICS
static long hash_accesses,
			hash_collisions,
			hash_expansions;
#endif

/*
 * Private function prototypes
 */
static void *DynaHashAlloc(Size size);
static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
static void hdefault(HTAB *hashp);
static int	choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
static void hash_corrupted(HTAB *hashp);
static long next_pow2_long(long num);
static int	next_pow2_int(long num);
static void register_seq_scan(HTAB *hashp);
static void deregister_seq_scan(HTAB *hashp);
static bool has_seq_scans(HTAB *hashp);


/*
 * memory allocation support
 */
static MemoryContext CurrentDynaHashCxt = NULL;

static void *
DynaHashAlloc(Size size)
{
	Assert(MemoryContextIsValid(CurrentDynaHashCxt));
	return MemoryContextAlloc(CurrentDynaHashCxt, size);
}


/*
 * HashCompareFunc for string keys
 *
 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 * bytes, so we can only compare that many ... hence strncmp is almost but
 * not quite the right thing.
 */
static int
string_compare(const char *key1, const char *key2, Size keysize)
{
	return strncmp(key1, key2, keysize - 1);
}


/************************** CREATE ROUTINES **********************/

/*
 * hash_create -- create a new dynamic hash table
 *
 *	tabname: a name for the table (for debugging purposes)
 *	nelem: maximum number of elements expected
 *	*info: additional table parameters, as indicated by flags
 *	flags: bitmask indicating which parameters to take from *info
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly. But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded. An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
HTAB *
hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;

	/*
	 * For shared hash tables, we have a local hash header (HTAB struct) that
	 * we allocate in TopMemoryContext; all else is in shared memory.
	 *
	 * For non-shared hash tables, everything including the hash header is in
	 * a memory context created specially for the hash table --- this makes
	 * hash_destroy very simple. The memory context is made a child of either
	 * a context specified by the caller, or TopMemoryContext if nothing is
	 * specified.
	 */
	if (flags & HASH_SHARED_MEM)
	{
		/* Set up to allocate the hash header */
		CurrentDynaHashCxt = TopMemoryContext;
	}
	else
	{
		/* Create the hash table's private memory context */
		if (flags & HASH_CONTEXT)
			CurrentDynaHashCxt = info->hcxt;
		else
			CurrentDynaHashCxt = TopMemoryContext;
		CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
												   "dynahash",
												   ALLOCSET_DEFAULT_SIZES);
	}

	/* Initialize the hash header, plus a copy of the table name */
	hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
	MemSet(hashp, 0, sizeof(HTAB));

	hashp->tabname = (char *) (hashp + 1);
	strcpy(hashp->tabname, tabname);

	/* If we have a private context, label it with hashtable's name */
	if (!(flags & HASH_SHARED_MEM))
		MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);

	/*
	 * Select the appropriate hash function (see comments at head of file).
	 */
	if (flags & HASH_FUNCTION)
		hashp->hash = info->hash;
	else if (flags & HASH_BLOBS)
	{
		/* We can optimize hashing for common key sizes */
		Assert(flags & HASH_ELEM);
		if (info->keysize == sizeof(uint32))
			hashp->hash = uint32_hash;
		else
			hashp->hash = tag_hash;
	}
	else
		hashp->hash = string_hash;	/* default hash function */

	/*
	 * If you don't specify a match function, it defaults to string_compare if
	 * you used string_hash (either explicitly or by default) and to memcmp
	 * otherwise.
	 *
	 * Note: explicitly specifying string_hash is deprecated, because this
	 * might not work for callers in loadable modules on some platforms due to
	 * referencing a trampoline instead of the string_hash function proper.
	 * Just let it default, eh?
	 */
	if (flags & HASH_COMPARE)
		hashp->match = info->match;
	else if (hashp->hash == string_hash)
		hashp->match = (HashCompareFunc) string_compare;
	else
		hashp->match = memcmp;

	/*
	 * Similarly, the key-copying function defaults to strlcpy or memcpy.
	 */
	if (flags & HASH_KEYCOPY)
		hashp->keycopy = info->keycopy;
	else if (hashp->hash == string_hash)
		hashp->keycopy = (HashCopyFunc) strlcpy;
	else
		hashp->keycopy = memcpy;

	/* And select the entry allocation function, too. */
	if (flags & HASH_ALLOC)
		hashp->alloc = info->alloc;
	else
		hashp->alloc = DynaHashAlloc;

	if (flags & HASH_SHARED_MEM)
	{
		/*
		 * ctl structure and directory are preallocated for shared memory
		 * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
		 * well.
		 */
		hashp->hctl = info->hctl;
		hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
		hashp->hcxt = NULL;
		hashp->isshared = true;

		/* hash table already exists, we're just attaching to it */
		if (flags & HASH_ATTACH)
		{
			/* make local copies of some heavily-used values */
			hctl = hashp->hctl;
			hashp->keysize = hctl->keysize;
			hashp->ssize = hctl->ssize;
			hashp->sshift = hctl->sshift;

			return hashp;
		}
	}
	else
	{
		/* setup hash table defaults */
		hashp->hctl = NULL;
		hashp->dir = NULL;
		hashp->hcxt = CurrentDynaHashCxt;
		hashp->isshared = false;
	}

	if (!hashp->hctl)
	{
		hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
		if (!hashp->hctl)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	hashp->frozen = false;

	hdefault(hashp);

	hctl = hashp->hctl;

	if (flags & HASH_PARTITION)
	{
		/* Doesn't make sense to partition a local hash table */
		Assert(flags & HASH_SHARED_MEM);

		/*
		 * The number of partitions had better be a power of 2. Also, it must
		 * be less than INT_MAX (see init_htab()), so call the int version of
		 * next_pow2.
		 */
		Assert(info->num_partitions == next_pow2_int(info->num_partitions));

		hctl->num_partitions = info->num_partitions;
	}

	if (flags & HASH_SEGMENT)
	{
		hctl->ssize = info->ssize;
		hctl->sshift = my_log2(info->ssize);
		/* ssize had better be a power of 2 */
		Assert(hctl->ssize == (1L << hctl->sshift));
	}
	if (flags & HASH_FFACTOR)
		hctl->ffactor = info->ffactor;

	/*
	 * SHM hash tables have fixed directory size passed by the caller.
	 */
	if (flags & HASH_DIRSIZE)
	{
		hctl->max_dsize = info->max_dsize;
		hctl->dsize = info->dsize;
	}

	/*
	 * hash table now allocates space for key and data but you have to say how
	 * much space to allocate
	 */
	if (flags & HASH_ELEM)
	{
		Assert(info->entrysize >= info->keysize);
		hctl->keysize = info->keysize;
		hctl->entrysize = info->entrysize;
	}

	/* make local copies of heavily-used constant fields */
	hashp->keysize = hctl->keysize;
	hashp->ssize = hctl->ssize;
	hashp->sshift = hctl->sshift;

	/* Build the hash directory structure */
	if (!init_htab(hashp, nelem))
		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

	/*
	 * For a shared hash table, preallocate the requested number of elements.
	 * This reduces problems with run-time out-of-shared-memory conditions.
	 *
	 * For a non-shared hash table, preallocate the requested number of
	 * elements if it's less than our chosen nelem_alloc. This avoids wasting
	 * space if the caller correctly estimates a small table size.
	 */
	if ((flags & HASH_SHARED_MEM) ||
		nelem < hctl->nelem_alloc)
	{
		int			i,
					freelist_partitions,
					nelem_alloc,
					nelem_alloc_first;

		/*
		 * If hash table is partitioned, give each freelist an equal share of
		 * the initial allocation. Otherwise only freeList[0] is used.
		 */
		if (IS_PARTITIONED(hashp->hctl))
			freelist_partitions = NUM_FREELISTS;
		else
			freelist_partitions = 1;

		nelem_alloc = nelem / freelist_partitions;
		if (nelem_alloc <= 0)
			nelem_alloc = 1;

		/*
		 * Make sure we'll allocate all the requested elements; freeList[0]
		 * gets the excess if the request isn't divisible by NUM_FREELISTS.
		 */
		if (nelem_alloc * freelist_partitions < nelem)
			nelem_alloc_first =
				nelem - nelem_alloc * (freelist_partitions - 1);
		else
			nelem_alloc_first = nelem_alloc;

		for (i = 0; i < freelist_partitions; i++)
		{
			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

			if (!element_alloc(hashp, temp, i))
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
		}
	}

	if (flags & HASH_FIXED_SIZE)
		hashp->isfixed = true;
	return hashp;
}

/*
 * Set default HASHHDR parameters.
 */
static void
hdefault(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;

	MemSet(hctl, 0, sizeof(HASHHDR));

	hctl->dsize = DEF_DIRSIZE;
	hctl->nsegs = 0;

	/* rather pointless defaults for key & entry size */
	hctl->keysize = sizeof(char *);
	hctl->entrysize = 2 * sizeof(char *);

	hctl->num_partitions = 0;	/* not partitioned */

	hctl->ffactor = DEF_FFACTOR;

	/* table has no fixed maximum size */
	hctl->max_dsize = NO_MAX_DSIZE;

	hctl->ssize = DEF_SEGSIZE;
	hctl->sshift = DEF_SEGSIZE_SHIFT;

#ifdef HASH_STATISTICS
	hctl->accesses = hctl->collisions = 0;
#endif
}

/*
 * Given the user-specified entry size, choose nelem_alloc, ie, how many
 * elements to add to the hash table when we need more.
 */
static int
choose_nelem_alloc(Size entrysize)
{
	int			nelem_alloc;
	Size		elementSize;
	Size		allocSize;

	/* Each element has a HASHELEMENT header plus user data. */
	/* NB: this had better match element_alloc() */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);

	/*
	 * The idea here is to choose nelem_alloc at least 32, but round up so
	 * that the allocation request will be a power of 2 or just less. This
	 * makes little difference for hash tables in shared memory, but for hash
	 * tables managed by palloc, the allocation request will be rounded up to
	 * a power of 2 anyway. If we fail to take this into account, we'll waste
	 * as much as half the allocated space.
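	 *
	 * For example (a worked case): if elementSize is 48 bytes, the loop
	 * below tries allocSize = 256, 512, 1024, 2048, giving nelem_alloc =
	 * 5, 10, 21, 42; it stops at 42, the first result >= 32, and
	 * 42 * 48 = 2016 bytes nearly fills the 2048-byte power-of-2 request.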
	 */
	allocSize = 32 * 4;			/* assume elementSize at least 8 */
	do
	{
		allocSize <<= 1;
		nelem_alloc = allocSize / elementSize;
	} while (nelem_alloc < 32);

	return nelem_alloc;
}

/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays
 */
static bool
init_htab(HTAB *hashp, long nelem)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT *segp;
	int			nbuckets;
	int			nsegs;
	int			i;

	/*
	 * initialize mutexes if it's a partitioned table
	 */
	if (IS_PARTITIONED(hctl))
		for (i = 0; i < NUM_FREELISTS; i++)
			SpinLockInit(&(hctl->freeList[i].mutex));

	/*
	 * Divide number of elements by the fill factor to determine a desired
	 * number of buckets. Allocate space for the next greater power of two
	 * number of buckets
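	 * (for example, nelem = 1000 with the default ffactor of 1 yields
	 * nbuckets = 1024).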
	 */
	nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1);

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
	hctl->high_mask = (nbuckets << 1) - 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / hctl->ssize + 1;
	nsegs = next_pow2_int(nsegs);

	/*
	 * Make sure directory is big enough. If pre-allocated directory is too
	 * small, choke (caller screwed up).
	 */
	if (nsegs > hctl->dsize)
	{
		if (!(hashp->dir))
			hctl->dsize = nsegs;
		else
			return false;
	}

	/* Allocate a directory */
	if (!(hashp->dir))
	{
		CurrentDynaHashCxt = hashp->hcxt;
		hashp->dir = (HASHSEGMENT *)
			hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
		if (!hashp->dir)
			return false;
	}

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
	{
		*segp = seg_alloc(hashp);
		if (*segp == NULL)
			return false;
	}

	/* Choose number of entries to allocate at a time */
	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

#ifdef HASH_DEBUG
	fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
			"TABLE POINTER   ", hashp,
			"DIRECTORY SIZE  ", hctl->dsize,
			"SEGMENT SIZE    ", hctl->ssize,
			"SEGMENT SHIFT   ", hctl->sshift,
			"FILL FACTOR     ", hctl->ffactor,
			"MAX BUCKET      ", hctl->max_bucket,
			"HIGH MASK       ", hctl->high_mask,
			"LOW  MASK       ", hctl->low_mask,
			"NSEGS           ", hctl->nsegs);
#endif
	return true;
}

/*
 * Estimate the space needed for a hashtable containing the given number
 * of entries of given size.
 * NOTE: this is used to estimate the footprint of hashtables in shared
 * memory; therefore it does not count HTAB which is in local memory.
 * NB: assumes that all hash structure parameters have default values!
 */
Size
hash_estimate_size(long num_entries, Size entrysize)
{
	Size		size;
	long		nBuckets,
				nSegments,
				nDirEntries,
				nElementAllocs,
				elementSize,
				elementAllocCnt;

	/* estimate number of buckets wanted */
	nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
	/* # of segments needed for nBuckets */
	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = DEF_DIRSIZE;
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	/* fixed control info */
	size = MAXALIGN(sizeof(HASHHDR));	/* but not HTAB, per above */
	/* directory */
	size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
	/* segments */
	size = add_size(size, mul_size(nSegments,
								   MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
	/* elements --- allocated in groups of choose_nelem_alloc() entries */
	elementAllocCnt = choose_nelem_alloc(entrysize);
	nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
	size = add_size(size,
					mul_size(nElementAllocs,
							 mul_size(elementAllocCnt, elementSize)));

	return size;
}

/*
 * Select an appropriate directory size for a hashtable with the given
 * maximum number of entries.
 * This is only needed for hashtables in shared memory, whose directories
 * cannot be expanded dynamically.
 * NB: assumes that all hash structure parameters have default values!
 *
 * XXX this had better agree with the behavior of init_htab()...
 */
long
hash_select_dirsize(long num_entries)
{
	long		nBuckets,
				nSegments,
				nDirEntries;

	/* estimate number of buckets wanted */
	nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
	/* # of segments needed for nBuckets */
	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = DEF_DIRSIZE;
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	return nDirEntries;
}

/*
 * Compute the required initial memory allocation for a shared-memory
 * hashtable with the given parameters. We need space for the HASHHDR
 * and for the (non expansible) directory.
 */
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
	Assert(flags & HASH_DIRSIZE);
	Assert(info->dsize == info->max_dsize);
	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}


/********************** DESTROY ROUTINES ************************/

void
hash_destroy(HTAB *hashp)
{
	if (hashp != NULL)
	{
		/* allocation method must be one we know how to free, too */
		Assert(hashp->alloc == DynaHashAlloc);
		/* so this hashtable must have its own context */
		Assert(hashp->hcxt != NULL);

		hash_stats("destroy", hashp);

		/*
		 * Free everything by destroying the hash table's memory context.
		 */
		MemoryContextDelete(hashp->hcxt);
	}
}

void
hash_stats(const char *where, HTAB *hashp)
{
#ifdef HASH_STATISTICS
	fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
			where, hashp->hctl->accesses, hashp->hctl->collisions);

	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
			hashp->hctl->max_bucket, hashp->hctl->nsegs);
	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
			where, hash_accesses, hash_collisions);
	fprintf(stderr, "hash_stats: total expansions %ld\n",
			hash_expansions);
#endif
}

/*******************************SEARCH ROUTINES *****************************/


/*
 * get_hash_value -- exported routine to calculate a key's hash value
 *
 * We export this because for partitioned tables, callers need to compute
 * the partition number (from the low-order bits of the hash value) before
 * searching.
 */
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
	return hashp->hash(keyPtr, hashp->keysize);
}

/* Convert a hash value to a bucket number */
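/*
 * This is the linear-hashing address calculation (after Larson's paper cited
 * in the header): reduce the hash modulo the next power of 2 above the
 * current table size, and if that names a bucket that doesn't exist yet
 * (i.e., is above max_bucket), drop back to the lower power of 2. A worked
 * case: with max_bucket = 10, low_mask = 7, high_mask = 15, a hash of 13
 * gives 13 & 15 = 13 > 10, hence bucket 13 & 7 = 5, while a hash of 9
 * gives bucket 9 directly.
 */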
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
{
	uint32		bucket;

	bucket = hash_val & hctl->high_mask;
	if (bucket > hctl->max_bucket)
		bucket = bucket & hctl->low_mask;

	return bucket;
}

/*
 * hash_search -- look up key in table and perform action
 * hash_search_with_hash_value -- same, with key's hash value already computed
 *
 * action is one of:
 *		HASH_FIND: look up key in table
 *		HASH_ENTER: look up key in table, creating entry if not present
 *		HASH_ENTER_NULL: same, but return NULL if out of memory
 *		HASH_REMOVE: look up key in table, remove entry if present
 *
 * Return value is a pointer to the element found/entered/removed if any,
 * or NULL if no match was found. (NB: in the case of the REMOVE action,
 * the result is a dangling pointer that shouldn't be dereferenced!)
 *
 * HASH_ENTER will normally ereport a generic "out of memory" error if
 * it is unable to create a new entry. The HASH_ENTER_NULL operation is
 * the same except it will return NULL if out of memory. Note that
 * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
 * since palloc internally ereports on out-of-memory.
 *
 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
 * existing entry in the table, false otherwise. This is needed in the
 * HASH_ENTER case, but is redundant with the return value otherwise.
 *
 * For hash_search_with_hash_value, the hashvalue parameter must have been
 * calculated with get_hash_value().
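 *
 * Typical insertion usage looks like this (a sketch; "MyEntry" is a
 * hypothetical entry struct whose first field is the key):
 *
 *		bool		found;
 *		MyEntry    *entry;
 *
 *		entry = (MyEntry *) hash_search(htab, &key, HASH_ENTER, &found);
 *		if (!found)
 *			... initialize entry's non-key fields ...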
 */
void *
hash_search(HTAB *hashp,
			const void *keyPtr,
			HASHACTION action,
			bool *foundPtr)
{
	return hash_search_with_hash_value(hashp,
									   keyPtr,
									   hashp->hash(keyPtr, hashp->keysize),
									   action,
									   foundPtr);
}

void *
hash_search_with_hash_value(HTAB *hashp,
							const void *keyPtr,
							uint32 hashvalue,
							HASHACTION action,
							bool *foundPtr)
{
	HASHHDR    *hctl = hashp->hctl;
	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
	Size		keysize;
	uint32		bucket;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/*
	 * If inserting, check if it is time to split a bucket.
	 *
	 * NOTE: failure to expand table is not a fatal error, it just means we
	 * have to run at higher fill factor than we wanted. However, if we're
	 * using the palloc allocator then it will throw error anyway on
	 * out-of-memory, so we must do this before modifying the table.
	 */
	if (action == HASH_ENTER || action == HASH_ENTER_NULL)
	{
		/*
		 * Can't split if running in partitioned mode, nor if frozen, nor if
		 * table is the subject of any active hash_seq_search scans. Strange
		 * order of these tests is to try to check cheaper conditions first.
		 */
		if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
			hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
			!has_seq_scans(hashp))
			(void) expand_table(hashp);
	}

	/*
	 * Do the initial lookup
	 */
	bucket = calc_bucket(hctl, hashvalue);

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == hashvalue &&
			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (foundPtr)
		*foundPtr = (bool) (currBucket != NULL);

	/*
	 * OK, now what?
	 */
	switch (action)
	{
		case HASH_FIND:
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);
			return NULL;

		case HASH_REMOVE:
			if (currBucket != NULL)
			{
				/* if partitioned, must lock to touch nentries and freeList */
				if (IS_PARTITIONED(hctl))
					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

				/* delete the record from the appropriate nentries counter. */
				Assert(hctl->freeList[freelist_idx].nentries > 0);
				hctl->freeList[freelist_idx].nentries--;

				/* remove record from hash bucket's chain. */
				*prevBucketPtr = currBucket->link;

				/* add the record to the appropriate freelist. */
				currBucket->link = hctl->freeList[freelist_idx].freeList;
				hctl->freeList[freelist_idx].freeList = currBucket;

				if (IS_PARTITIONED(hctl))
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

				/*
				 * better hope the caller is synchronizing access to this
				 * element, because someone else is going to reuse it the next
				 * time something is added to the table
				 */
				return (void *) ELEMENTKEY(currBucket);
			}
			return NULL;

		case HASH_ENTER_NULL:
			/* ENTER_NULL does not work with palloc-based allocator */
			Assert(hashp->alloc != DynaHashAlloc);
			/* FALL THRU */

		case HASH_ENTER:
			/* Return existing element if found, else create one */
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);

			/* disallow inserts if frozen */
			if (hashp->frozen)
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
					 hashp->tabname);

			currBucket = get_hash_entry(hashp, freelist_idx);
			if (currBucket == NULL)
			{
				/* out of memory */
				if (action == HASH_ENTER_NULL)
					return NULL;
				/* report a generic message */
				if (hashp->isshared)
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of shared memory")));
				else
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of memory")));
			}

			/* link into hashbucket chain */
			*prevBucketPtr = currBucket;
			currBucket->link = NULL;

			/* copy key into record */
			currBucket->hashvalue = hashvalue;
			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

			/*
			 * Caller is expected to fill the data field on return. DO NOT
			 * insert any code that could possibly throw error here, as doing
			 * so would leave the table entry incomplete and hence corrupt the
			 * caller's data structure.
			 */

			return (void *) ELEMENTKEY(currBucket);
	}

	elog(ERROR, "unrecognized hash action code: %d", (int) action);

	return NULL;				/* keep compiler quiet */
}

/*
 * hash_update_hash_key -- change the hash key of an existing table entry
 *
 * This is equivalent to removing the entry, making a new entry, and copying
 * over its data, except that the entry never goes to the table's freelist.
 * Therefore this cannot suffer an out-of-memory failure, even if there are
 * other processes operating in other partitions of the hashtable.
 *
 * Returns true if successful, false if the requested new hash key is already
 * present. Throws error if the specified entry pointer isn't actually a
 * table member.
 *
 * NB: currently, there is no special case for old and new hash keys being
 * identical, which means we'll report false for that situation. This is
 * preferable for existing uses.
 *
 * NB: for a partitioned hashtable, caller must hold lock on both relevant
 * partitions, if the new hash key would belong to a different partition.
 */
bool
hash_update_hash_key(HTAB *hashp,
					 void *existingEntry,
					 const void *newKeyPtr)
{
	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
	HASHHDR    *hctl = hashp->hctl;
	uint32		newhashvalue;
	Size		keysize;
	uint32		bucket;
	uint32		newbucket;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HASHBUCKET *oldPrevPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/* disallow updates if frozen */
	if (hashp->frozen)
		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
			 hashp->tabname);

	/*
	 * Lookup the existing element using its saved hash value. We need to do
	 * this to be able to unlink it from its hash chain, but as a side benefit
	 * we can verify the validity of the passed existingEntry pointer.
	 */
	bucket = calc_bucket(hctl, existingElement->hashvalue);

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	while (currBucket != NULL)
	{
		if (currBucket == existingElement)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
	}

	if (currBucket == NULL)
		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
			 hashp->tabname);

	oldPrevPtr = prevBucketPtr;

	/*
	 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
	 * chain we want to put the entry into.
	 */
	newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);

	newbucket = calc_bucket(hctl, newhashvalue);

	segment_num = newbucket >> hashp->sshift;
	segment_ndx = MOD(newbucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == newhashvalue &&
			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (currBucket != NULL)
		return false;			/* collision with an existing entry */

	currBucket = existingElement;

	/*
	 * If old and new hash values belong to the same bucket, we need not
	 * change any chain links, and indeed should not since this simplistic
	 * update will corrupt the list if currBucket is the last element. (We
	 * cannot fall out earlier, however, since we need to scan the bucket to
	 * check for duplicate keys.)
	 */
	if (bucket != newbucket)
	{
		/* OK to remove record from old hash bucket's chain. */
		*oldPrevPtr = currBucket->link;

		/* link into new hashbucket chain */
		*prevBucketPtr = currBucket;
		currBucket->link = NULL;
	}

	/* copy new key into record */
	currBucket->hashvalue = newhashvalue;
	hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);

	/* rest of record is untouched */

	return true;
}

/*
 * Allocate a new hashtable entry if possible; return NULL if out of memory.
 * (Or, if the underlying space allocator throws error for out-of-memory,
 * we won't return at all.)
 */
static HASHBUCKET
get_hash_entry(HTAB *hashp, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHBUCKET	newElement;

	for (;;)
	{
		/* if partitioned, must lock to touch nentries and freeList */
		if (IS_PARTITIONED(hctl))
			SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

		/* try to get an entry from the freelist */
		newElement = hctl->freeList[freelist_idx].freeList;

		if (newElement != NULL)
			break;

		if (IS_PARTITIONED(hctl))
			SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

		/*
		 * No free elements in this freelist. In a partitioned table, there
		 * might be entries in other freelists, but to reduce contention we
		 * prefer to first try to get another chunk of buckets from the main
		 * shmem allocator. If that fails, though, we *MUST* root through all
		 * the other freelists before giving up. There are multiple callers
		 * that assume that they can allocate every element in the initially
		 * requested table size, or that deleting an element guarantees they
		 * can insert a new element, even if shared memory is entirely full.
		 * Failing because the needed element is in a different freelist is
		 * not acceptable.
		 */
		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
		{
			int			borrow_from_idx;

			if (!IS_PARTITIONED(hctl))
				return NULL;	/* out of memory */

			/* try to borrow element from another freelist */
			borrow_from_idx = freelist_idx;
			for (;;)
			{
				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
				if (borrow_from_idx == freelist_idx)
					break;		/* examined all freelists, fail */

				SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
				newElement = hctl->freeList[borrow_from_idx].freeList;

				if (newElement != NULL)
				{
					hctl->freeList[borrow_from_idx].freeList = newElement->link;
					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));

					/* careful: count the new element in its proper freelist */
					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
					hctl->freeList[freelist_idx].nentries++;
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

					return newElement;
				}

				SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
			}

			/* no elements available to borrow either, so out of memory */
			return NULL;
		}
	}

	/* remove entry from freelist, bump nentries */
	hctl->freeList[freelist_idx].freeList = newElement->link;
	hctl->freeList[freelist_idx].nentries++;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return newElement;
}

/*
 * hash_get_num_entries -- get the number of entries in a hashtable
 */
long
hash_get_num_entries(HTAB *hashp)
{
	int			i;
	long		sum = hashp->hctl->freeList[0].nentries;

	/*
	 * We currently don't bother with acquiring the mutexes; it's only
	 * sensible to call this function if you've got lock on all partitions of
	 * the table.
	 */
	if (IS_PARTITIONED(hashp->hctl))
	{
		for (i = 1; i < NUM_FREELISTS; i++)
			sum += hashp->hctl->freeList[i].nentries;
	}

	return sum;
}

/*
 * hash_seq_init/_search/_term
 *			Sequentially search through hash table and return
 *			all the elements one by one, return NULL when no more.
 *
 * hash_seq_term should be called if and only if the scan is abandoned before
 * completion; if hash_seq_search returns NULL then it has already done the
 * end-of-scan cleanup.
 *
 * NOTE: caller may delete the returned element before continuing the scan.
 * However, deleting any other element while the scan is in progress is
 * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
 * if elements are added to the table while the scan is in progress, it is
 * unspecified whether they will be visited by the scan or not.
 *
 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
 * worry about hash_seq_term cleanup, if the hashtable is first locked against
 * further insertions by calling hash_freeze.
 *
 * NOTE: to use this with a partitioned hashtable, caller had better hold
 * at least shared lock on all partitions of the table throughout the scan!
 * We can cope with insertions or deletions by our own backend, but *not*
 * with concurrent insertions or deletions by another.
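 *
 * The canonical complete-scan loop is therefore (a sketch; "MyEntry" is a
 * hypothetical entry type):
 *
 *		HASH_SEQ_STATUS status;
 *		MyEntry    *entry;
 *
 *		hash_seq_init(&status, htab);
 *		while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
 *			... process entry ...
 *
 * No hash_seq_term call is needed when such a loop runs to completion.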
 */
void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
{
	status->hashp = hashp;
	status->curBucket = 0;
	status->curEntry = NULL;
	if (!hashp->frozen)
		register_seq_scan(hashp);
}

void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	uint32		max_bucket;
	long		ssize;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	uint32		curBucket;
	HASHELEMENT *curElem;

	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return (void *) ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	curBucket = status->curBucket;
	hashp = status->hashp;
	hctl = hashp->hctl;
	ssize = hashp->ssize;
	max_bucket = hctl->max_bucket;

	if (curBucket > max_bucket)
	{
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> hashp->sshift;
	segment_ndx = MOD(curBucket, ssize);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain. If chain is not empty
	 * we can begin searching it. Otherwise we have to advance to find the
	 * next nonempty bucket. We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			status->curBucket = curBucket;
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		if (++segment_ndx >= ssize)
		{
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)	/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return (void *) ELEMENTKEY(curElem);
}

void
hash_seq_term(HASH_SEQ_STATUS *status)
{
	if (!status->hashp->frozen)
		deregister_seq_scan(status->hashp);
}

/*
 * hash_freeze
 *			Freeze a hashtable against future insertions (deletions are
 *			still allowed)
 *
 * The reason for doing this is that by preventing any more bucket splits,
 * we no longer need to worry about registering hash_seq_search scans,
 * and thus caller need not be careful about ensuring hash_seq_term gets
 * called at the right times.
 *
 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
 * with active scans (since hash_seq_term would then do the wrong thing).
 */
void
hash_freeze(HTAB *hashp)
{
	if (hashp->isshared)
		elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
	if (!hashp->frozen && has_seq_scans(hashp))
		elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
			 hashp->tabname);
	hashp->frozen = true;
}


/********************************* UTILITIES ************************/

/*
 * Expand the table by adding one more hash bucket.
 */
static bool
expand_table(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT old_seg,
				new_seg;
	long		old_bucket,
				new_bucket;
	long		new_segnum,
				new_segndx;
	long		old_segnum,
				old_segndx;
	HASHBUCKET *oldlink,
			   *newlink;
	HASHBUCKET	currElement,
				nextElement;

	Assert(!IS_PARTITIONED(hctl));

#ifdef HASH_STATISTICS
	hash_expansions++;
#endif

	new_bucket = hctl->max_bucket + 1;
	new_segnum = new_bucket >> hashp->sshift;
	new_segndx = MOD(new_bucket, hashp->ssize);

	if (new_segnum >= hctl->nsegs)
	{
		/* Allocate new segment if necessary -- could fail if dir full */
		if (new_segnum >= hctl->dsize)
			if (!dir_realloc(hashp))
				return false;
		if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
			return false;
		hctl->nsegs++;
	}

	/* OK, we created a new bucket */
	hctl->max_bucket++;

	/*
	 * *Before* changing masks, find old bucket corresponding to same hash
	 * values; values in that bucket may need to be relocated to new bucket.
	 * Note that new_bucket is certainly larger than low_mask at this point,
	 * so we can skip the first step of the regular hash mask calc.
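	 *
	 * A worked case: if new_bucket is 10 (binary 1010) and low_mask is 7,
	 * the bucket being split is 1010 & 0111 = 2; entries currently chained
	 * in bucket 2 get redistributed between buckets 2 and 10 below.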
	 */
	old_bucket = (new_bucket & hctl->low_mask);

	/*
	 * If we crossed a power of 2, readjust masks.
	 */
	if ((uint32) new_bucket > hctl->high_mask)
	{
		hctl->low_mask = hctl->high_mask;
		hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
	}

	/*
	 * Relocate records to the new bucket. NOTE: because of the way the hash
	 * masking is done in calc_bucket, only one old bucket can need to be
	 * split at this point. With a different way of reducing the hash value,
	 * that might not be true!
	 */
	old_segnum = old_bucket >> hashp->sshift;
	old_segndx = MOD(old_bucket, hashp->ssize);

	old_seg = hashp->dir[old_segnum];
	new_seg = hashp->dir[new_segnum];

	oldlink = &old_seg[old_segndx];
	newlink = &new_seg[new_segndx];

	for (currElement = *oldlink;
		 currElement != NULL;
		 currElement = nextElement)
	{
		nextElement = currElement->link;
		if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
		{
			*oldlink = currElement;
			oldlink = &currElement->link;
		}
		else
		{
			*newlink = currElement;
			newlink = &currElement->link;
		}
	}
	/* don't forget to terminate the rebuilt hash chains... */
	*oldlink = NULL;
	*newlink = NULL;

	return true;
}


static bool
dir_realloc(HTAB *hashp)
{
	HASHSEGMENT *p;
	HASHSEGMENT *old_p;
	long		new_dsize;
	long		old_dirsize;
	long		new_dirsize;

	if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
		return false;

	/* Reallocate directory */
	new_dsize = hashp->hctl->dsize << 1;
	old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
	new_dirsize = new_dsize * sizeof(HASHSEGMENT);

	old_p = hashp->dir;
	CurrentDynaHashCxt = hashp->hcxt;
	p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);

	if (p != NULL)
	{
		memcpy(p, old_p, old_dirsize);
		MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
		hashp->dir = p;
		hashp->hctl->dsize = new_dsize;

		/* XXX assume the allocator is palloc, so we know how to free */
		Assert(hashp->alloc == DynaHashAlloc);
		pfree(old_p);

		return true;
	}

	return false;
}


static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
	HASHSEGMENT segp;

	CurrentDynaHashCxt = hashp->hcxt;
	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

	if (!segp)
		return NULL;

	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);

	return segp;
}

/*
 * allocate some new elements and link them into the indicated free list
 */
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	Size		elementSize;
	HASHELEMENT *firstElement;
	HASHELEMENT *tmpElement;
	HASHELEMENT *prevElement;
	int			i;

	if (hashp->isfixed)
		return false;

	/* Each element has a HASHELEMENT header plus user data. */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

	CurrentDynaHashCxt = hashp->hcxt;
	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

	if (!firstElement)
		return false;

	/* prepare to link all the new entries into the freelist */
	prevElement = NULL;
	tmpElement = firstElement;
	for (i = 0; i < nelem; i++)
	{
		tmpElement->link = prevElement;
		prevElement = tmpElement;
		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
	}

	/* if partitioned, must lock to touch freeList */
	if (IS_PARTITIONED(hctl))
		SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

	/* freelist could be nonempty if two backends did this concurrently */
	firstElement->link = hctl->freeList[freelist_idx].freeList;
	hctl->freeList[freelist_idx].freeList = prevElement;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return true;
}

/* complain when we have detected a corrupted hashtable */
static void
hash_corrupted(HTAB *hashp)
{
	/*
	 * If the corruption is in a shared hashtable, we'd better force a
	 * systemwide restart. Otherwise, just shut down this one backend.
	 */
	if (hashp->isshared)
		elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
	else
		elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
}

/* calculate ceil(log base 2) of num */
int
my_log2(long num)
{
	/*
	 * guard against too-large input, which would be invalid for
	 * pg_ceil_log2_*()
	 */
	if (num > LONG_MAX / 2)
		num = LONG_MAX / 2;

#if SIZEOF_LONG < 8
	return pg_ceil_log2_32(num);
#else
	return pg_ceil_log2_64(num);
#endif
}

/* calculate first power of 2 >= num, bounded to what will fit in a long */
static long
next_pow2_long(long num)
{
	/* my_log2's internal range check is sufficient */
	return 1L << my_log2(num);
}

/* calculate first power of 2 >= num, bounded to what will fit in an int */
static int
next_pow2_int(long num)
{
	if (num > INT_MAX / 2)
		num = INT_MAX / 2;
	return 1 << my_log2(num);
}


/************************* SEQ SCAN TRACKING ************************/

/*
 * We track active hash_seq_search scans here. The need for this mechanism
 * comes from the fact that a scan will get confused if a bucket split occurs
 * while it's in progress: it might visit entries twice, or even miss some
 * entirely (if it's partway through the same bucket that splits). Hence
 * we want to inhibit bucket splits if there are any active scans on the
 * table being inserted into. This is a fairly rare case in current usage,
 * so just postponing the split until the next insertion seems sufficient.
 *
 * Given present usages of the function, only a few scans are likely to be
 * open concurrently; so a finite-size stack of open scans seems sufficient,
 * and we don't worry that linear search is too slow. Note that we do
 * allow multiple scans of the same hashtable to be open concurrently.
 *
 * This mechanism can support concurrent scan and insertion in a shared
 * hashtable if it's the same backend doing both. It would fail otherwise,
 * but locking reasons seem to preclude any such scenario anyway, so we don't
 * worry.
 *
 * This arrangement is reasonably robust if a transient hashtable is deleted
 * without notifying us. The absolute worst case is we might inhibit splits
 * in another table created later at exactly the same address. We will give
 * a warning at transaction end for reference leaks, so any bugs leading to
 * lack of notification should be easy to catch.
 */

#define MAX_SEQ_SCANS 100

static HTAB *seq_scan_tables[MAX_SEQ_SCANS];	/* tables being scanned */
static int	seq_scan_level[MAX_SEQ_SCANS];	/* subtransaction nest level */
static int	num_seq_scans = 0;


/* Register a table as having an active hash_seq_search scan */
static void
register_seq_scan(HTAB *hashp)
{
	if (num_seq_scans >= MAX_SEQ_SCANS)
		elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
			 hashp->tabname);
	seq_scan_tables[num_seq_scans] = hashp;
	seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
	num_seq_scans++;
}

/* Deregister an active scan */
static void
deregister_seq_scan(HTAB *hashp)
{
	int			i;

	/* Search backward since it's most likely at the stack top */
	for (i = num_seq_scans - 1; i >= 0; i--)
	{
		if (seq_scan_tables[i] == hashp)
		{
			seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
			seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
			num_seq_scans--;
			return;
		}
	}
	elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
		 hashp->tabname);
}

/* Check if a table has any active scan */
static bool
has_seq_scans(HTAB *hashp)
{
	int			i;

	for (i = 0; i < num_seq_scans; i++)
	{
		if (seq_scan_tables[i] == hashp)
			return true;
	}
	return false;
}

/* Clean up any open scans at end of transaction */
void
AtEOXact_HashTables(bool isCommit)
{
	/*
	 * During abort cleanup, open scans are expected; just silently clean 'em
	 * out. An open scan at commit means someone forgot a hash_seq_term()
	 * call, so complain.
	 *
	 * Note: it's tempting to try to print the tabname here, but refrain for
	 * fear of touching deallocated memory. This isn't a user-facing message
	 * anyway, so it needn't be pretty.
	 */
	if (isCommit)
	{
		int			i;

		for (i = 0; i < num_seq_scans; i++)
		{
			elog(WARNING, "leaked hash_seq_search scan for hash table %p",
				 seq_scan_tables[i]);
		}
	}
	num_seq_scans = 0;
}

/* Clean up any open scans at end of subtransaction */
void
AtEOSubXact_HashTables(bool isCommit, int nestDepth)
{
	int			i;

	/*
	 * Search backward to make cleanup easy. Note we must check all entries,
	 * not only those at the end of the array, because deletion technique
	 * doesn't keep them in order.
	 */
	for (i = num_seq_scans - 1; i >= 0; i--)
	{
		if (seq_scan_level[i] >= nestDepth)
		{
			if (isCommit)
				elog(WARNING, "leaked hash_seq_search scan for hash table %p",
					 seq_scan_tables[i]);
			seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
			seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
			num_seq_scans--;
		}
	}
}