1 /*-------------------------------------------------------------------------
2 *
3 * dynahash.c
4 * dynamic hash tables
5 *
6 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7 * shared memory. For shared hash tables, it is the caller's responsibility
8 * to provide appropriate access interlocking. The simplest convention is
9 * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
10 * hash_seq_search) need only shared lock, but any update requires exclusive
11 * lock. For heavily-used shared tables, the single-lock approach creates a
12 * concurrency bottleneck, so we also support "partitioned" locking wherein
13 * there are multiple LWLocks guarding distinct subsets of the table. To use
14 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15 * to hash_create. This prevents any attempt to split buckets on-the-fly.
16 * Therefore, each hash bucket chain operates independently, and no fields
17 * of the hash header change after init except nentries and freeList.
18 * (A partitioned table uses multiple copies of those fields, guarded by
19 * spinlocks, for additional concurrency.)
20 * This lets any subset of the hash buckets be treated as a separately
21 * lockable partition. We expect callers to use the low-order bits of a
22 * lookup key's hash value as a partition number --- this will work because
23 * of the way calc_bucket() maps hash values to bucket numbers.
24 *
25 * For hash tables in shared memory, the memory allocator function should
26 * match malloc's semantics of returning NULL on failure. For hash tables
27 * in local memory, we typically use palloc() which will throw error on
28 * failure. The code in this file has to cope with both cases.
29 *
30 * dynahash.c provides support for these types of lookup keys:
31 *
32 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
33 * compared as though by strcmp(). This is the default behavior.
34 *
35 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
36 * (Caller must ensure there are no undefined padding bits in the keys!)
37 * This is selected by specifying HASH_BLOBS flag to hash_create.
38 *
39 * 3. More complex key behavior can be selected by specifying user-supplied
40 * hashing, comparison, and/or key-copying functions. At least a hashing
41 * function must be supplied; comparison defaults to memcmp() and key copying
42 * to memcpy() when a user-defined hashing function is selected.
43 *
44 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
45 * Portions Copyright (c) 1994, Regents of the University of California
46 *
47 *
48 * IDENTIFICATION
49 * src/backend/utils/hash/dynahash.c
50 *
51 *-------------------------------------------------------------------------
52 */
53
54 /*
55 * Original comments:
56 *
57 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
58 * Coded into C, with minor code improvements, and with hsearch(3) interface,
59 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
60 * also, hcreate/hdestroy routines added to simulate hsearch(3).
61 *
62 * These routines simulate hsearch(3) and family, with the important
63 * difference that the hash table is dynamic - can grow indefinitely
64 * beyond its original size (as supplied to hcreate()).
65 *
66 * Performance appears to be comparable to that of hsearch(3).
67 * The 'source-code' options referred to in hsearch(3)'s 'man' page
68 * are not implemented; otherwise functionality is identical.
69 *
70 * Compilation controls:
71 * HASH_DEBUG controls some informative traces, mainly for debugging.
72 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
73 * when combined with HASH_DEBUG, these are displayed by hdestroy().
74 *
75 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
76 * concatenation property, in probably unnecessary code 'optimization'.
77 *
78 * Modified margo@postgres.berkeley.edu February 1990
79 * added multiple table interface
80 * Modified by sullivan@postgres.berkeley.edu April 1990
81 * changed ctl structure for shared memory
82 */
83
84 #include "postgres.h"
85
86 #include <limits.h>
87
88 #include "access/xact.h"
89 #include "storage/shmem.h"
90 #include "storage/spin.h"
91 #include "utils/dynahash.h"
92 #include "utils/memutils.h"
93
94
95 /*
96 * Constants
97 *
98 * A hash table has a top-level "directory", each of whose entries points
99 * to a "segment" of ssize bucket headers. The maximum number of hash
100 * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
101 * the number of records in the table can be larger, but we don't want a
102 * whole lot of records per bucket or performance goes down.
103 *
104 * In a hash table allocated in shared memory, the directory cannot be
105 * expanded because it must stay at a fixed address. The directory size
106 * should be selected using hash_select_dirsize (and you'd better have
107 * a good idea of the maximum number of entries!). For non-shared hash
108 * tables, the initial directory size can be left at the default.
109 */
110 #define DEF_SEGSIZE 256
111 #define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
112 #define DEF_DIRSIZE 256
113 #define DEF_FFACTOR 1 /* default fill factor */
114
115 /* Number of freelists to be used for a partitioned hash table. */
116 #define NUM_FREELISTS 32
117
118 /* A hash bucket is a linked list of HASHELEMENTs */
119 typedef HASHELEMENT *HASHBUCKET;
120
121 /* A hash segment is an array of bucket headers */
122 typedef HASHBUCKET *HASHSEGMENT;
123
124 /*
125 * Per-freelist data.
126 *
127 * In a partitioned hash table, each freelist is associated with a specific
128 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
129 * nentries tracks the number of live hashtable entries having those hashcodes
130 * (NOT the number of entries in the freelist, as you might expect).
131 *
132 * The coverage of a freelist might be more or less than one partition, so it
133 * needs its own lock rather than relying on caller locking. Relying on that
134 * wouldn't work even if the coverage was the same, because of the occasional
135 * need to "borrow" entries from another freelist; see get_hash_entry().
136 *
137 * Using an array of FreeListData instead of separate arrays of mutexes,
138 * nentries and freeLists helps to reduce sharing of cache lines between
139 * different mutexes.
140 */
141 typedef struct
142 {
143 slock_t mutex; /* spinlock for this freelist */
144 long nentries; /* number of entries in associated buckets */
145 HASHELEMENT *freeList; /* chain of free elements */
146 } FreeListData;
147
148 /*
149 * Header structure for a hash table --- contains all changeable info
150 *
151 * In a shared-memory hash table, the HASHHDR is in shared memory, while
152 * each backend has a local HTAB struct. For a non-shared table, there isn't
153 * any functional difference between HASHHDR and HTAB, but we separate them
154 * anyway to share code between shared and non-shared tables.
155 */
156 struct HASHHDR
157 {
158 /*
159 * The freelist can become a point of contention in high-concurrency hash
160 * tables, so we use an array of freelists, each with its own mutex and
161 * nentries count, instead of just a single one. Although the freelists
162 * normally operate independently, we will scavenge entries from freelists
163 * other than a hashcode's default freelist when necessary.
164 *
165 * If the hash table is not partitioned, only freeList[0] is used and its
166 * spinlock is not used at all; callers' locking is assumed sufficient.
167 */
168 FreeListData freeList[NUM_FREELISTS];
169
170 /* These fields can change, but not in a partitioned table */
171 /* Also, dsize can't change in a shared table, even if unpartitioned */
172 long dsize; /* directory size */
173 long nsegs; /* number of allocated segments (<= dsize) */
174 uint32 max_bucket; /* ID of maximum bucket in use */
175 uint32 high_mask; /* mask to modulo into entire table */
176 uint32 low_mask; /* mask to modulo into lower half of table */
177
178 /* These fields are fixed at hashtable creation */
179 Size keysize; /* hash key length in bytes */
180 Size entrysize; /* total user element size in bytes */
181 long num_partitions; /* # partitions (must be power of 2), or 0 */
182 long ffactor; /* target fill factor */
183 long max_dsize; /* 'dsize' limit if directory is fixed size */
184 long ssize; /* segment size --- must be power of 2 */
185 int sshift; /* segment shift = log2(ssize) */
186 int nelem_alloc; /* number of entries to allocate at once */
187
188 #ifdef HASH_STATISTICS
189
190 /*
191 * Count statistics here. NB: stats code doesn't bother with mutex, so
192 * counts could be corrupted a bit in a partitioned table.
193 */
194 long accesses;
195 long collisions;
196 #endif
197 };
198
199 #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
200
201 #define FREELIST_IDX(hctl, hashcode) \
202 (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
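/*
 * Worked example (editorial): with NUM_FREELISTS = 32, a hashcode of
 * 0x12345678 in a partitioned table maps to freelist 0x12345678 % 32 = 24;
 * an unpartitioned table always uses freeList[0].
 */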
203
204 /*
205 * Top control structure for a hashtable --- in a shared table, each backend
206 * has its own copy (OK since no fields change at runtime)
207 */
208 struct HTAB
209 {
210 HASHHDR *hctl; /* => shared control information */
211 HASHSEGMENT *dir; /* directory of segment starts */
212 HashValueFunc hash; /* hash function */
213 HashCompareFunc match; /* key comparison function */
214 HashCopyFunc keycopy; /* key copying function */
215 HashAllocFunc alloc; /* memory allocator */
216 MemoryContext hcxt; /* memory context if default allocator used */
217 char *tabname; /* table name (for error messages) */
218 bool isshared; /* true if table is in shared memory */
219 bool isfixed; /* if true, don't enlarge */
220
221 /* freezing a shared table isn't allowed, so we can keep state here */
222 bool frozen; /* true = no more inserts allowed */
223
224 /* We keep local copies of these fixed values to reduce contention */
225 Size keysize; /* hash key length in bytes */
226 long ssize; /* segment size --- must be power of 2 */
227 int sshift; /* segment shift = log2(ssize) */
228 };
229
230 /*
231 * Key (also entry) part of a HASHELEMENT
232 */
233 #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
234
235 /*
236 * Obtain element pointer given pointer to key
237 */
238 #define ELEMENT_FROM_KEY(key) \
239 ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
240
241 /*
242 * Fast MOD arithmetic, assuming that y is a power of 2 !
243 */
244 #define MOD(x,y) ((x) & ((y)-1))
245
246 #if HASH_STATISTICS
247 static long hash_accesses,
248 hash_collisions,
249 hash_expansions;
250 #endif
251
252 /*
253 * Private function prototypes
254 */
255 static void *DynaHashAlloc(Size size);
256 static HASHSEGMENT seg_alloc(HTAB *hashp);
257 static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
258 static bool dir_realloc(HTAB *hashp);
259 static bool expand_table(HTAB *hashp);
260 static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
261 static void hdefault(HTAB *hashp);
262 static int choose_nelem_alloc(Size entrysize);
263 static bool init_htab(HTAB *hashp, long nelem);
264 static void hash_corrupted(HTAB *hashp);
265 static long next_pow2_long(long num);
266 static int next_pow2_int(long num);
267 static void register_seq_scan(HTAB *hashp);
268 static void deregister_seq_scan(HTAB *hashp);
269 static bool has_seq_scans(HTAB *hashp);
270
271
272 /*
273 * memory allocation support
274 */
275 static MemoryContext CurrentDynaHashCxt = NULL;
276
277 static void *
278 DynaHashAlloc(Size size)
279 {
280 Assert(MemoryContextIsValid(CurrentDynaHashCxt));
281 return MemoryContextAlloc(CurrentDynaHashCxt, size);
282 }
283
284
285 /*
286 * HashCompareFunc for string keys
287 *
288 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
289 * bytes, so we can only compare that many ... hence strncmp is almost but
290 * not quite the right thing.
291 */
292 static int
293 string_compare(const char *key1, const char *key2, Size keysize)
294 {
295 return strncmp(key1, key2, keysize - 1);
296 }
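/*
 * Worked example (editorial): with keysize = 8, the keys "postgres" and
 * "postgrey" are both stored truncated to "postgre" (7 characters plus a
 * terminating NUL), so comparing only keysize - 1 bytes correctly reports
 * them as equal, where a full strcmp() of the originals would not.
 */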
297
298
299 /************************** CREATE ROUTINES **********************/
300
301 /*
302 * hash_create -- create a new dynamic hash table
303 *
304 * tabname: a name for the table (for debugging purposes)
305 * nelem: maximum number of elements expected
306 * *info: additional table parameters, as indicated by flags
307 * flags: bitmask indicating which parameters to take from *info
308 *
309 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
310 * estimate, since we can't expand the table on the fly. But an unshared
311 * hashtable can be expanded on-the-fly, so it's better for nelem to be
312 * on the small side and let the table grow if it's exceeded. An overly
313 * large nelem will penalize hash_seq_search speed without buying much.
314 */
315 HTAB *
316 hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
317 {
318 HTAB *hashp;
319 HASHHDR *hctl;
320
321 /*
322 * For shared hash tables, we have a local hash header (HTAB struct) that
323 * we allocate in TopMemoryContext; all else is in shared memory.
324 *
325 * For non-shared hash tables, everything including the hash header is in
326 * a memory context created specially for the hash table --- this makes
327 * hash_destroy very simple. The memory context is made a child of either
328 * a context specified by the caller, or TopMemoryContext if nothing is
329 * specified.
330 */
331 if (flags & HASH_SHARED_MEM)
332 {
333 /* Set up to allocate the hash header */
334 CurrentDynaHashCxt = TopMemoryContext;
335 }
336 else
337 {
338 /* Create the hash table's private memory context */
339 if (flags & HASH_CONTEXT)
340 CurrentDynaHashCxt = info->hcxt;
341 else
342 CurrentDynaHashCxt = TopMemoryContext;
343 CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
344 tabname,
345 ALLOCSET_DEFAULT_SIZES);
346 }
347
348 /* Initialize the hash header, plus a copy of the table name */
349 hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
350 MemSet(hashp, 0, sizeof(HTAB));
351
352 hashp->tabname = (char *) (hashp + 1);
353 strcpy(hashp->tabname, tabname);
354
355 /*
356 * Select the appropriate hash function (see comments at head of file).
357 */
358 if (flags & HASH_FUNCTION)
359 hashp->hash = info->hash;
360 else if (flags & HASH_BLOBS)
361 {
362 /* We can optimize hashing for common key sizes */
363 Assert(flags & HASH_ELEM);
364 if (info->keysize == sizeof(uint32))
365 hashp->hash = uint32_hash;
366 else
367 hashp->hash = tag_hash;
368 }
369 else
370 hashp->hash = string_hash; /* default hash function */
371
372 /*
373 * If you don't specify a match function, it defaults to string_compare if
374 * you used string_hash (either explicitly or by default) and to memcmp
375 * otherwise.
376 *
377 * Note: explicitly specifying string_hash is deprecated, because this
378 * might not work for callers in loadable modules on some platforms due to
379 * referencing a trampoline instead of the string_hash function proper.
380 * Just let it default, eh?
381 */
382 if (flags & HASH_COMPARE)
383 hashp->match = info->match;
384 else if (hashp->hash == string_hash)
385 hashp->match = (HashCompareFunc) string_compare;
386 else
387 hashp->match = memcmp;
388
389 /*
390 * Similarly, the key-copying function defaults to strlcpy or memcpy.
391 */
392 if (flags & HASH_KEYCOPY)
393 hashp->keycopy = info->keycopy;
394 else if (hashp->hash == string_hash)
395 hashp->keycopy = (HashCopyFunc) strlcpy;
396 else
397 hashp->keycopy = memcpy;
398
399 /* And select the entry allocation function, too. */
400 if (flags & HASH_ALLOC)
401 hashp->alloc = info->alloc;
402 else
403 hashp->alloc = DynaHashAlloc;
404
405 if (flags & HASH_SHARED_MEM)
406 {
407 /*
408 * ctl structure and directory are preallocated for shared memory
409 * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
410 * well.
411 */
412 hashp->hctl = info->hctl;
413 hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
414 hashp->hcxt = NULL;
415 hashp->isshared = true;
416
417 /* hash table already exists, we're just attaching to it */
418 if (flags & HASH_ATTACH)
419 {
420 /* make local copies of some heavily-used values */
421 hctl = hashp->hctl;
422 hashp->keysize = hctl->keysize;
423 hashp->ssize = hctl->ssize;
424 hashp->sshift = hctl->sshift;
425
426 return hashp;
427 }
428 }
429 else
430 {
431 /* setup hash table defaults */
432 hashp->hctl = NULL;
433 hashp->dir = NULL;
434 hashp->hcxt = CurrentDynaHashCxt;
435 hashp->isshared = false;
436 }
437
438 if (!hashp->hctl)
439 {
440 hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
441 if (!hashp->hctl)
442 ereport(ERROR,
443 (errcode(ERRCODE_OUT_OF_MEMORY),
444 errmsg("out of memory")));
445 }
446
447 hashp->frozen = false;
448
449 hdefault(hashp);
450
451 hctl = hashp->hctl;
452
453 if (flags & HASH_PARTITION)
454 {
455 /* Doesn't make sense to partition a local hash table */
456 Assert(flags & HASH_SHARED_MEM);
457
458 /*
459 * The number of partitions had better be a power of 2. Also, it must
460 * be less than INT_MAX (see init_htab()), so call the int version of
461 * next_pow2.
462 */
463 Assert(info->num_partitions == next_pow2_int(info->num_partitions));
464
465 hctl->num_partitions = info->num_partitions;
466 }
467
468 if (flags & HASH_SEGMENT)
469 {
470 hctl->ssize = info->ssize;
471 hctl->sshift = my_log2(info->ssize);
472 /* ssize had better be a power of 2 */
473 Assert(hctl->ssize == (1L << hctl->sshift));
474 }
475 if (flags & HASH_FFACTOR)
476 hctl->ffactor = info->ffactor;
477
478 /*
479 * SHM hash tables have fixed directory size passed by the caller.
480 */
481 if (flags & HASH_DIRSIZE)
482 {
483 hctl->max_dsize = info->max_dsize;
484 hctl->dsize = info->dsize;
485 }
486
487 /*
488 * The hash table allocates space for both key and data, but the caller
489 * must say how much space to allocate
490 */
491 if (flags & HASH_ELEM)
492 {
493 Assert(info->entrysize >= info->keysize);
494 hctl->keysize = info->keysize;
495 hctl->entrysize = info->entrysize;
496 }
497
498 /* make local copies of heavily-used constant fields */
499 hashp->keysize = hctl->keysize;
500 hashp->ssize = hctl->ssize;
501 hashp->sshift = hctl->sshift;
502
503 /* Build the hash directory structure */
504 if (!init_htab(hashp, nelem))
505 elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
506
507 /*
508 * For a shared hash table, preallocate the requested number of elements.
509 * This reduces problems with run-time out-of-shared-memory conditions.
510 *
511 * For a non-shared hash table, preallocate the requested number of
512 * elements if it's less than our chosen nelem_alloc. This avoids wasting
513 * space if the caller correctly estimates a small table size.
514 */
515 if ((flags & HASH_SHARED_MEM) ||
516 nelem < hctl->nelem_alloc)
517 {
518 int i,
519 freelist_partitions,
520 nelem_alloc,
521 nelem_alloc_first;
522
523 /*
524 * If hash table is partitioned, give each freelist an equal share of
525 * the initial allocation. Otherwise only freeList[0] is used.
526 */
527 if (IS_PARTITIONED(hashp->hctl))
528 freelist_partitions = NUM_FREELISTS;
529 else
530 freelist_partitions = 1;
531
532 nelem_alloc = nelem / freelist_partitions;
533 if (nelem_alloc <= 0)
534 nelem_alloc = 1;
535
536 /*
537 * Make sure we'll allocate all the requested elements; freeList[0]
538 * gets the excess if the request isn't divisible by NUM_FREELISTS.
539 */
540 if (nelem_alloc * freelist_partitions < nelem)
541 nelem_alloc_first =
542 nelem - nelem_alloc * (freelist_partitions - 1);
543 else
544 nelem_alloc_first = nelem_alloc;
545
546 for (i = 0; i < freelist_partitions; i++)
547 {
548 int temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
549
550 if (!element_alloc(hashp, temp, i))
551 ereport(ERROR,
552 (errcode(ERRCODE_OUT_OF_MEMORY),
553 errmsg("out of memory")));
554 }
555 }
556
557 if (flags & HASH_FIXED_SIZE)
558 hashp->isfixed = true;
559 return hashp;
560 }
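/*
 * Editorial sketch (not part of the original file): minimal hash_create
 * usage for a local table with binary uint32 keys, per the conventions
 * described above.  The entry struct, table name, and the
 * DYNAHASH_EXAMPLE_CODE guard are hypothetical.
 */
#ifdef DYNAHASH_EXAMPLE_CODE
typedef struct ExampleEntry
{
	uint32		key;			/* lookup key; must be first field */
	int			value;			/* caller's payload */
} ExampleEntry;

static HTAB *
example_create(void)
{
	HASHCTL		ctl;

	MemSet(&ctl, 0, sizeof(ctl));
	ctl.keysize = sizeof(uint32);
	ctl.entrysize = sizeof(ExampleEntry);

	/* HASH_ELEM supplies the sizes; HASH_BLOBS selects memcmp()-style keys */
	return hash_create("example table", 128, &ctl, HASH_ELEM | HASH_BLOBS);
}
#endif							/* DYNAHASH_EXAMPLE_CODE */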
561
562 /*
563 * Set default HASHHDR parameters.
564 */
565 static void
566 hdefault(HTAB *hashp)
567 {
568 HASHHDR *hctl = hashp->hctl;
569
570 MemSet(hctl, 0, sizeof(HASHHDR));
571
572 hctl->dsize = DEF_DIRSIZE;
573 hctl->nsegs = 0;
574
575 /* rather pointless defaults for key & entry size */
576 hctl->keysize = sizeof(char *);
577 hctl->entrysize = 2 * sizeof(char *);
578
579 hctl->num_partitions = 0; /* not partitioned */
580
581 hctl->ffactor = DEF_FFACTOR;
582
583 /* table has no fixed maximum size */
584 hctl->max_dsize = NO_MAX_DSIZE;
585
586 hctl->ssize = DEF_SEGSIZE;
587 hctl->sshift = DEF_SEGSIZE_SHIFT;
588
589 #ifdef HASH_STATISTICS
590 hctl->accesses = hctl->collisions = 0;
591 #endif
592 }
593
594 /*
595 * Given the user-specified entry size, choose nelem_alloc, ie, how many
596 * elements to add to the hash table when we need more.
597 */
598 static int
599 choose_nelem_alloc(Size entrysize)
600 {
601 int nelem_alloc;
602 Size elementSize;
603 Size allocSize;
604
605 /* Each element has a HASHELEMENT header plus user data. */
606 /* NB: this had better match element_alloc() */
607 elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
608
609 /*
610 * The idea here is to choose nelem_alloc at least 32, but round up so
611 * that the allocation request will be a power of 2 or just less. This
612 * makes little difference for hash tables in shared memory, but for hash
613 * tables managed by palloc, the allocation request will be rounded up to
614 * a power of 2 anyway. If we fail to take this into account, we'll waste
615 * as much as half the allocated space.
616 */
617 allocSize = 32 * 4; /* assume elementSize at least 8 */
618 do
619 {
620 allocSize <<= 1;
621 nelem_alloc = allocSize / elementSize;
622 } while (nelem_alloc < 32);
623
624 return nelem_alloc;
625 }
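/*
 * Worked example (editorial, assuming a 64-bit platform where
 * sizeof(HASHELEMENT) is 16 and MAXALIGN rounds up to a multiple of 8):
 * for entrysize = 40, elementSize = 16 + 40 = 56.  allocSize doubles
 * 128 -> 256 -> 512 -> 1024 -> 2048; nelem_alloc = 2048 / 56 = 36 is the
 * first result >= 32, so we allocate 36 elements (2016 bytes) at a time,
 * just under a power of 2 as intended.
 */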
626
627 /*
628 * Compute derived fields of hctl and build the initial directory/segment
629 * arrays
630 */
631 static bool
632 init_htab(HTAB *hashp, long nelem)
633 {
634 HASHHDR *hctl = hashp->hctl;
635 HASHSEGMENT *segp;
636 int nbuckets;
637 int nsegs;
638 int i;
639
640 /*
641 * initialize mutexes if it's a partitioned table
642 */
643 if (IS_PARTITIONED(hctl))
644 for (i = 0; i < NUM_FREELISTS; i++)
645 SpinLockInit(&(hctl->freeList[i].mutex));
646
647 /*
648 * Divide number of elements by the fill factor to determine a desired
649 * number of buckets. Allocate space for the next greater power of two
650 * number of buckets
651 */
652 nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1);
653
654 /*
655 * In a partitioned table, nbuckets must be at least equal to
656 * num_partitions; were it less, keys with apparently different partition
657 * numbers would map to the same bucket, breaking partition independence.
658 * (Normally nbuckets will be much bigger; this is just a safety check.)
659 */
660 while (nbuckets < hctl->num_partitions)
661 nbuckets <<= 1;
662
663 hctl->max_bucket = hctl->low_mask = nbuckets - 1;
664 hctl->high_mask = (nbuckets << 1) - 1;
665
666 /*
667 * Figure number of directory segments needed, round up to a power of 2
668 */
669 nsegs = (nbuckets - 1) / hctl->ssize + 1;
670 nsegs = next_pow2_int(nsegs);
671
672 /*
673 * Make sure directory is big enough. If pre-allocated directory is too
674 * small, choke (caller screwed up).
675 */
676 if (nsegs > hctl->dsize)
677 {
678 if (!(hashp->dir))
679 hctl->dsize = nsegs;
680 else
681 return false;
682 }
683
684 /* Allocate a directory */
685 if (!(hashp->dir))
686 {
687 CurrentDynaHashCxt = hashp->hcxt;
688 hashp->dir = (HASHSEGMENT *)
689 hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
690 if (!hashp->dir)
691 return false;
692 }
693
694 /* Allocate initial segments */
695 for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
696 {
697 *segp = seg_alloc(hashp);
698 if (*segp == NULL)
699 return false;
700 }
701
702 /* Choose number of entries to allocate at a time */
703 hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
704
705 #if HASH_DEBUG
706 fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
707 "TABLE POINTER ", hashp,
708 "DIRECTORY SIZE ", hctl->dsize,
709 "SEGMENT SIZE ", hctl->ssize,
710 "SEGMENT SHIFT ", hctl->sshift,
711 "FILL FACTOR ", hctl->ffactor,
712 "MAX BUCKET ", hctl->max_bucket,
713 "HIGH MASK ", hctl->high_mask,
714 "LOW MASK ", hctl->low_mask,
715 "NSEGS ", hctl->nsegs);
716 #endif
717 return true;
718 }
719
720 /*
721 * Estimate the space needed for a hashtable containing the given number
722 * of entries of given size.
723 * NOTE: this is used to estimate the footprint of hashtables in shared
724 * memory; therefore it does not count HTAB which is in local memory.
725 * NB: assumes that all hash structure parameters have default values!
726 */
727 Size
728 hash_estimate_size(long num_entries, Size entrysize)
729 {
730 Size size;
731 long nBuckets,
732 nSegments,
733 nDirEntries,
734 nElementAllocs,
735 elementSize,
736 elementAllocCnt;
737
738 /* estimate number of buckets wanted */
739 nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
740 /* # of segments needed for nBuckets */
741 nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
742 /* directory entries */
743 nDirEntries = DEF_DIRSIZE;
744 while (nDirEntries < nSegments)
745 nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
746
747 /* fixed control info */
748 size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */
749 /* directory */
750 size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
751 /* segments */
752 size = add_size(size, mul_size(nSegments,
753 MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
754 /* elements --- allocated in groups of choose_nelem_alloc() entries */
755 elementAllocCnt = choose_nelem_alloc(entrysize);
756 nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
757 elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
758 size = add_size(size,
759 mul_size(nElementAllocs,
760 mul_size(elementAllocCnt, elementSize)));
761
762 return size;
763 }
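/*
 * Worked example (editorial, 64-bit pointers assumed): for num_entries =
 * 1000 and entrysize = 40, nBuckets = 1024, nSegments = 4, nDirEntries =
 * 256.  The directory costs 256 * 8 bytes, the segments 4 * 256 * 8 bytes,
 * and the elements 28 allocations of 36 entries of 56 bytes each (see
 * choose_nelem_alloc above) = 56448 bytes, plus the HASHHDR itself.
 */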
764
765 /*
766 * Select an appropriate directory size for a hashtable with the given
767 * maximum number of entries.
768 * This is only needed for hashtables in shared memory, whose directories
769 * cannot be expanded dynamically.
770 * NB: assumes that all hash structure parameters have default values!
771 *
772 * XXX this had better agree with the behavior of init_htab()...
773 */
774 long
775 hash_select_dirsize(long num_entries)
776 {
777 long nBuckets,
778 nSegments,
779 nDirEntries;
780
781 /* estimate number of buckets wanted */
782 nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
783 /* # of segments needed for nBuckets */
784 nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
785 /* directory entries */
786 nDirEntries = DEF_DIRSIZE;
787 while (nDirEntries < nSegments)
788 nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
789
790 return nDirEntries;
791 }
792
793 /*
794 * Compute the required initial memory allocation for a shared-memory
795 * hashtable with the given parameters. We need space for the HASHHDR
796 * and for the (non expansible) directory.
797 */
798 Size
799 hash_get_shared_size(HASHCTL *info, int flags)
800 {
801 Assert(flags & HASH_DIRSIZE);
802 Assert(info->dsize == info->max_dsize);
803 return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
804 }
805
806
807 /********************** DESTROY ROUTINES ************************/
808
809 void
810 hash_destroy(HTAB *hashp)
811 {
812 if (hashp != NULL)
813 {
814 /* allocation method must be one we know how to free, too */
815 Assert(hashp->alloc == DynaHashAlloc);
816 /* so this hashtable must have its own context */
817 Assert(hashp->hcxt != NULL);
818
819 hash_stats("destroy", hashp);
820
821 /*
822 * Free everything by destroying the hash table's memory context.
823 */
824 MemoryContextDelete(hashp->hcxt);
825 }
826 }
827
828 void
829 hash_stats(const char *where, HTAB *hashp)
830 {
831 #if HASH_STATISTICS
832 fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
833 where, hashp->hctl->accesses, hashp->hctl->collisions);
834
835 fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
836 hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
837 hashp->hctl->max_bucket, hashp->hctl->nsegs);
838 fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
839 where, hash_accesses, hash_collisions);
840 fprintf(stderr, "hash_stats: total expansions %ld\n",
841 hash_expansions);
842 #endif
843 }
844
845 /****************************** SEARCH ROUTINES *****************************/
846
847
848 /*
849 * get_hash_value -- exported routine to calculate a key's hash value
850 *
851 * We export this because for partitioned tables, callers need to compute
852 * the partition number (from the low-order bits of the hash value) before
853 * searching.
854 */
855 uint32
856 get_hash_value(HTAB *hashp, const void *keyPtr)
857 {
858 return hashp->hash(keyPtr, hashp->keysize);
859 }
860
861 /* Convert a hash value to a bucket number */
862 static inline uint32
863 calc_bucket(HASHHDR *hctl, uint32 hash_val)
864 {
865 uint32 bucket;
866
867 bucket = hash_val & hctl->high_mask;
868 if (bucket > hctl->max_bucket)
869 bucket = bucket & hctl->low_mask;
870
871 return bucket;
872 }
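/*
 * Editorial sketch: how a caller of a partitioned table might derive a
 * partition number from a hash value, per the comments at the head of this
 * file.  Because calc_bucket() only masks off high-order bits, an entry
 * keeps the same low-order bits --- and hence the same partition --- even
 * across bucket splits.  The function name and the DYNAHASH_EXAMPLE_CODE
 * guard are hypothetical.
 */
#ifdef DYNAHASH_EXAMPLE_CODE
static uint32
example_partition(uint32 hashvalue, uint32 num_partitions)
{
	/* num_partitions must be a power of 2, so masking acts as modulo */
	return hashvalue & (num_partitions - 1);
}
#endif							/* DYNAHASH_EXAMPLE_CODE */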
873
874 /*
875 * hash_search -- look up key in table and perform action
876 * hash_search_with_hash_value -- same, with key's hash value already computed
877 *
878 * action is one of:
879 * HASH_FIND: look up key in table
880 * HASH_ENTER: look up key in table, creating entry if not present
881 * HASH_ENTER_NULL: same, but return NULL if out of memory
882 * HASH_REMOVE: look up key in table, remove entry if present
883 *
884 * Return value is a pointer to the element found/entered/removed if any,
885 * or NULL if no match was found. (NB: in the case of the REMOVE action,
886 * the result is a dangling pointer that shouldn't be dereferenced!)
887 *
888 * HASH_ENTER will normally ereport a generic "out of memory" error if
889 * it is unable to create a new entry. The HASH_ENTER_NULL operation is
890 * the same except it will return NULL if out of memory. Note that
891 * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
892 * since palloc internally ereports on out-of-memory.
893 *
894 * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
895 * existing entry in the table, FALSE otherwise. This is needed in the
896 * HASH_ENTER case, but is redundant with the return value otherwise.
897 *
898 * For hash_search_with_hash_value, the hashvalue parameter must have been
899 * calculated with get_hash_value().
900 */
901 void *
902 hash_search(HTAB *hashp,
903 const void *keyPtr,
904 HASHACTION action,
905 bool *foundPtr)
906 {
907 return hash_search_with_hash_value(hashp,
908 keyPtr,
909 hashp->hash(keyPtr, hashp->keysize),
910 action,
911 foundPtr);
912 }
913
914 void *
915 hash_search_with_hash_value(HTAB *hashp,
916 const void *keyPtr,
917 uint32 hashvalue,
918 HASHACTION action,
919 bool *foundPtr)
920 {
921 HASHHDR *hctl = hashp->hctl;
922 int freelist_idx = FREELIST_IDX(hctl, hashvalue);
923 Size keysize;
924 uint32 bucket;
925 long segment_num;
926 long segment_ndx;
927 HASHSEGMENT segp;
928 HASHBUCKET currBucket;
929 HASHBUCKET *prevBucketPtr;
930 HashCompareFunc match;
931
932 #if HASH_STATISTICS
933 hash_accesses++;
934 hctl->accesses++;
935 #endif
936
937 /*
938 * If inserting, check if it is time to split a bucket.
939 *
940 * NOTE: failure to expand table is not a fatal error, it just means we
941 * have to run at higher fill factor than we wanted. However, if we're
942 * using the palloc allocator then it will throw error anyway on
943 * out-of-memory, so we must do this before modifying the table.
944 */
945 if (action == HASH_ENTER || action == HASH_ENTER_NULL)
946 {
947 /*
948 * Can't split if running in partitioned mode, nor if frozen, nor if
949 * table is the subject of any active hash_seq_search scans. Strange
950 * order of these tests is to try to check cheaper conditions first.
951 */
952 if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
953 hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
954 !has_seq_scans(hashp))
955 (void) expand_table(hashp);
956 }
957
958 /*
959 * Do the initial lookup
960 */
961 bucket = calc_bucket(hctl, hashvalue);
962
963 segment_num = bucket >> hashp->sshift;
964 segment_ndx = MOD(bucket, hashp->ssize);
965
966 segp = hashp->dir[segment_num];
967
968 if (segp == NULL)
969 hash_corrupted(hashp);
970
971 prevBucketPtr = &segp[segment_ndx];
972 currBucket = *prevBucketPtr;
973
974 /*
975 * Follow collision chain looking for matching key
976 */
977 match = hashp->match; /* save one fetch in inner loop */
978 keysize = hashp->keysize; /* ditto */
979
980 while (currBucket != NULL)
981 {
982 if (currBucket->hashvalue == hashvalue &&
983 match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
984 break;
985 prevBucketPtr = &(currBucket->link);
986 currBucket = *prevBucketPtr;
987 #if HASH_STATISTICS
988 hash_collisions++;
989 hctl->collisions++;
990 #endif
991 }
992
993 if (foundPtr)
994 *foundPtr = (bool) (currBucket != NULL);
995
996 /*
997 * OK, now what?
998 */
999 switch (action)
1000 {
1001 case HASH_FIND:
1002 if (currBucket != NULL)
1003 return (void *) ELEMENTKEY(currBucket);
1004 return NULL;
1005
1006 case HASH_REMOVE:
1007 if (currBucket != NULL)
1008 {
1009 /* if partitioned, must lock to touch nentries and freeList */
1010 if (IS_PARTITIONED(hctl))
1011 SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
1012
1013 /* decrement the appropriate nentries counter. */
1014 Assert(hctl->freeList[freelist_idx].nentries > 0);
1015 hctl->freeList[freelist_idx].nentries--;
1016
1017 /* remove record from hash bucket's chain. */
1018 *prevBucketPtr = currBucket->link;
1019
1020 /* add the record to the appropriate freelist. */
1021 currBucket->link = hctl->freeList[freelist_idx].freeList;
1022 hctl->freeList[freelist_idx].freeList = currBucket;
1023
1024 if (IS_PARTITIONED(hctl))
1025 SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1026
1027 /*
1028 * better hope the caller is synchronizing access to this
1029 * element, because someone else is going to reuse it the next
1030 * time something is added to the table
1031 */
1032 return (void *) ELEMENTKEY(currBucket);
1033 }
1034 return NULL;
1035
1036 case HASH_ENTER_NULL:
1037 /* ENTER_NULL does not work with palloc-based allocator */
1038 Assert(hashp->alloc != DynaHashAlloc);
1039 /* FALL THRU */
1040
1041 case HASH_ENTER:
1042 /* Return existing element if found, else create one */
1043 if (currBucket != NULL)
1044 return (void *) ELEMENTKEY(currBucket);
1045
1046 /* disallow inserts if frozen */
1047 if (hashp->frozen)
1048 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
1049 hashp->tabname);
1050
1051 currBucket = get_hash_entry(hashp, freelist_idx);
1052 if (currBucket == NULL)
1053 {
1054 /* out of memory */
1055 if (action == HASH_ENTER_NULL)
1056 return NULL;
1057 /* report a generic message */
1058 if (hashp->isshared)
1059 ereport(ERROR,
1060 (errcode(ERRCODE_OUT_OF_MEMORY),
1061 errmsg("out of shared memory")));
1062 else
1063 ereport(ERROR,
1064 (errcode(ERRCODE_OUT_OF_MEMORY),
1065 errmsg("out of memory")));
1066 }
1067
1068 /* link into hashbucket chain */
1069 *prevBucketPtr = currBucket;
1070 currBucket->link = NULL;
1071
1072 /* copy key into record */
1073 currBucket->hashvalue = hashvalue;
1074 hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
1075
1076 /*
1077 * Caller is expected to fill the data field on return. DO NOT
1078 * insert any code that could possibly throw error here, as doing
1079 * so would leave the table entry incomplete and hence corrupt the
1080 * caller's data structure.
1081 */
1082
1083 return (void *) ELEMENTKEY(currBucket);
1084 }
1085
1086 elog(ERROR, "unrecognized hash action code: %d", (int) action);
1087
1088 return NULL; /* keep compiler quiet */
1089 }
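/*
 * Editorial sketch of the canonical HASH_ENTER pattern described above:
 * look up or create an entry, then initialize the non-key fields only when
 * it was newly created.  ExampleEntry comes from the hypothetical
 * example_create() sketch earlier in this file.
 */
#ifdef DYNAHASH_EXAMPLE_CODE
static ExampleEntry *
example_enter(HTAB *tab, uint32 key)
{
	ExampleEntry *entry;
	bool		found;

	entry = (ExampleEntry *) hash_search(tab, &key, HASH_ENTER, &found);
	if (!found)
		entry->value = 0;		/* caller must fill in non-key fields */
	return entry;
}
#endif							/* DYNAHASH_EXAMPLE_CODE */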
1090
1091 /*
1092 * hash_update_hash_key -- change the hash key of an existing table entry
1093 *
1094 * This is equivalent to removing the entry, making a new entry, and copying
1095 * over its data, except that the entry never goes to the table's freelist.
1096 * Therefore this cannot suffer an out-of-memory failure, even if there are
1097 * other processes operating in other partitions of the hashtable.
1098 *
1099 * Returns TRUE if successful, FALSE if the requested new hash key is already
1100 * present. Throws error if the specified entry pointer isn't actually a
1101 * table member.
1102 *
1103 * NB: currently, there is no special case for old and new hash keys being
1104 * identical, which means we'll report FALSE for that situation. This is
1105 * preferable for existing uses.
1106 *
1107 * NB: for a partitioned hashtable, caller must hold lock on both relevant
1108 * partitions, if the new hash key would belong to a different partition.
1109 */
1110 bool
1111 hash_update_hash_key(HTAB *hashp,
1112 void *existingEntry,
1113 const void *newKeyPtr)
1114 {
1115 HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
1116 HASHHDR *hctl = hashp->hctl;
1117 uint32 newhashvalue;
1118 Size keysize;
1119 uint32 bucket;
1120 uint32 newbucket;
1121 long segment_num;
1122 long segment_ndx;
1123 HASHSEGMENT segp;
1124 HASHBUCKET currBucket;
1125 HASHBUCKET *prevBucketPtr;
1126 HASHBUCKET *oldPrevPtr;
1127 HashCompareFunc match;
1128
1129 #if HASH_STATISTICS
1130 hash_accesses++;
1131 hctl->accesses++;
1132 #endif
1133
1134 /* disallow updates if frozen */
1135 if (hashp->frozen)
1136 elog(ERROR, "cannot update in frozen hashtable \"%s\"",
1137 hashp->tabname);
1138
1139 /*
1140 * Lookup the existing element using its saved hash value. We need to do
1141 * this to be able to unlink it from its hash chain, but as a side benefit
1142 * we can verify the validity of the passed existingEntry pointer.
1143 */
1144 bucket = calc_bucket(hctl, existingElement->hashvalue);
1145
1146 segment_num = bucket >> hashp->sshift;
1147 segment_ndx = MOD(bucket, hashp->ssize);
1148
1149 segp = hashp->dir[segment_num];
1150
1151 if (segp == NULL)
1152 hash_corrupted(hashp);
1153
1154 prevBucketPtr = &segp[segment_ndx];
1155 currBucket = *prevBucketPtr;
1156
1157 while (currBucket != NULL)
1158 {
1159 if (currBucket == existingElement)
1160 break;
1161 prevBucketPtr = &(currBucket->link);
1162 currBucket = *prevBucketPtr;
1163 }
1164
1165 if (currBucket == NULL)
1166 elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
1167 hashp->tabname);
1168
1169 oldPrevPtr = prevBucketPtr;
1170
1171 /*
1172 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
1173 * chain we want to put the entry into.
1174 */
1175 newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
1176
1177 newbucket = calc_bucket(hctl, newhashvalue);
1178
1179 segment_num = newbucket >> hashp->sshift;
1180 segment_ndx = MOD(newbucket, hashp->ssize);
1181
1182 segp = hashp->dir[segment_num];
1183
1184 if (segp == NULL)
1185 hash_corrupted(hashp);
1186
1187 prevBucketPtr = &segp[segment_ndx];
1188 currBucket = *prevBucketPtr;
1189
1190 /*
1191 * Follow collision chain looking for matching key
1192 */
1193 match = hashp->match; /* save one fetch in inner loop */
1194 keysize = hashp->keysize; /* ditto */
1195
1196 while (currBucket != NULL)
1197 {
1198 if (currBucket->hashvalue == newhashvalue &&
1199 match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
1200 break;
1201 prevBucketPtr = &(currBucket->link);
1202 currBucket = *prevBucketPtr;
1203 #if HASH_STATISTICS
1204 hash_collisions++;
1205 hctl->collisions++;
1206 #endif
1207 }
1208
1209 if (currBucket != NULL)
1210 return false; /* collision with an existing entry */
1211
1212 currBucket = existingElement;
1213
1214 /*
1215 * If old and new hash values belong to the same bucket, we need not
1216 * change any chain links, and indeed should not since this simplistic
1217 * update will corrupt the list if currBucket is the last element. (We
1218 * cannot fall out earlier, however, since we need to scan the bucket to
1219 * check for duplicate keys.)
1220 */
1221 if (bucket != newbucket)
1222 {
1223 /* OK to remove record from old hash bucket's chain. */
1224 *oldPrevPtr = currBucket->link;
1225
1226 /* link into new hashbucket chain */
1227 *prevBucketPtr = currBucket;
1228 currBucket->link = NULL;
1229 }
1230
1231 /* copy new key into record */
1232 currBucket->hashvalue = newhashvalue;
1233 hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
1234
1235 /* rest of record is untouched */
1236
1237 return true;
1238 }
1239
1240 /*
1241 * Allocate a new hashtable entry if possible; return NULL if out of memory.
1242 * (Or, if the underlying space allocator throws error for out-of-memory,
1243 * we won't return at all.)
1244 */
1245 static HASHBUCKET
1246 get_hash_entry(HTAB *hashp, int freelist_idx)
1247 {
1248 HASHHDR *hctl = hashp->hctl;
1249 HASHBUCKET newElement;
1250
1251 for (;;)
1252 {
1253 /* if partitioned, must lock to touch nentries and freeList */
1254 if (IS_PARTITIONED(hctl))
1255 SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1256
1257 /* try to get an entry from the freelist */
1258 newElement = hctl->freeList[freelist_idx].freeList;
1259
1260 if (newElement != NULL)
1261 break;
1262
1263 if (IS_PARTITIONED(hctl))
1264 SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1265
1266 /*
1267 * No free elements in this freelist. In a partitioned table, there
1268 * might be entries in other freelists, but to reduce contention we
1269 * prefer to first try to get another chunk of elements from the main
1270 * shmem allocator. If that fails, though, we *MUST* root through all
1271 * the other freelists before giving up. There are multiple callers
1272 * that assume that they can allocate every element in the initially
1273 * requested table size, or that deleting an element guarantees they
1274 * can insert a new element, even if shared memory is entirely full.
1275 * Failing because the needed element is in a different freelist is
1276 * not acceptable.
1277 */
1278 if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
1279 {
1280 int borrow_from_idx;
1281
1282 if (!IS_PARTITIONED(hctl))
1283 return NULL; /* out of memory */
1284
1285 /* try to borrow element from another freelist */
1286 borrow_from_idx = freelist_idx;
1287 for (;;)
1288 {
1289 borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
1290 if (borrow_from_idx == freelist_idx)
1291 break; /* examined all freelists, fail */
1292
1293 SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
1294 newElement = hctl->freeList[borrow_from_idx].freeList;
1295
1296 if (newElement != NULL)
1297 {
1298 hctl->freeList[borrow_from_idx].freeList = newElement->link;
1299 SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1300
1301 /* careful: count the new element in its proper freelist */
1302 SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1303 hctl->freeList[freelist_idx].nentries++;
1304 SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1305
1306 return newElement;
1307 }
1308
1309 SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1310 }
1311
1312 /* no elements available to borrow either, so out of memory */
1313 return NULL;
1314 }
1315 }
1316
1317 /* remove entry from freelist, bump nentries */
1318 hctl->freeList[freelist_idx].freeList = newElement->link;
1319 hctl->freeList[freelist_idx].nentries++;
1320
1321 if (IS_PARTITIONED(hctl))
1322 SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1323
1324 return newElement;
1325 }
1326
1327 /*
1328 * hash_get_num_entries -- get the number of entries in a hashtable
1329 */
1330 long
1331 hash_get_num_entries(HTAB *hashp)
1332 {
1333 int i;
1334 long sum = hashp->hctl->freeList[0].nentries;
1335
1336 /*
1337 * We currently don't bother with acquiring the mutexes; it's only
1338 * sensible to call this function if you've got lock on all partitions of
1339 * the table.
1340 */
1341 if (IS_PARTITIONED(hashp->hctl))
1342 {
1343 for (i = 1; i < NUM_FREELISTS; i++)
1344 sum += hashp->hctl->freeList[i].nentries;
1345 }
1346
1347 return sum;
1348 }
1349
1350 /*
1351 * hash_seq_init/_search/_term
1352 * Sequentially search through hash table and return
1353 * all the elements one by one, return NULL when no more.
1354 *
1355 * hash_seq_term should be called if and only if the scan is abandoned before
1356 * completion; if hash_seq_search returns NULL then it has already done the
1357 * end-of-scan cleanup.
1358 *
1359 * NOTE: caller may delete the returned element before continuing the scan.
1360 * However, deleting any other element while the scan is in progress is
1361 * UNDEFINED (it might be the one that curEntry is pointing at!). Also,
1362 * if elements are added to the table while the scan is in progress, it is
1363 * unspecified whether they will be visited by the scan or not.
1364 *
1365 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1366 * worry about hash_seq_term cleanup, if the hashtable is first locked against
1367 * further insertions by calling hash_freeze.
1368 *
1369 * NOTE: to use this with a partitioned hashtable, caller had better hold
1370 * at least shared lock on all partitions of the table throughout the scan!
1371 * We can cope with insertions or deletions by our own backend, but *not*
1372 * with concurrent insertions or deletions by another.
1373 */
1374 void
1375 hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1376 {
1377 status->hashp = hashp;
1378 status->curBucket = 0;
1379 status->curEntry = NULL;
1380 if (!hashp->frozen)
1381 register_seq_scan(hashp);
1382 }
1383
1384 void *
1385 hash_seq_search(HASH_SEQ_STATUS *status)
1386 {
1387 HTAB *hashp;
1388 HASHHDR *hctl;
1389 uint32 max_bucket;
1390 long ssize;
1391 long segment_num;
1392 long segment_ndx;
1393 HASHSEGMENT segp;
1394 uint32 curBucket;
1395 HASHELEMENT *curElem;
1396
1397 if ((curElem = status->curEntry) != NULL)
1398 {
1399 /* Continuing scan of curBucket... */
1400 status->curEntry = curElem->link;
1401 if (status->curEntry == NULL) /* end of this bucket */
1402 ++status->curBucket;
1403 return (void *) ELEMENTKEY(curElem);
1404 }
1405
1406 /*
1407 * Search for next nonempty bucket starting at curBucket.
1408 */
1409 curBucket = status->curBucket;
1410 hashp = status->hashp;
1411 hctl = hashp->hctl;
1412 ssize = hashp->ssize;
1413 max_bucket = hctl->max_bucket;
1414
1415 if (curBucket > max_bucket)
1416 {
1417 hash_seq_term(status);
1418 return NULL; /* search is done */
1419 }
1420
1421 /*
1422 * first find the right segment in the table directory.
1423 */
1424 segment_num = curBucket >> hashp->sshift;
1425 segment_ndx = MOD(curBucket, ssize);
1426
1427 segp = hashp->dir[segment_num];
1428
1429 /*
1430 * Pick up the first item in this bucket's chain. If chain is not empty
1431 * we can begin searching it. Otherwise we have to advance to find the
1432 * next nonempty bucket. We try to optimize that case since searching a
1433 * near-empty hashtable has to iterate this loop a lot.
1434 */
1435 while ((curElem = segp[segment_ndx]) == NULL)
1436 {
1437 /* empty bucket, advance to next */
1438 if (++curBucket > max_bucket)
1439 {
1440 status->curBucket = curBucket;
1441 hash_seq_term(status);
1442 return NULL; /* search is done */
1443 }
1444 if (++segment_ndx >= ssize)
1445 {
1446 segment_num++;
1447 segment_ndx = 0;
1448 segp = hashp->dir[segment_num];
1449 }
1450 }
1451
1452 /* Begin scan of curBucket... */
1453 status->curEntry = curElem->link;
1454 if (status->curEntry == NULL) /* end of this bucket */
1455 ++curBucket;
1456 status->curBucket = curBucket;
1457 return (void *) ELEMENTKEY(curElem);
1458 }
1459
1460 void
1461 hash_seq_term(HASH_SEQ_STATUS *status)
1462 {
1463 if (!status->hashp->frozen)
1464 deregister_seq_scan(status->hashp);
1465 }
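/*
 * Editorial sketch of a complete sequential scan using the functions above.
 * Because the loop runs until hash_seq_search() returns NULL, the scan is
 * already terminated and no hash_seq_term() call is needed.  ExampleEntry
 * is from the hypothetical sketch earlier in this file.
 */
#ifdef DYNAHASH_EXAMPLE_CODE
static long
example_count_positive(HTAB *tab)
{
	HASH_SEQ_STATUS status;
	ExampleEntry *entry;
	long		count = 0;

	hash_seq_init(&status, tab);
	while ((entry = (ExampleEntry *) hash_seq_search(&status)) != NULL)
	{
		if (entry->value > 0)
			count++;
	}
	return count;
}
#endif							/* DYNAHASH_EXAMPLE_CODE */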
1466
1467 /*
1468 * hash_freeze
1469 * Freeze a hashtable against future insertions (deletions are
1470 * still allowed)
1471 *
1472 * The reason for doing this is that by preventing any more bucket splits,
1473 * we no longer need to worry about registering hash_seq_search scans,
1474 * and thus caller need not be careful about ensuring hash_seq_term gets
1475 * called at the right times.
1476 *
1477 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1478 * with active scans (since hash_seq_term would then do the wrong thing).
1479 */
1480 void
1481 hash_freeze(HTAB *hashp)
1482 {
1483 if (hashp->isshared)
1484 elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
1485 if (!hashp->frozen && has_seq_scans(hashp))
1486 elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1487 hashp->tabname);
1488 hashp->frozen = true;
1489 }
1490
1491
1492 /********************************* UTILITIES ************************/
1493
1494 /*
1495 * Expand the table by adding one more hash bucket.
1496 */
1497 static bool
1498 expand_table(HTAB *hashp)
1499 {
1500 HASHHDR *hctl = hashp->hctl;
1501 HASHSEGMENT old_seg,
1502 new_seg;
1503 long old_bucket,
1504 new_bucket;
1505 long new_segnum,
1506 new_segndx;
1507 long old_segnum,
1508 old_segndx;
1509 HASHBUCKET *oldlink,
1510 *newlink;
1511 HASHBUCKET currElement,
1512 nextElement;
1513
1514 Assert(!IS_PARTITIONED(hctl));
1515
1516 #ifdef HASH_STATISTICS
1517 hash_expansions++;
1518 #endif
1519
1520 new_bucket = hctl->max_bucket + 1;
1521 new_segnum = new_bucket >> hashp->sshift;
1522 new_segndx = MOD(new_bucket, hashp->ssize);
1523
1524 if (new_segnum >= hctl->nsegs)
1525 {
1526 /* Allocate new segment if necessary -- could fail if dir full */
1527 if (new_segnum >= hctl->dsize)
1528 if (!dir_realloc(hashp))
1529 return false;
1530 if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
1531 return false;
1532 hctl->nsegs++;
1533 }
1534
1535 /* OK, we created a new bucket */
1536 hctl->max_bucket++;
1537
1538 /*
1539 * *Before* changing masks, find old bucket corresponding to same hash
1540 * values; values in that bucket may need to be relocated to new bucket.
1541 * Note that new_bucket is certainly larger than low_mask at this point,
1542 * so we can skip the first step of the regular hash mask calc.
1543 */
1544 old_bucket = (new_bucket & hctl->low_mask);
1545
1546 /*
1547 * If we crossed a power of 2, readjust masks.
1548 */
1549 if ((uint32) new_bucket > hctl->high_mask)
1550 {
1551 hctl->low_mask = hctl->high_mask;
1552 hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
1553 }
1554
1555 /*
1556 * Relocate records to the new bucket. NOTE: because of the way the hash
1557 * masking is done in calc_bucket, only one old bucket can need to be
1558 * split at this point. With a different way of reducing the hash value,
1559 * that might not be true!
1560 */
1561 old_segnum = old_bucket >> hashp->sshift;
1562 old_segndx = MOD(old_bucket, hashp->ssize);
1563
1564 old_seg = hashp->dir[old_segnum];
1565 new_seg = hashp->dir[new_segnum];
1566
1567 oldlink = &old_seg[old_segndx];
1568 newlink = &new_seg[new_segndx];
1569
1570 for (currElement = *oldlink;
1571 currElement != NULL;
1572 currElement = nextElement)
1573 {
1574 nextElement = currElement->link;
1575 if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
1576 {
1577 *oldlink = currElement;
1578 oldlink = &currElement->link;
1579 }
1580 else
1581 {
1582 *newlink = currElement;
1583 newlink = &currElement->link;
1584 }
1585 }
1586 /* don't forget to terminate the rebuilt hash chains... */
1587 *oldlink = NULL;
1588 *newlink = NULL;
1589
1590 return true;
1591 }
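/*
 * Worked example (editorial): suppose max_bucket = 5, low_mask = 3 and
 * high_mask = 7.  The next split creates new_bucket = 6; its entries
 * currently live in old_bucket = 6 & 3 = 2, so each element of bucket 2
 * whose hash value yields calc_bucket() == 6 is relocated and the rest
 * stay put.  Only when new_bucket reaches 8 (> high_mask) do the masks
 * advance, to low_mask = 7 and high_mask = 15.
 */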
1592
1593
1594 static bool
1595 dir_realloc(HTAB *hashp)
1596 {
1597 HASHSEGMENT *p;
1598 HASHSEGMENT *old_p;
1599 long new_dsize;
1600 long old_dirsize;
1601 long new_dirsize;
1602
1603 if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
1604 return false;
1605
1606 /* Reallocate directory */
1607 new_dsize = hashp->hctl->dsize << 1;
1608 old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1609 new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1610
1611 old_p = hashp->dir;
1612 CurrentDynaHashCxt = hashp->hcxt;
1613 p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
1614
1615 if (p != NULL)
1616 {
1617 memcpy(p, old_p, old_dirsize);
1618 MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
1619 hashp->dir = p;
1620 hashp->hctl->dsize = new_dsize;
1621
1622 /* XXX assume the allocator is palloc, so we know how to free */
1623 Assert(hashp->alloc == DynaHashAlloc);
1624 pfree(old_p);
1625
1626 return true;
1627 }
1628
1629 return false;
1630 }
1631
1632
1633 static HASHSEGMENT
1634 seg_alloc(HTAB *hashp)
1635 {
1636 HASHSEGMENT segp;
1637
1638 CurrentDynaHashCxt = hashp->hcxt;
1639 segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
1640
1641 if (!segp)
1642 return NULL;
1643
1644 MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
1645
1646 return segp;
1647 }
1648
1649 /*
1650 * allocate some new elements and link them into the indicated free list
1651 */
1652 static bool
1653 element_alloc(HTAB *hashp, int nelem, int freelist_idx)
1654 {
1655 HASHHDR *hctl = hashp->hctl;
1656 Size elementSize;
1657 HASHELEMENT *firstElement;
1658 HASHELEMENT *tmpElement;
1659 HASHELEMENT *prevElement;
1660 int i;
1661
1662 if (hashp->isfixed)
1663 return false;
1664
1665 /* Each element has a HASHELEMENT header plus user data. */
1666 elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
1667
1668 CurrentDynaHashCxt = hashp->hcxt;
1669 firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
1670
1671 if (!firstElement)
1672 return false;
1673
1674 /* prepare to link all the new entries into the freelist */
1675 prevElement = NULL;
1676 tmpElement = firstElement;
1677 for (i = 0; i < nelem; i++)
1678 {
1679 tmpElement->link = prevElement;
1680 prevElement = tmpElement;
1681 tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
1682 }
1683
1684 /* if partitioned, must lock to touch freeList */
1685 if (IS_PARTITIONED(hctl))
1686 SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1687
1688 /* freelist could be nonempty if two backends did this concurrently */
1689 firstElement->link = hctl->freeList[freelist_idx].freeList;
1690 hctl->freeList[freelist_idx].freeList = prevElement;
1691
1692 if (IS_PARTITIONED(hctl))
1693 SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1694
1695 return true;
1696 }
1697
1698 /* complain when we have detected a corrupted hashtable */
1699 static void
1700 hash_corrupted(HTAB *hashp)
1701 {
1702 /*
1703 * If the corruption is in a shared hashtable, we'd better force a
1704 * systemwide restart. Otherwise, just shut down this one backend.
1705 */
1706 if (hashp->isshared)
1707 elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1708 else
1709 elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1710 }
1711
1712 /* calculate ceil(log base 2) of num */
1713 int
1714 my_log2(long num)
1715 {
1716 int i;
1717 long limit;
1718
1719 /* guard against too-large input, which would put us into infinite loop */
1720 if (num > LONG_MAX / 2)
1721 num = LONG_MAX / 2;
1722
1723 for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
1724 ;
1725 return i;
1726 }
1727
1728 /* calculate first power of 2 >= num, bounded to what will fit in a long */
1729 static long
1730 next_pow2_long(long num)
1731 {
1732 /* my_log2's internal range check is sufficient */
1733 return 1L << my_log2(num);
1734 }
1735
1736 /* calculate first power of 2 >= num, bounded to what will fit in an int */
1737 static int
1738 next_pow2_int(long num)
1739 {
1740 if (num > INT_MAX / 2)
1741 num = INT_MAX / 2;
1742 return 1 << my_log2(num);
1743 }
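/*
 * Worked example (editorial): my_log2(1000) == 10, so next_pow2_long(1000)
 * and next_pow2_int(1000) both return 1024; exact powers of 2 map to
 * themselves.
 */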
1744
1745
1746 /************************* SEQ SCAN TRACKING ************************/
1747
1748 /*
1749 * We track active hash_seq_search scans here. The need for this mechanism
1750 * comes from the fact that a scan will get confused if a bucket split occurs
1751 * while it's in progress: it might visit entries twice, or even miss some
1752 * entirely (if it's partway through the same bucket that splits). Hence
1753 * we want to inhibit bucket splits if there are any active scans on the
1754 * table being inserted into. This is a fairly rare case in current usage,
1755 * so just postponing the split until the next insertion seems sufficient.
1756 *
1757 * Given present usages of the function, only a few scans are likely to be
1758 * open concurrently; so a finite-size stack of open scans seems sufficient,
1759 * and we don't worry that linear search is too slow. Note that we do
1760 * allow multiple scans of the same hashtable to be open concurrently.
1761 *
1762 * This mechanism can support concurrent scan and insertion in a shared
1763 * hashtable if it's the same backend doing both. It would fail otherwise,
1764 * but locking reasons seem to preclude any such scenario anyway, so we don't
1765 * worry.
1766 *
1767 * This arrangement is reasonably robust if a transient hashtable is deleted
1768 * without notifying us. The absolute worst case is we might inhibit splits
1769 * in another table created later at exactly the same address. We will give
1770 * a warning at transaction end for reference leaks, so any bugs leading to
1771 * lack of notification should be easy to catch.
1772 */
1773
1774 #define MAX_SEQ_SCANS 100
1775
1776 static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
1777 static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */
1778 static int num_seq_scans = 0;
1779
1780
1781 /* Register a table as having an active hash_seq_search scan */
1782 static void
1783 register_seq_scan(HTAB *hashp)
1784 {
1785 if (num_seq_scans >= MAX_SEQ_SCANS)
1786 elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1787 hashp->tabname);
1788 seq_scan_tables[num_seq_scans] = hashp;
1789 seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
1790 num_seq_scans++;
1791 }
1792
1793 /* Deregister an active scan */
1794 static void
1795 deregister_seq_scan(HTAB *hashp)
1796 {
1797 int i;
1798
1799 /* Search backward since it's most likely at the stack top */
1800 for (i = num_seq_scans - 1; i >= 0; i--)
1801 {
1802 if (seq_scan_tables[i] == hashp)
1803 {
1804 seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1805 seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1806 num_seq_scans--;
1807 return;
1808 }
1809 }
1810 elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1811 hashp->tabname);
1812 }
1813
1814 /* Check if a table has any active scan */
1815 static bool
1816 has_seq_scans(HTAB *hashp)
1817 {
1818 int i;
1819
1820 for (i = 0; i < num_seq_scans; i++)
1821 {
1822 if (seq_scan_tables[i] == hashp)
1823 return true;
1824 }
1825 return false;
1826 }
1827
1828 /* Clean up any open scans at end of transaction */
1829 void
1830 AtEOXact_HashTables(bool isCommit)
1831 {
1832 /*
1833 * During abort cleanup, open scans are expected; just silently clean 'em
1834 * out. An open scan at commit means someone forgot a hash_seq_term()
1835 * call, so complain.
1836 *
1837 * Note: it's tempting to try to print the tabname here, but refrain for
1838 * fear of touching deallocated memory. This isn't a user-facing message
1839 * anyway, so it needn't be pretty.
1840 */
1841 if (isCommit)
1842 {
1843 int i;
1844
1845 for (i = 0; i < num_seq_scans; i++)
1846 {
1847 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1848 seq_scan_tables[i]);
1849 }
1850 }
1851 num_seq_scans = 0;
1852 }
1853
1854 /* Clean up any open scans at end of subtransaction */
1855 void
1856 AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1857 {
1858 int i;
1859
1860 /*
1861 * Search backward to make cleanup easy. Note we must check all entries,
1862 * not only those at the end of the array, because deletion technique
1863 * doesn't keep them in order.
1864 */
1865 for (i = num_seq_scans - 1; i >= 0; i--)
1866 {
1867 if (seq_scan_level[i] >= nestDepth)
1868 {
1869 if (isCommit)
1870 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1871 seq_scan_tables[i]);
1872 seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1873 seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1874 num_seq_scans--;
1875 }
1876 }
1877 }
1878