1 /* $NetBSD: subr_thmap.c,v 1.13 2023/04/11 13:06:21 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2018 Mindaugas Rasiukevicius <rmind at noxt eu>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * Upstream: https://github.com/rmind/thmap/
29 */
30
31 /*
32 * Concurrent trie-hash map.
33 *
34 * The data structure is conceptually a radix trie on hashed keys.
35 * Keys are hashed using a 32-bit function. The root level is a special
36 * case: it is managed using the compare-and-swap (CAS) atomic operation
37 * and has a fanout of 64. The subsequent levels are constructed using
38 * intermediate nodes with a fanout of 16 (using 4 bits). As more levels
39 * are created, more blocks of the 32-bit hash value might be generated
40 * by incrementing the seed parameter of the hash function.
41 *
42 * Concurrency
43 *
44 * - READERS: Descending is simply walking through the slot values of
45 * the intermediate nodes. It is lock-free as there is no intermediate
46 * state: the slot is either empty or has a pointer to the child node.
47 * The main assumptions here are the following:
48 *
49 * i) modifications must preserve consistency with the respect to the
50 * readers i.e. the readers can only see the valid node values;
51 *
52 * ii) any invalid view must "fail" the reads, e.g. by making them
53 * re-try from the root; this is a case for deletions and is achieved
54 * using the NODE_DELETED flag.
55 *
56 * iii) the node destruction must be synchronized with the readers,
57 * e.g. by using the Epoch-based reclamation or other techniques.
58 *
59 * - WRITERS AND LOCKING: Each intermediate node has a spin-lock (which
60 * is implemented using the NODE_LOCKED bit) -- it provides mutual
61 * exclusion amongst concurrent writers. The lock order for the nodes
62 * is "bottom-up" i.e. they are locked as we ascend the trie. A key
63 * constraint here is that parent pointer never changes.
64 *
65 * - DELETES: In addition to writer's locking, the deletion keeps the
66 * intermediate nodes in a valid state and sets the NODE_DELETED flag,
67 * to indicate that the readers must re-start the walk from the root.
68 * As the levels are collapsed, NODE_DELETED gets propagated up-tree.
69 * The leaf nodes just stay as-is until they are reclaimed.
70 *
71 * - ROOT LEVEL: The root level is a special case, as it is implemented
72 * as an array (rather than intermediate node). The root-level slot can
73 * only be set using CAS and it can only be set to a valid intermediate
74 * node. The root-level slot can only be cleared when the node it points
75 * at becomes empty, is locked and marked as NODE_DELETED (this causes
76 * the insert/delete operations to re-try until the slot is set to NULL).
77 *
78 * References:
79 *
80 * W. Litwin, 1981, Trie Hashing.
81 * Proceedings of the 1981 ACM SIGMOD, p. 19-29
82 * https://dl.acm.org/citation.cfm?id=582322
83 *
84 * P. L. Lehman and S. B. Yao.
85 * Efficient locking for concurrent operations on B-trees.
86 * ACM TODS, 6(4):650-670, 1981
87 * https://www.csd.uoc.gr/~hy460/pdf/p650-lehman.pdf
88 */
89
90 #ifdef _KERNEL
91 #include <sys/cdefs.h>
92 #include <sys/param.h>
93 #include <sys/types.h>
94 #include <sys/thmap.h>
95 #include <sys/kmem.h>
96 #include <sys/lock.h>
97 #include <sys/atomic.h>
98 #include <sys/hash.h>
99 #include <sys/cprng.h>
100 #define THMAP_RCSID(a) __KERNEL_RCSID(0, a)
101 #else
102 #include <stdio.h>
103 #include <stdlib.h>
104 #include <stdbool.h>
105 #include <stddef.h>
106 #include <inttypes.h>
107 #include <string.h>
108 #include <limits.h>
109 #define THMAP_RCSID(a) __RCSID(a)
110
111 #include "thmap.h"
112 #include "utils.h"
113 #endif
114
115 THMAP_RCSID("$NetBSD: subr_thmap.c,v 1.13 2023/04/11 13:06:21 riastradh Exp $");
116
117 #include <crypto/blake2/blake2s.h>
118
119 /*
120 * NetBSD kernel wrappers
121 */
122 #ifdef _KERNEL
123 #define ASSERT KASSERT
124 #define atomic_thread_fence(x) membar_release() /* only used for release order */
125 #define atomic_compare_exchange_weak_explicit_32(p, e, n, m1, m2) \
126 (atomic_cas_32((p), *(e), (n)) == *(e))
127 #define atomic_compare_exchange_weak_explicit_ptr(p, e, n, m1, m2) \
128 (atomic_cas_ptr((p), *(void **)(e), (void *)(n)) == *(void **)(e))
129 #define atomic_exchange_explicit(o, n, m1) atomic_swap_ptr((o), (n))
130 #define murmurhash3 murmurhash2
131 #endif
132
133 /*
134 * The root level fanout is 64 (indexed by the last 6 bits of the hash
135 * value XORed with the length). Each subsequent level, represented by
136 * intermediate nodes, has a fanout of 16 (using 4 bits).
137 *
138 * The hash function produces 32-bit values.
139 */
140
141 #define HASHVAL_SEEDLEN (16)
142 #define HASHVAL_BITS (32)
143 #define HASHVAL_MOD (HASHVAL_BITS - 1)
144 #define HASHVAL_SHIFT (5)
145
146 #define ROOT_BITS (6)
147 #define ROOT_SIZE (1 << ROOT_BITS)
148 #define ROOT_MASK (ROOT_SIZE - 1)
149 #define ROOT_MSBITS (HASHVAL_BITS - ROOT_BITS)
150
151 #define LEVEL_BITS (4)
152 #define LEVEL_SIZE (1 << LEVEL_BITS)
153 #define LEVEL_MASK (LEVEL_SIZE - 1)
154
155 /*
156 * Instead of raw pointers, we use offsets from the base address.
157 * This accommodates the use of this data structure in shared memory,
158 * where mappings can be in different address spaces.
159 *
160 * The pointers must be aligned, since pointer tagging is used to
161 * differentiate the intermediate nodes from leaves. We reserve the
162 * least significant bit.
163 */
164 typedef uintptr_t thmap_ptr_t;
165 typedef uintptr_t atomic_thmap_ptr_t; // C11 _Atomic
166
167 #define THMAP_NULL ((thmap_ptr_t)0)
168
169 #define THMAP_LEAF_BIT (0x1)
170
171 #define THMAP_ALIGNED_P(p) (((uintptr_t)(p) & 3) == 0)
172 #define THMAP_ALIGN(p) ((uintptr_t)(p) & ~(uintptr_t)3)
173 #define THMAP_INODE_P(p) (((uintptr_t)(p) & THMAP_LEAF_BIT) == 0)
174
175 #define THMAP_GETPTR(th, p) ((void *)((th)->baseptr + (uintptr_t)(p)))
176 #define THMAP_GETOFF(th, p) ((thmap_ptr_t)((uintptr_t)(p) - (th)->baseptr))
177 #define THMAP_NODE(th, p) THMAP_GETPTR(th, THMAP_ALIGN(p))
178
179 /*
180 * State field.
181 */
182
183 #define NODE_LOCKED (1U << 31) // lock (writers)
184 #define NODE_DELETED (1U << 30) // node deleted
185 #define NODE_COUNT(s) ((s) & 0x3fffffff) // slot count mask
186
187 /*
188 * There are two types of nodes:
189 * - Intermediate nodes -- arrays pointing to another level or a leaf;
190 * - Leaves, which store a key-value pair.
191 */
192
193 typedef struct {
194 uint32_t state; // C11 _Atomic
195 thmap_ptr_t parent;
196 atomic_thmap_ptr_t slots[LEVEL_SIZE];
197 } thmap_inode_t;
198
199 #define THMAP_INODE_LEN sizeof(thmap_inode_t)
200
201 typedef struct {
202 thmap_ptr_t key;
203 size_t len;
204 void * val;
205 } thmap_leaf_t;
206
207 typedef struct {
208 const uint8_t * seed; // secret seed
209 unsigned rslot; // root-level slot index
210 unsigned level; // current level in the tree
211 unsigned hashidx; // current hash index (block of bits)
212 uint32_t hashval; // current hash value
213 } thmap_query_t;
214
215 typedef struct {
216 uintptr_t addr;
217 size_t len;
218 void * next;
219 } thmap_gc_t;
220
221 #define THMAP_ROOT_LEN (sizeof(thmap_ptr_t) * ROOT_SIZE)
222
223 struct thmap {
224 uintptr_t baseptr;
225 atomic_thmap_ptr_t * root;
226 unsigned flags;
227 const thmap_ops_t * ops;
228 thmap_gc_t * gc_list; // C11 _Atomic
229 uint8_t seed[HASHVAL_SEEDLEN];
230 };
231
232 static void stage_mem_gc(thmap_t *, uintptr_t, size_t);
233
234 /*
235 * A few low-level helper routines.
236 */
237
238 static uintptr_t
alloc_wrapper(size_t len)239 alloc_wrapper(size_t len)
240 {
241 return (uintptr_t)kmem_intr_alloc(len, KM_NOSLEEP);
242 }
243
244 static void
free_wrapper(uintptr_t addr,size_t len)245 free_wrapper(uintptr_t addr, size_t len)
246 {
247 kmem_intr_free((void *)addr, len);
248 }
249
250 static const thmap_ops_t thmap_default_ops = {
251 .alloc = alloc_wrapper,
252 .free = free_wrapper
253 };
254
255 /*
256 * NODE LOCKING.
257 */
258
259 static inline bool __diagused
node_locked_p(thmap_inode_t * node)260 node_locked_p(thmap_inode_t *node)
261 {
262 return (atomic_load_relaxed(&node->state) & NODE_LOCKED) != 0;
263 }
264
265 static void
lock_node(thmap_inode_t * node)266 lock_node(thmap_inode_t *node)
267 {
268 unsigned bcount = SPINLOCK_BACKOFF_MIN;
269 uint32_t s;
270 again:
271 s = atomic_load_relaxed(&node->state);
272 if (s & NODE_LOCKED) {
273 SPINLOCK_BACKOFF(bcount);
274 goto again;
275 }
276 /* Acquire from prior release in unlock_node.() */
277 if (!atomic_compare_exchange_weak_explicit_32(&node->state,
278 &s, s | NODE_LOCKED, memory_order_acquire, memory_order_relaxed)) {
279 bcount = SPINLOCK_BACKOFF_MIN;
280 goto again;
281 }
282 }
283
284 static void
unlock_node(thmap_inode_t * node)285 unlock_node(thmap_inode_t *node)
286 {
287 uint32_t s = atomic_load_relaxed(&node->state) & ~NODE_LOCKED;
288
289 ASSERT(node_locked_p(node));
290 /* Release to subsequent acquire in lock_node(). */
291 atomic_store_release(&node->state, s);
292 }
293
294 /*
295 * HASH VALUE AND KEY OPERATIONS.
296 */
297
298 static inline uint32_t
hash(const uint8_t seed[static HASHVAL_SEEDLEN],const void * key,size_t len,uint32_t level)299 hash(const uint8_t seed[static HASHVAL_SEEDLEN], const void *key, size_t len,
300 uint32_t level)
301 {
302 struct blake2s B;
303 uint32_t h;
304
305 if (level == 0)
306 return murmurhash3(key, len, 0);
307
308 /*
309 * Byte order is not significant here because this is
310 * intentionally secret and independent for each thmap.
311 *
312 * XXX We get 32 bytes of output at a time; we could march
313 * through them sequentially rather than throwing away 28 bytes
314 * and recomputing BLAKE2 each time. But the number of
315 * iterations ought to be geometric in the collision
316 * probability at each level which should be very small anyway.
317 */
318 blake2s_init(&B, sizeof h, seed, HASHVAL_SEEDLEN);
319 blake2s_update(&B, &level, sizeof level);
320 blake2s_update(&B, key, len);
321 blake2s_final(&B, &h);
322
323 return h;
324 }
325
326 static inline void
hashval_init(thmap_query_t * query,const uint8_t seed[static HASHVAL_SEEDLEN],const void * restrict key,size_t len)327 hashval_init(thmap_query_t *query, const uint8_t seed[static HASHVAL_SEEDLEN],
328 const void * restrict key, size_t len)
329 {
330 const uint32_t hashval = hash(seed, key, len, 0);
331
332 query->seed = seed;
333 query->rslot = ((hashval >> ROOT_MSBITS) ^ len) & ROOT_MASK;
334 query->level = 0;
335 query->hashval = hashval;
336 query->hashidx = 0;
337 }
338
339 /*
340 * hashval_getslot: given the key, compute the hash (if not already cached)
341 * and return the offset for the current level.
342 */
343 static unsigned
hashval_getslot(thmap_query_t * query,const void * restrict key,size_t len)344 hashval_getslot(thmap_query_t *query, const void * restrict key, size_t len)
345 {
346 const unsigned offset = query->level * LEVEL_BITS;
347 const unsigned shift = offset & HASHVAL_MOD;
348 const unsigned i = offset >> HASHVAL_SHIFT;
349
350 if (query->hashidx != i) {
351 /* Generate a hash value for a required range. */
352 query->hashval = hash(query->seed, key, len, i);
353 query->hashidx = i;
354 }
355 return (query->hashval >> shift) & LEVEL_MASK;
356 }
357
358 static unsigned
hashval_getleafslot(const thmap_t * thmap,const thmap_leaf_t * leaf,unsigned level)359 hashval_getleafslot(const thmap_t *thmap,
360 const thmap_leaf_t *leaf, unsigned level)
361 {
362 const void *key = THMAP_GETPTR(thmap, leaf->key);
363 const unsigned offset = level * LEVEL_BITS;
364 const unsigned shift = offset & HASHVAL_MOD;
365 const unsigned i = offset >> HASHVAL_SHIFT;
366
367 return (hash(thmap->seed, key, leaf->len, i) >> shift) & LEVEL_MASK;
368 }
369
370 static inline unsigned
hashval_getl0slot(const thmap_t * thmap,const thmap_query_t * query,const thmap_leaf_t * leaf)371 hashval_getl0slot(const thmap_t *thmap, const thmap_query_t *query,
372 const thmap_leaf_t *leaf)
373 {
374 if (__predict_true(query->hashidx == 0)) {
375 return query->hashval & LEVEL_MASK;
376 }
377 return hashval_getleafslot(thmap, leaf, 0);
378 }
379
380 static bool
key_cmp_p(const thmap_t * thmap,const thmap_leaf_t * leaf,const void * restrict key,size_t len)381 key_cmp_p(const thmap_t *thmap, const thmap_leaf_t *leaf,
382 const void * restrict key, size_t len)
383 {
384 const void *leafkey = THMAP_GETPTR(thmap, leaf->key);
385 return len == leaf->len && memcmp(key, leafkey, len) == 0;
386 }
387
388 /*
389 * INTER-NODE OPERATIONS.
390 */
391
392 static thmap_inode_t *
node_create(thmap_t * thmap,thmap_inode_t * parent)393 node_create(thmap_t *thmap, thmap_inode_t *parent)
394 {
395 thmap_inode_t *node;
396 uintptr_t p;
397
398 p = thmap->ops->alloc(THMAP_INODE_LEN);
399 if (!p) {
400 return NULL;
401 }
402 node = THMAP_GETPTR(thmap, p);
403 ASSERT(THMAP_ALIGNED_P(node));
404
405 memset(node, 0, THMAP_INODE_LEN);
406 if (parent) {
407 /* Not yet published, no need for ordering. */
408 atomic_store_relaxed(&node->state, NODE_LOCKED);
409 node->parent = THMAP_GETOFF(thmap, parent);
410 }
411 return node;
412 }
413
414 static void
node_insert(thmap_inode_t * node,unsigned slot,thmap_ptr_t child)415 node_insert(thmap_inode_t *node, unsigned slot, thmap_ptr_t child)
416 {
417 ASSERT(node_locked_p(node) || node->parent == THMAP_NULL);
418 ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0);
419 ASSERT(atomic_load_relaxed(&node->slots[slot]) == THMAP_NULL);
420
421 ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) < LEVEL_SIZE);
422
423 /*
424 * If node is public already, caller is responsible for issuing
425 * release fence; if node is not public, no ordering is needed.
426 * Hence relaxed ordering.
427 */
428 atomic_store_relaxed(&node->slots[slot], child);
429 atomic_store_relaxed(&node->state,
430 atomic_load_relaxed(&node->state) + 1);
431 }
432
433 static void
node_remove(thmap_inode_t * node,unsigned slot)434 node_remove(thmap_inode_t *node, unsigned slot)
435 {
436 ASSERT(node_locked_p(node));
437 ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0);
438 ASSERT(atomic_load_relaxed(&node->slots[slot]) != THMAP_NULL);
439
440 ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) > 0);
441 ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) <= LEVEL_SIZE);
442
443 /* Element will be GC-ed later; no need for ordering here. */
444 atomic_store_relaxed(&node->slots[slot], THMAP_NULL);
445 atomic_store_relaxed(&node->state,
446 atomic_load_relaxed(&node->state) - 1);
447 }
448
449 /*
450 * LEAF OPERATIONS.
451 */
452
453 static thmap_leaf_t *
leaf_create(const thmap_t * thmap,const void * key,size_t len,void * val)454 leaf_create(const thmap_t *thmap, const void *key, size_t len, void *val)
455 {
456 thmap_leaf_t *leaf;
457 uintptr_t leaf_off, key_off;
458
459 leaf_off = thmap->ops->alloc(sizeof(thmap_leaf_t));
460 if (!leaf_off) {
461 return NULL;
462 }
463 leaf = THMAP_GETPTR(thmap, leaf_off);
464 ASSERT(THMAP_ALIGNED_P(leaf));
465
466 if ((thmap->flags & THMAP_NOCOPY) == 0) {
467 /*
468 * Copy the key.
469 */
470 key_off = thmap->ops->alloc(len);
471 if (!key_off) {
472 thmap->ops->free(leaf_off, sizeof(thmap_leaf_t));
473 return NULL;
474 }
475 memcpy(THMAP_GETPTR(thmap, key_off), key, len);
476 leaf->key = key_off;
477 } else {
478 /* Otherwise, we use a reference. */
479 leaf->key = (uintptr_t)key;
480 }
481 leaf->len = len;
482 leaf->val = val;
483 return leaf;
484 }
485
486 static void
leaf_free(const thmap_t * thmap,thmap_leaf_t * leaf)487 leaf_free(const thmap_t *thmap, thmap_leaf_t *leaf)
488 {
489 if ((thmap->flags & THMAP_NOCOPY) == 0) {
490 thmap->ops->free(leaf->key, leaf->len);
491 }
492 thmap->ops->free(THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t));
493 }
494
495 static thmap_leaf_t *
get_leaf(const thmap_t * thmap,thmap_inode_t * parent,unsigned slot)496 get_leaf(const thmap_t *thmap, thmap_inode_t *parent, unsigned slot)
497 {
498 thmap_ptr_t node;
499
500 /* Consume from prior release in thmap_put(). */
501 node = atomic_load_consume(&parent->slots[slot]);
502 if (THMAP_INODE_P(node)) {
503 return NULL;
504 }
505 return THMAP_NODE(thmap, node);
506 }
507
508 /*
509 * ROOT OPERATIONS.
510 */
511
512 /*
513 * root_try_put: Try to set a root pointer at query->rslot.
514 *
515 * => Implies release operation on success.
516 * => Implies no ordering on failure.
517 */
518 static inline int
root_try_put(thmap_t * thmap,const thmap_query_t * query,thmap_leaf_t * leaf)519 root_try_put(thmap_t *thmap, const thmap_query_t *query, thmap_leaf_t *leaf)
520 {
521 thmap_ptr_t expected;
522 const unsigned i = query->rslot;
523 thmap_inode_t *node;
524 thmap_ptr_t nptr;
525 unsigned slot;
526
527 /*
528 * Must pre-check first. No ordering required because we will
529 * check again before taking any actions, and start over if
530 * this changes from null.
531 */
532 if (atomic_load_relaxed(&thmap->root[i])) {
533 return EEXIST;
534 }
535
536 /*
537 * Create an intermediate node. Since there is no parent set,
538 * it will be created unlocked and the CAS operation will
539 * release it to readers.
540 */
541 node = node_create(thmap, NULL);
542 if (__predict_false(node == NULL)) {
543 return ENOMEM;
544 }
545 slot = hashval_getl0slot(thmap, query, leaf);
546 node_insert(node, slot, THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT);
547 nptr = THMAP_GETOFF(thmap, node);
548 again:
549 if (atomic_load_relaxed(&thmap->root[i])) {
550 thmap->ops->free(nptr, THMAP_INODE_LEN);
551 return EEXIST;
552 }
553 /* Release to subsequent consume in find_edge_node(). */
554 expected = THMAP_NULL;
555 if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->root[i], &expected,
556 nptr, memory_order_release, memory_order_relaxed)) {
557 goto again;
558 }
559 return 0;
560 }
561
562 /*
563 * find_edge_node: given the hash, traverse the tree to find the edge node.
564 *
565 * => Returns an aligned (clean) pointer to the parent node.
566 * => Returns the slot number and sets current level.
567 */
568 static thmap_inode_t *
find_edge_node(const thmap_t * thmap,thmap_query_t * query,const void * restrict key,size_t len,unsigned * slot)569 find_edge_node(const thmap_t *thmap, thmap_query_t *query,
570 const void * restrict key, size_t len, unsigned *slot)
571 {
572 thmap_ptr_t root_slot;
573 thmap_inode_t *parent;
574 thmap_ptr_t node;
575 unsigned off;
576
577 ASSERT(query->level == 0);
578
579 /* Consume from prior release in root_try_put(). */
580 root_slot = atomic_load_consume(&thmap->root[query->rslot]);
581 parent = THMAP_NODE(thmap, root_slot);
582 if (!parent) {
583 return NULL;
584 }
585 descend:
586 off = hashval_getslot(query, key, len);
587 /* Consume from prior release in thmap_put(). */
588 node = atomic_load_consume(&parent->slots[off]);
589
590 /* Descend the tree until we find a leaf or empty slot. */
591 if (node && THMAP_INODE_P(node)) {
592 parent = THMAP_NODE(thmap, node);
593 query->level++;
594 goto descend;
595 }
596 /*
597 * NODE_DELETED does not become stale until GC runs, which
598 * cannot happen while we are in the middle of an operation,
599 * hence relaxed ordering.
600 */
601 if (atomic_load_relaxed(&parent->state) & NODE_DELETED) {
602 return NULL;
603 }
604 *slot = off;
605 return parent;
606 }
607
608 /*
609 * find_edge_node_locked: traverse the tree, like find_edge_node(),
610 * but attempt to lock the edge node.
611 *
612 * => Returns NULL if the deleted node is found. This indicates that
613 * the caller must re-try from the root, as the root slot might have
614 * changed too.
615 */
616 static thmap_inode_t *
find_edge_node_locked(const thmap_t * thmap,thmap_query_t * query,const void * restrict key,size_t len,unsigned * slot)617 find_edge_node_locked(const thmap_t *thmap, thmap_query_t *query,
618 const void * restrict key, size_t len, unsigned *slot)
619 {
620 thmap_inode_t *node;
621 thmap_ptr_t target;
622 retry:
623 /*
624 * Find the edge node and lock it! Re-check the state since
625 * the tree might change by the time we acquire the lock.
626 */
627 node = find_edge_node(thmap, query, key, len, slot);
628 if (!node) {
629 /* The root slot is empty -- let the caller decide. */
630 query->level = 0;
631 return NULL;
632 }
633 lock_node(node);
634 if (__predict_false(atomic_load_relaxed(&node->state) & NODE_DELETED)) {
635 /*
636 * The node has been deleted. The tree might have a new
637 * shape now, therefore we must re-start from the root.
638 */
639 unlock_node(node);
640 query->level = 0;
641 return NULL;
642 }
643 target = atomic_load_relaxed(&node->slots[*slot]);
644 if (__predict_false(target && THMAP_INODE_P(target))) {
645 /*
646 * The target slot has been changed and it is now an
647 * intermediate node. Re-start from the top internode.
648 */
649 unlock_node(node);
650 query->level = 0;
651 goto retry;
652 }
653 return node;
654 }
655
656 /*
657 * thmap_get: lookup a value given the key.
658 */
659 void *
thmap_get(thmap_t * thmap,const void * key,size_t len)660 thmap_get(thmap_t *thmap, const void *key, size_t len)
661 {
662 thmap_query_t query;
663 thmap_inode_t *parent;
664 thmap_leaf_t *leaf;
665 unsigned slot;
666
667 hashval_init(&query, thmap->seed, key, len);
668 parent = find_edge_node(thmap, &query, key, len, &slot);
669 if (!parent) {
670 return NULL;
671 }
672 leaf = get_leaf(thmap, parent, slot);
673 if (!leaf) {
674 return NULL;
675 }
676 if (!key_cmp_p(thmap, leaf, key, len)) {
677 return NULL;
678 }
679 return leaf->val;
680 }
681
682 /*
683 * thmap_put: insert a value given the key.
684 *
685 * => If the key is already present, return the associated value.
686 * => Otherwise, on successful insert, return the given value.
687 */
688 void *
thmap_put(thmap_t * thmap,const void * key,size_t len,void * val)689 thmap_put(thmap_t *thmap, const void *key, size_t len, void *val)
690 {
691 thmap_query_t query;
692 thmap_leaf_t *leaf, *other;
693 thmap_inode_t *parent, *child;
694 unsigned slot, other_slot;
695 thmap_ptr_t target;
696
697 /*
698 * First, pre-allocate and initialize the leaf node.
699 */
700 leaf = leaf_create(thmap, key, len, val);
701 if (__predict_false(!leaf)) {
702 return NULL;
703 }
704 hashval_init(&query, thmap->seed, key, len);
705 retry:
706 /*
707 * Try to insert into the root first, if its slot is empty.
708 */
709 switch (root_try_put(thmap, &query, leaf)) {
710 case 0:
711 /* Success: the leaf was inserted; no locking involved. */
712 return val;
713 case EEXIST:
714 break;
715 case ENOMEM:
716 return NULL;
717 default:
718 __unreachable();
719 }
720
721 /*
722 * Release node via store in node_insert (*) to subsequent
723 * consume in get_leaf() or find_edge_node().
724 */
725 atomic_thread_fence(memory_order_release);
726
727 /*
728 * Find the edge node and the target slot.
729 */
730 parent = find_edge_node_locked(thmap, &query, key, len, &slot);
731 if (!parent) {
732 goto retry;
733 }
734 target = atomic_load_relaxed(&parent->slots[slot]); // tagged offset
735 if (THMAP_INODE_P(target)) {
736 /*
737 * Empty slot: simply insert the new leaf. The release
738 * fence is already issued for us.
739 */
740 target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT;
741 node_insert(parent, slot, target); /* (*) */
742 goto out;
743 }
744
745 /*
746 * Collision or duplicate.
747 */
748 other = THMAP_NODE(thmap, target);
749 if (key_cmp_p(thmap, other, key, len)) {
750 /*
751 * Duplicate. Free the pre-allocated leaf and
752 * return the present value.
753 */
754 leaf_free(thmap, leaf);
755 val = other->val;
756 goto out;
757 }
758 descend:
759 /*
760 * Collision -- expand the tree. Create an intermediate node
761 * which will be locked (NODE_LOCKED) for us. At this point,
762 * we advance to the next level.
763 */
764 child = node_create(thmap, parent);
765 if (__predict_false(!child)) {
766 leaf_free(thmap, leaf);
767 val = NULL;
768 goto out;
769 }
770 query.level++;
771
772 /*
773 * Insert the other (colliding) leaf first. The new child is
774 * not yet published, so memory order is relaxed.
775 */
776 other_slot = hashval_getleafslot(thmap, other, query.level);
777 target = THMAP_GETOFF(thmap, other) | THMAP_LEAF_BIT;
778 node_insert(child, other_slot, target);
779
780 /*
781 * Insert the intermediate node into the parent node.
782 * It becomes the new parent for the our new leaf.
783 *
784 * Ensure that stores to the child (and leaf) reach global
785 * visibility before it gets inserted to the parent, as
786 * consumed by get_leaf() or find_edge_node().
787 */
788 atomic_store_release(&parent->slots[slot], THMAP_GETOFF(thmap, child));
789
790 unlock_node(parent);
791 ASSERT(node_locked_p(child));
792 parent = child;
793
794 /*
795 * Get the new slot and check for another collision
796 * at the next level.
797 */
798 slot = hashval_getslot(&query, key, len);
799 if (slot == other_slot) {
800 /* Another collision -- descend and expand again. */
801 goto descend;
802 }
803
804 /*
805 * Insert our new leaf once we expanded enough. The release
806 * fence is already issued for us.
807 */
808 target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT;
809 node_insert(parent, slot, target); /* (*) */
810 out:
811 unlock_node(parent);
812 return val;
813 }
814
815 /*
816 * thmap_del: remove the entry given the key.
817 */
818 void *
thmap_del(thmap_t * thmap,const void * key,size_t len)819 thmap_del(thmap_t *thmap, const void *key, size_t len)
820 {
821 thmap_query_t query;
822 thmap_leaf_t *leaf;
823 thmap_inode_t *parent;
824 unsigned slot;
825 void *val;
826
827 hashval_init(&query, thmap->seed, key, len);
828 parent = find_edge_node_locked(thmap, &query, key, len, &slot);
829 if (!parent) {
830 /* Root slot empty: not found. */
831 return NULL;
832 }
833 leaf = get_leaf(thmap, parent, slot);
834 if (!leaf || !key_cmp_p(thmap, leaf, key, len)) {
835 /* Not found. */
836 unlock_node(parent);
837 return NULL;
838 }
839
840 /* Remove the leaf. */
841 ASSERT(THMAP_NODE(thmap, atomic_load_relaxed(&parent->slots[slot]))
842 == leaf);
843 node_remove(parent, slot);
844
845 /*
846 * Collapse the levels if removing the last item.
847 */
848 while (query.level &&
849 NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) {
850 thmap_inode_t *node = parent;
851
852 ASSERT(atomic_load_relaxed(&node->state) == NODE_LOCKED);
853
854 /*
855 * Ascend one level up.
856 * => Mark our current parent as deleted.
857 * => Lock the parent one level up.
858 */
859 query.level--;
860 slot = hashval_getslot(&query, key, len);
861 parent = THMAP_NODE(thmap, node->parent);
862 ASSERT(parent != NULL);
863
864 lock_node(parent);
865 ASSERT((atomic_load_relaxed(&parent->state) & NODE_DELETED)
866 == 0);
867
868 /*
869 * Lock is exclusive, so nobody else can be writing at
870 * the same time, and no need for atomic R/M/W, but
871 * readers may read without the lock and so need atomic
872 * load/store. No ordering here needed because the
873 * entry itself stays valid until GC.
874 */
875 atomic_store_relaxed(&node->state,
876 atomic_load_relaxed(&node->state) | NODE_DELETED);
877 unlock_node(node); // memory_order_release
878
879 ASSERT(THMAP_NODE(thmap,
880 atomic_load_relaxed(&parent->slots[slot])) == node);
881 node_remove(parent, slot);
882
883 /* Stage the removed node for G/C. */
884 stage_mem_gc(thmap, THMAP_GETOFF(thmap, node), THMAP_INODE_LEN);
885 }
886
887 /*
888 * If the top node is empty, then we need to remove it from the
889 * root level. Mark the node as deleted and clear the slot.
890 *
891 * Note: acquiring the lock on the top node effectively prevents
892 * the root slot from changing.
893 */
894 if (NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) {
895 const unsigned rslot = query.rslot;
896 const thmap_ptr_t nptr =
897 atomic_load_relaxed(&thmap->root[rslot]);
898
899 ASSERT(query.level == 0);
900 ASSERT(parent->parent == THMAP_NULL);
901 ASSERT(THMAP_GETOFF(thmap, parent) == nptr);
902
903 /* Mark as deleted and remove from the root-level slot. */
904 atomic_store_relaxed(&parent->state,
905 atomic_load_relaxed(&parent->state) | NODE_DELETED);
906 atomic_store_relaxed(&thmap->root[rslot], THMAP_NULL);
907
908 stage_mem_gc(thmap, nptr, THMAP_INODE_LEN);
909 }
910 unlock_node(parent);
911
912 /*
913 * Save the value and stage the leaf for G/C.
914 */
915 val = leaf->val;
916 if ((thmap->flags & THMAP_NOCOPY) == 0) {
917 stage_mem_gc(thmap, leaf->key, leaf->len);
918 }
919 stage_mem_gc(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t));
920 return val;
921 }
922
923 /*
924 * G/C routines.
925 */
926
927 static void
stage_mem_gc(thmap_t * thmap,uintptr_t addr,size_t len)928 stage_mem_gc(thmap_t *thmap, uintptr_t addr, size_t len)
929 {
930 thmap_gc_t *head, *gc;
931
932 gc = kmem_intr_alloc(sizeof(thmap_gc_t), KM_NOSLEEP);
933 gc->addr = addr;
934 gc->len = len;
935 retry:
936 head = atomic_load_relaxed(&thmap->gc_list);
937 gc->next = head; // not yet published
938
939 /* Release to subsequent acquire in thmap_stage_gc(). */
940 if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->gc_list, &head, gc,
941 memory_order_release, memory_order_relaxed)) {
942 goto retry;
943 }
944 }
945
946 void *
thmap_stage_gc(thmap_t * thmap)947 thmap_stage_gc(thmap_t *thmap)
948 {
949 /* Acquire from prior release in stage_mem_gc(). */
950 return atomic_exchange_explicit(&thmap->gc_list, NULL,
951 memory_order_acquire);
952 }
953
954 void
thmap_gc(thmap_t * thmap,void * ref)955 thmap_gc(thmap_t *thmap, void *ref)
956 {
957 thmap_gc_t *gc = ref;
958
959 while (gc) {
960 thmap_gc_t *next = gc->next;
961 thmap->ops->free(gc->addr, gc->len);
962 kmem_intr_free(gc, sizeof(thmap_gc_t));
963 gc = next;
964 }
965 }
966
967 /*
968 * thmap_create: construct a new trie-hash map object.
969 */
970 thmap_t *
thmap_create(uintptr_t baseptr,const thmap_ops_t * ops,unsigned flags)971 thmap_create(uintptr_t baseptr, const thmap_ops_t *ops, unsigned flags)
972 {
973 thmap_t *thmap;
974 uintptr_t root;
975
976 /*
977 * Setup the map object.
978 */
979 if (!THMAP_ALIGNED_P(baseptr)) {
980 return NULL;
981 }
982 thmap = kmem_zalloc(sizeof(thmap_t), KM_SLEEP);
983 thmap->baseptr = baseptr;
984 thmap->ops = ops ? ops : &thmap_default_ops;
985 thmap->flags = flags;
986
987 if ((thmap->flags & THMAP_SETROOT) == 0) {
988 /* Allocate the root level. */
989 root = thmap->ops->alloc(THMAP_ROOT_LEN);
990 thmap->root = THMAP_GETPTR(thmap, root);
991 if (!thmap->root) {
992 kmem_free(thmap, sizeof(thmap_t));
993 return NULL;
994 }
995 memset(thmap->root, 0, THMAP_ROOT_LEN);
996 }
997
998 cprng_strong(kern_cprng, thmap->seed, sizeof thmap->seed, 0);
999
1000 return thmap;
1001 }
1002
1003 int
thmap_setroot(thmap_t * thmap,uintptr_t root_off)1004 thmap_setroot(thmap_t *thmap, uintptr_t root_off)
1005 {
1006 if (thmap->root) {
1007 return -1;
1008 }
1009 thmap->root = THMAP_GETPTR(thmap, root_off);
1010 return 0;
1011 }
1012
1013 uintptr_t
thmap_getroot(const thmap_t * thmap)1014 thmap_getroot(const thmap_t *thmap)
1015 {
1016 return THMAP_GETOFF(thmap, thmap->root);
1017 }
1018
1019 void
thmap_destroy(thmap_t * thmap)1020 thmap_destroy(thmap_t *thmap)
1021 {
1022 uintptr_t root = THMAP_GETOFF(thmap, thmap->root);
1023 void *ref;
1024
1025 ref = thmap_stage_gc(thmap);
1026 thmap_gc(thmap, ref);
1027
1028 if ((thmap->flags & THMAP_SETROOT) == 0) {
1029 thmap->ops->free(root, THMAP_ROOT_LEN);
1030 }
1031 kmem_free(thmap, sizeof(thmap_t));
1032 }
1033