1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2013 EMC Corp.
5 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
6 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 *
30 */
31
32 /*
33 * Path-compressed radix trie implementation.
34 *
35 * The implementation takes into account the following rationale:
36 * - Size of the nodes should be as small as possible but still big enough
37 * to avoid a large maximum depth for the trie. This is a balance
38 * between the necessity to not wire too much physical memory for the nodes
39 * and the necessity to avoid too much cache pollution during the trie
40 * operations.
41 * - There is not a huge bias toward the number of lookup operations over
42 * the number of insert and remove operations. This basically implies
43 * that optimizations supposedly helping one operation but hurting the
44 * other might be carefully evaluated.
45 * - On average not many nodes are expected to be fully populated, hence
46 * level compression may just complicate things.
47 */
48
49 #include <sys/cdefs.h>
50 #include "opt_ddb.h"
51
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/kernel.h>
55 #include <sys/libkern.h>
56 #include <sys/pctrie.h>
57 #include <sys/proc.h> /* smr.h depends on struct thread. */
58 #include <sys/smr.h>
59 #include <sys/smr_types.h>
60
61 #ifdef DDB
62 #include <ddb/ddb.h>
63 #endif
64
/* Mask extracting one slot index from a key at a node's level. */
#define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
/* Index of the highest level needed to cover a full 64-bit key space. */
#define	PCTRIE_LIMIT	(howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)

/*
 * Popmap type: one bit per child slot, so it must be wide enough to hold
 * PCTRIE_COUNT (== 1 << PCTRIE_WIDTH) bits.
 */
#if PCTRIE_WIDTH == 3
typedef uint8_t pn_popmap_t;
#elif PCTRIE_WIDTH == 4
typedef uint16_t pn_popmap_t;
#elif PCTRIE_WIDTH == 5
typedef uint32_t pn_popmap_t;
#else
#error Unsupported width
#endif
/* Popmaps are manipulated with int-width operations (ffs(), shifts). */
_Static_assert(sizeof(pn_popmap_t) <= sizeof(int),
    "pn_popmap_t too wide");

struct pctrie_node;
typedef SMR_POINTER(struct pctrie_node *) smr_pctnode_t;

struct pctrie_node {
	uint64_t	pn_owner;	/* Owner of record. */
	pn_popmap_t	pn_popmap;	/* Valid children. */
	uint8_t		pn_clev;	/* Level * WIDTH. */
	smr_pctnode_t	pn_child[PCTRIE_COUNT];	/* Child nodes. */
};

/*
 * How an access to the trie is serialized: within an SMR read section,
 * under the consumer's lock, or with no concurrent access possible at all
 * (e.g. while constructing a node not yet visible to readers).
 */
enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED };

static __inline void pctrie_node_store(smr_pctnode_t *p, void *val,
    enum pctrie_access access);
94
95 /*
96 * Map index to an array position for the children of node,
97 */
98 static __inline int
pctrie_slot(struct pctrie_node * node,uint64_t index)99 pctrie_slot(struct pctrie_node *node, uint64_t index)
100 {
101 return ((index >> node->pn_clev) & PCTRIE_MASK);
102 }
103
104 /*
105 * Returns true if index does not belong to the specified node. Otherwise,
106 * sets slot value, and returns false.
107 */
108 static __inline bool
pctrie_keybarr(struct pctrie_node * node,uint64_t index,int * slot)109 pctrie_keybarr(struct pctrie_node *node, uint64_t index, int *slot)
110 {
111 index = (index - node->pn_owner) >> node->pn_clev;
112 if (index >= PCTRIE_COUNT)
113 return (true);
114 *slot = index;
115 return (false);
116 }
117
/*
 * Validate a node that is about to be handed back for freeing.  Under
 * INVARIANTS, assert that at most one child is marked valid in the popmap
 * and that every slot not marked valid holds PCTRIE_NULL.  Without
 * INVARIANTS this is a no-op.
 */
static __inline void
pctrie_node_put(struct pctrie_node *node)
{
#ifdef INVARIANTS
	int slot;

	/* powerof2() accepts zero or a single set bit: at most one child. */
	KASSERT(powerof2(node->pn_popmap),
	    ("pctrie_node_put: node %p has too many children %04x", node,
	    node->pn_popmap));
	for (slot = 0; slot < PCTRIE_COUNT; slot++) {
		if ((node->pn_popmap & (1 << slot)) != 0)
			continue;
		KASSERT(smr_unserialized_load(&node->pn_child[slot], true) ==
		    PCTRIE_NULL,
		    ("pctrie_node_put: node %p has a child", node));
	}
#endif
}
139
140 /*
141 * Fetch a node pointer from a slot.
142 */
143 static __inline struct pctrie_node *
pctrie_node_load(smr_pctnode_t * p,smr_t smr,enum pctrie_access access)144 pctrie_node_load(smr_pctnode_t *p, smr_t smr, enum pctrie_access access)
145 {
146 switch (access) {
147 case PCTRIE_UNSERIALIZED:
148 return (smr_unserialized_load(p, true));
149 case PCTRIE_LOCKED:
150 return (smr_serialized_load(p, true));
151 case PCTRIE_SMR:
152 return (smr_entered_load(p, smr));
153 }
154 __assert_unreachable();
155 }
156
157 static __inline void
pctrie_node_store(smr_pctnode_t * p,void * v,enum pctrie_access access)158 pctrie_node_store(smr_pctnode_t *p, void *v, enum pctrie_access access)
159 {
160 switch (access) {
161 case PCTRIE_UNSERIALIZED:
162 smr_unserialized_store(p, v, true);
163 break;
164 case PCTRIE_LOCKED:
165 smr_serialized_store(p, v, true);
166 break;
167 case PCTRIE_SMR:
168 panic("%s: Not supported in SMR section.", __func__);
169 break;
170 default:
171 __assert_unreachable();
172 break;
173 }
174 }
175
/*
 * Get the root node for a tree.
 *
 * The root pointer is loaded like any other child pointer; the cast treats
 * ptree->pt_root as an smr_pctnode_t.
 */
static __inline struct pctrie_node *
pctrie_root_load(struct pctrie *ptree, smr_t smr, enum pctrie_access access)
{
	return (pctrie_node_load((smr_pctnode_t *)&ptree->pt_root, smr, access));
}
184
/*
 * Set the root node for a tree.
 *
 * Mirrors pctrie_root_load(): the root pointer is stored with the same
 * discipline as any other child pointer.
 */
static __inline void
pctrie_root_store(struct pctrie *ptree, struct pctrie_node *node,
    enum pctrie_access access)
{
	pctrie_node_store((smr_pctnode_t *)&ptree->pt_root, node, access);
}
194
195 /*
196 * Returns TRUE if the specified node is a leaf and FALSE otherwise.
197 */
198 static __inline bool
pctrie_isleaf(struct pctrie_node * node)199 pctrie_isleaf(struct pctrie_node *node)
200 {
201
202 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
203 }
204
205 /*
206 * Returns val with leaf bit set.
207 */
208 static __inline void *
pctrie_toleaf(uint64_t * val)209 pctrie_toleaf(uint64_t *val)
210 {
211 return ((void *)((uintptr_t)val | PCTRIE_ISLEAF));
212 }
213
214 /*
215 * Returns the associated val extracted from node.
216 */
217 static __inline uint64_t *
pctrie_toval(struct pctrie_node * node)218 pctrie_toval(struct pctrie_node *node)
219 {
220
221 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
222 }
223
/*
 * Make 'child' a child of 'node', in the slot selected by 'index'.
 */
static __inline void
pctrie_addnode(struct pctrie_node *node, uint64_t index,
    struct pctrie_node *child, enum pctrie_access access)
{
	int slot;

	slot = pctrie_slot(node, index);
	/* Store the child pointer first, then record the slot as valid. */
	pctrie_node_store(&node->pn_child[slot], child, access);
	/* XOR sets the bit; the KASSERT checks it was previously clear. */
	node->pn_popmap ^= 1 << slot;
	KASSERT((node->pn_popmap & (1 << slot)) != 0,
	    ("%s: bad popmap slot %d in node %p", __func__, slot, node));
}
239
240 /*
241 * pctrie node zone initializer.
242 */
243 int
pctrie_zone_init(void * mem,int size __unused,int flags __unused)244 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
245 {
246 struct pctrie_node *node;
247
248 node = mem;
249 node->pn_popmap = 0;
250 for (int i = 0; i < nitems(node->pn_child); i++)
251 pctrie_node_store(&node->pn_child[i], PCTRIE_NULL,
252 PCTRIE_UNSERIALIZED);
253 return (0);
254 }
255
/*
 * Return the size of a trie node, for consumers setting up node allocation
 * (see pctrie_zone_init()) without visibility into the node layout.
 */
size_t
pctrie_node_size(void)
{

	return (sizeof(struct pctrie_node));
}
262
/*
 * Modes for pctrie_insert_lookup_compound(): while descending to the
 * insertion point, optionally record the deepest node that is an ancestor
 * of the next lesser (LT) or next greater (GT) entry, or record nothing
 * (NONE).
 */
enum pctrie_insert_neighbor_mode {
	PCTRIE_INSERT_NEIGHBOR_NONE,
	PCTRIE_INSERT_NEIGHBOR_LT,
	PCTRIE_INSERT_NEIGHBOR_GT,
};
268
/*
 * Look for where to insert the key-value pair into the trie.  Complete the
 * insertion if it replaces a null leaf.  Return the insertion location if the
 * insertion needs to be completed by the caller; otherwise return NULL.
 *
 * If the key is already present in the trie, populate *found_out as if by
 * pctrie_lookup().
 *
 * With mode PCTRIE_INSERT_NEIGHBOR_GT or PCTRIE_INSERT_NEIGHBOR_LT, set
 * *neighbor_out to the lowest level node we encounter during the insert lookup
 * that is a parent of the next greater or lesser entry.  The value is not
 * defined if the key was already present in the trie.
 *
 * Note that mode is expected to be a compile-time constant, and this procedure
 * is expected to be inlined into callers with extraneous code optimized out.
 */
static __always_inline void *
pctrie_insert_lookup_compound(struct pctrie *ptree, uint64_t *val,
    uint64_t **found_out, struct pctrie_node **neighbor_out,
    enum pctrie_insert_neighbor_mode mode)
{
	uint64_t index;
	struct pctrie_node *node, *parent;
	int slot;

	index = *val;

	/*
	 * The owner of record for root is not really important because it
	 * will never be used.
	 */
	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
	parent = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if (node == PCTRIE_NULL) {
				/* Empty slot: complete the insert in place. */
				if (parent == NULL)
					ptree->pt_root = pctrie_toleaf(val);
				else
					pctrie_addnode(parent, index,
					    pctrie_toleaf(val), PCTRIE_LOCKED);
				return (NULL);
			}
			if (*pctrie_toval(node) == index) {
				/* Key already present; report and stop. */
				*found_out = pctrie_toval(node);
				return (NULL);
			}
			break;
		}
		if (pctrie_keybarr(node, index, &slot))
			break;
		/*
		 * Descend.  If we're tracking the next neighbor and this node
		 * contains a neighboring entry in the right direction, record
		 * it.
		 */
		if (mode == PCTRIE_INSERT_NEIGHBOR_LT) {
			/* Any valid slot below 'slot' holds a lesser entry. */
			if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
				*neighbor_out = node;
		} else if (mode == PCTRIE_INSERT_NEIGHBOR_GT) {
			/* Any valid slot above 'slot' holds a greater entry. */
			if ((node->pn_popmap >> slot) > 1)
				*neighbor_out = node;
		}
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}

	/*
	 * The caller will split this node.  If we're tracking the next
	 * neighbor, record the old node if the old entry is in the right
	 * direction.  Note: for a non-leaf 'node', pctrie_toval() reads the
	 * pn_owner field, which is first in the node.
	 */
	if (mode == PCTRIE_INSERT_NEIGHBOR_LT) {
		if (*pctrie_toval(node) < index)
			*neighbor_out = node;
	} else if (mode == PCTRIE_INSERT_NEIGHBOR_GT) {
		if (*pctrie_toval(node) > index)
			*neighbor_out = node;
	}

	/*
	 * 'node' must be replaced in the tree with a new branch node, with
	 * children 'node' and 'val'.  Return the place that points to 'node'
	 * now, and will point to the new branching node later.
	 */
	return ((parent != NULL) ? &parent->pn_child[slot]:
	    (smr_pctnode_t *)&ptree->pt_root);
}
358
359 /*
360 * Wrap pctrie_insert_lookup_compound to implement a strict insertion. Panic
361 * if the key already exists, and do not look for neighboring entries.
362 */
363 void *
pctrie_insert_lookup_strict(struct pctrie * ptree,uint64_t * val)364 pctrie_insert_lookup_strict(struct pctrie *ptree, uint64_t *val)
365 {
366 void *parentp;
367 uint64_t *found;
368
369 found = NULL;
370 parentp = pctrie_insert_lookup_compound(ptree, val, &found, NULL,
371 PCTRIE_INSERT_NEIGHBOR_NONE);
372 if (__predict_false(found != NULL))
373 panic("%s: key %jx is already present", __func__,
374 (uintmax_t)*val);
375 return (parentp);
376 }
377
378 /*
379 * Wrap pctrie_insert_lookup_compound to implement find-or-insert. Do not look
380 * for neighboring entries.
381 */
382 void *
pctrie_insert_lookup(struct pctrie * ptree,uint64_t * val,uint64_t ** found_out)383 pctrie_insert_lookup(struct pctrie *ptree, uint64_t *val,
384 uint64_t **found_out)
385 {
386 *found_out = NULL;
387 return (pctrie_insert_lookup_compound(ptree, val, found_out, NULL,
388 PCTRIE_INSERT_NEIGHBOR_NONE));
389 }
390
391 /*
392 * Wrap pctrie_insert_lookup_compound to implement find or insert and find next
393 * greater entry. Find a subtree that contains the next entry greater than the
394 * newly-inserted or to-be-inserted entry.
395 */
396 void *
pctrie_insert_lookup_gt(struct pctrie * ptree,uint64_t * val,uint64_t ** found_out,struct pctrie_node ** neighbor_out)397 pctrie_insert_lookup_gt(struct pctrie *ptree, uint64_t *val,
398 uint64_t **found_out, struct pctrie_node **neighbor_out)
399 {
400 *found_out = NULL;
401 *neighbor_out = NULL;
402 return (pctrie_insert_lookup_compound(ptree, val, found_out,
403 neighbor_out, PCTRIE_INSERT_NEIGHBOR_GT));
404 }
405
406 /*
407 * Wrap pctrie_insert_lookup_compound to implement find or insert and find next
408 * lesser entry. Find a subtree that contains the next entry less than the
409 * newly-inserted or to-be-inserted entry.
410 */
411 void *
pctrie_insert_lookup_lt(struct pctrie * ptree,uint64_t * val,uint64_t ** found_out,struct pctrie_node ** neighbor_out)412 pctrie_insert_lookup_lt(struct pctrie *ptree, uint64_t *val,
413 uint64_t **found_out, struct pctrie_node **neighbor_out)
414 {
415 *found_out = NULL;
416 *neighbor_out = NULL;
417 return (pctrie_insert_lookup_compound(ptree, val, found_out,
418 neighbor_out, PCTRIE_INSERT_NEIGHBOR_LT));
419 }
420
/*
 * Uses new node to insert key-value pair into the trie at given location.
 * 'parentp' is the slot returned earlier by the insert-lookup step, and
 * 'parent' is the caller-allocated branch node to splice in there.
 */
void
pctrie_insert_node(void *parentp, struct pctrie_node *parent, uint64_t *val)
{
	struct pctrie_node *node;
	uint64_t index, newind;

	/*
	 * Clear the last child pointer of the newly allocated parent.  We want
	 * to clear it after the final section has exited so lookup can not
	 * return false negatives.  It is done here because it will be
	 * cache-cold in the dtor callback.
	 */
	if (parent->pn_popmap != 0) {
		pctrie_node_store(&parent->pn_child[ffs(parent->pn_popmap) - 1],
		    PCTRIE_NULL, PCTRIE_UNSERIALIZED);
		parent->pn_popmap = 0;
	}

	/*
	 * Recover the values of the two children of the new parent node.  If
	 * 'node' is not a leaf, this stores into 'newind' the 'owner' field,
	 * which must be first in the node.
	 */
	index = *val;
	node = pctrie_node_load(parentp, NULL, PCTRIE_UNSERIALIZED);
	newind = *pctrie_toval(node);

	/*
	 * From the highest-order bit where the indexes differ,
	 * compute the highest level in the trie where they differ.  Then,
	 * compute the least index of this subtrie.
	 */
	_Static_assert(sizeof(long long) >= sizeof(uint64_t),
	    "uint64 too wide");
	_Static_assert(sizeof(uint64_t) * NBBY <=
	    (1 << (sizeof(parent->pn_clev) * NBBY)), "pn_clev too narrow");
	parent->pn_clev = rounddown(ilog2(index ^ newind), PCTRIE_WIDTH);
	/*
	 * Compute pn_owner in two steps: assigning PCTRIE_COUNT to the
	 * uint64_t pn_owner first keeps the subsequent shift in 64-bit
	 * arithmetic rather than int width.
	 */
	parent->pn_owner = PCTRIE_COUNT;
	parent->pn_owner = index & -(parent->pn_owner << parent->pn_clev);

	/* These writes are not yet visible due to ordering. */
	pctrie_addnode(parent, index, pctrie_toleaf(val), PCTRIE_UNSERIALIZED);
	pctrie_addnode(parent, newind, node, PCTRIE_UNSERIALIZED);
	/* Synchronize to make the above visible. */
	pctrie_node_store(parentp, parent, PCTRIE_LOCKED);
}
471
/*
 * Returns the value stored at the index.  If the index is not present,
 * NULL is returned.
 *
 * 'access' selects the load discipline and is expected to be a
 * compile-time constant, so the wrappers below inline to a single variant.
 */
static __always_inline uint64_t *
_pctrie_lookup(struct pctrie *ptree, uint64_t index, smr_t smr,
    enum pctrie_access access)
{
	struct pctrie_node *node;
	uint64_t *m;
	int slot;

	node = pctrie_root_load(ptree, smr, access);
	for (;;) {
		if (pctrie_isleaf(node)) {
			/* Leaf reached: match only if the stored key is it. */
			if ((m = pctrie_toval(node)) != NULL && *m == index)
				return (m);
			break;
		}
		if (pctrie_keybarr(node, index, &slot))
			break;
		node = pctrie_node_load(&node->pn_child[slot], smr, access);
	}
	return (NULL);
}
497
/*
 * Returns the value stored at the index, assuming access is externally
 * synchronized by a lock.
 *
 * If the index is not present, NULL is returned.
 */
uint64_t *
pctrie_lookup(struct pctrie *ptree, uint64_t index)
{
	/* No SMR needed: the caller's lock serializes against writers. */
	return (_pctrie_lookup(ptree, index, NULL, PCTRIE_LOCKED));
}
509
/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 */
uint64_t *
pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t index, smr_t smr)
{
	uint64_t *res;

	/* The SMR read section brackets the entire traversal. */
	smr_enter(smr);
	res = _pctrie_lookup(ptree, index, smr, PCTRIE_SMR);
	smr_exit(smr);
	return (res);
}
525
/*
 * Returns the value with the least index that is greater than or equal to the
 * specified index, or NULL if there are no such values.
 *
 * Requires that access be externally synchronized by a lock.
 */
static __inline uint64_t *
pctrie_lookup_ge_node(struct pctrie_node *node, uint64_t index)
{
	struct pctrie_node *succ;
	uint64_t *m;
	int slot;

	/*
	 * Descend the trie as if performing an ordinary lookup for the
	 * specified value.  However, unlike an ordinary lookup, as we descend
	 * the trie, we use "succ" to remember the last branching-off point,
	 * that is, the interior node under which the least value that is both
	 * outside our current path down the trie and greater than the specified
	 * index resides.  (The node's popmap makes it fast and easy to
	 * recognize a branching-off point.)  If our ordinary lookup fails to
	 * yield a value that is greater than or equal to the specified index,
	 * then we will exit this loop and perform a lookup starting from
	 * "succ".  If "succ" is not NULL, then that lookup is guaranteed to
	 * succeed.
	 */
	succ = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if ((m = pctrie_toval(node)) != NULL && *m >= index)
				return (m);
			break;
		}
		if (pctrie_keybarr(node, index, &slot)) {
			/*
			 * If all values in this subtree are > index, then the
			 * least value in this subtree is the answer.
			 */
			if (node->pn_owner > index)
				succ = node;
			break;
		}

		/*
		 * Just in case the next search step leads to a subtree of all
		 * values < index, check popmap to see if a next bigger step, to
		 * a subtree of all pages with values > index, is available.  If
		 * so, remember to restart the search here.
		 */
		if ((node->pn_popmap >> slot) > 1)
			succ = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}

	/*
	 * Restart the search from the last place visited in the subtree that
	 * included some values > index, if there was such a place.
	 */
	if (succ == NULL)
		return (NULL);
	/*
	 * If "succ" is the node where the key barrier failed, its whole
	 * subtree is > index, so descend within it directly; otherwise step
	 * sideways to a bigger sibling first.
	 */
	if (succ != node) {
		/*
		 * Take a step to the next bigger sibling of the node chosen
		 * last time.  In that subtree, all values > index.
		 */
		slot = pctrie_slot(succ, index) + 1;
		KASSERT((succ->pn_popmap >> slot) != 0,
		    ("%s: no popmap siblings past slot %d in node %p",
		    __func__, slot, succ));
		slot += ffs(succ->pn_popmap >> slot) - 1;
		succ = pctrie_node_load(&succ->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}

	/*
	 * Find the value in the subtree rooted at "succ" with the least index.
	 */
	while (!pctrie_isleaf(succ)) {
		KASSERT(succ->pn_popmap != 0,
		    ("%s: no popmap children in node %p", __func__, succ));
		slot = ffs(succ->pn_popmap) - 1;
		succ = pctrie_node_load(&succ->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	return (pctrie_toval(succ));
}
613
614 uint64_t *
pctrie_lookup_ge(struct pctrie * ptree,uint64_t index)615 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
616 {
617 return (pctrie_lookup_ge_node(
618 pctrie_root_load(ptree, NULL, PCTRIE_LOCKED), index));
619 }
620
621 uint64_t *
pctrie_subtree_lookup_gt(struct pctrie_node * node,uint64_t index)622 pctrie_subtree_lookup_gt(struct pctrie_node *node, uint64_t index)
623 {
624 if (node == NULL || index + 1 == 0)
625 return (NULL);
626 return (pctrie_lookup_ge_node(node, index + 1));
627 }
628
#ifdef INVARIANTS
/*
 * Assert that a subtree lookup-gt result matches what the same lookup
 * starting from the trie root would have produced.
 */
void
pctrie_subtree_lookup_gt_assert(struct pctrie_node *node, uint64_t index,
    struct pctrie *ptree, uint64_t *res)
{
	uint64_t *expected;

	/* index + 1 wraps to 0 when index is UINT64_MAX: nothing greater. */
	if (index + 1 == 0)
		expected = NULL;
	else
		expected = pctrie_lookup_ge(ptree, index + 1);
	KASSERT(res == expected,
	    ("pctrie subtree lookup gt result different from root lookup: "
	    "ptree %p, index %ju, subtree %p, found %p, expected %p", ptree,
	    (uintmax_t)index, node, res, expected));
}
#endif
646
/*
 * Returns the value with the greatest index that is less than or equal to the
 * specified index, or NULL if there are no such values.
 *
 * Requires that access be externally synchronized by a lock.
 */
static __inline uint64_t *
pctrie_lookup_le_node(struct pctrie_node *node, uint64_t index)
{
	struct pctrie_node *pred;
	uint64_t *m;
	int slot;

	/*
	 * Mirror the implementation of pctrie_lookup_ge_node, described above:
	 * "pred" remembers the last branching-off point toward lesser values.
	 */
	pred = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if ((m = pctrie_toval(node)) != NULL && *m <= index)
				return (m);
			break;
		}
		if (pctrie_keybarr(node, index, &slot)) {
			/* All values in this subtree are < index. */
			if (node->pn_owner < index)
				pred = node;
			break;
		}
		/* Any valid slot below 'slot' leads to lesser values. */
		if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
			pred = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	if (pred == NULL)
		return (NULL);
	if (pred != node) {
		/*
		 * Step to the greatest smaller sibling; in that subtree, all
		 * values < index.
		 */
		slot = pctrie_slot(pred, index);
		KASSERT((pred->pn_popmap & ((1 << slot) - 1)) != 0,
		    ("%s: no popmap siblings before slot %d in node %p",
		    __func__, slot, pred));
		slot = ilog2(pred->pn_popmap & ((1 << slot) - 1));
		pred = pctrie_node_load(&pred->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	/* Descend to the value with the greatest index in this subtree. */
	while (!pctrie_isleaf(pred)) {
		KASSERT(pred->pn_popmap != 0,
		    ("%s: no popmap children in node %p", __func__, pred));
		slot = ilog2(pred->pn_popmap);
		pred = pctrie_node_load(&pred->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	return (pctrie_toval(pred));
}
700
701 uint64_t *
pctrie_lookup_le(struct pctrie * ptree,uint64_t index)702 pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
703 {
704 return (pctrie_lookup_le_node(
705 pctrie_root_load(ptree, NULL, PCTRIE_LOCKED), index));
706 }
707
708 uint64_t *
pctrie_subtree_lookup_lt(struct pctrie_node * node,uint64_t index)709 pctrie_subtree_lookup_lt(struct pctrie_node *node, uint64_t index)
710 {
711 if (node == NULL || index == 0)
712 return (NULL);
713 return (pctrie_lookup_le_node(node, index - 1));
714 }
715
#ifdef INVARIANTS
/*
 * Assert that a subtree lookup-lt result matches what the same lookup
 * starting from the trie root would have produced.
 */
void
pctrie_subtree_lookup_lt_assert(struct pctrie_node *node, uint64_t index,
    struct pctrie *ptree, uint64_t *res)
{
	uint64_t *expected;

	/* Nothing is less than index 0. */
	if (index == 0)
		expected = NULL;
	else
		expected = pctrie_lookup_le(ptree, index - 1);
	KASSERT(res == expected,
	    ("pctrie subtree lookup lt result different from root lookup: "
	    "ptree %p, index %ju, subtree %p, found %p, expected %p", ptree,
	    (uintmax_t)index, node, res, expected));
}
#endif
733
/*
 * Remove the specified index from the tree, and return the value stored at
 * that index.  If the index is not present, return NULL.
 *
 * If the removal leaves the leaf's parent with exactly one child, that
 * parent is replaced by the child and handed back via *freenode for the
 * caller to free after SMR readers are done with it.
 */
uint64_t *
pctrie_remove_lookup(struct pctrie *ptree, uint64_t index,
    struct pctrie_node **freenode)
{
	struct pctrie_node *child, *node, *parent;
	uint64_t *m;
	int slot;

	*freenode = node = NULL;
	child = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
	/* Walk to the leaf, tracking the last two interior nodes. */
	for (;;) {
		if (pctrie_isleaf(child))
			break;
		parent = node;
		node = child;
		slot = pctrie_slot(node, index);
		child = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	if ((m = pctrie_toval(child)) == NULL || *m != index)
		return (NULL);
	if (node == NULL) {
		/* The leaf was the root; the trie becomes empty. */
		pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_LOCKED);
		return (m);
	}
	KASSERT((node->pn_popmap & (1 << slot)) != 0,
	    ("%s: bad popmap slot %d in node %p",
	    __func__, slot, node));
	node->pn_popmap ^= 1 << slot;
	pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL, PCTRIE_LOCKED);
	/* A power-of-two popmap here means exactly one child remains. */
	if (!powerof2(node->pn_popmap))
		return (m);
	KASSERT(node->pn_popmap != 0, ("%s: bad popmap all zeroes", __func__));
	slot = ffs(node->pn_popmap) - 1;
	child = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED);
	KASSERT(child != PCTRIE_NULL,
	    ("%s: bad popmap slot %d in node %p", __func__, slot, node));
	/* Compress the level: replace 'node' with its sole child. */
	if (parent == NULL)
		pctrie_root_store(ptree, child, PCTRIE_LOCKED);
	else {
		slot = pctrie_slot(parent, index);
		KASSERT(node ==
		    pctrie_node_load(&parent->pn_child[slot], NULL,
		    PCTRIE_LOCKED), ("%s: invalid child value", __func__));
		pctrie_node_store(&parent->pn_child[slot], child,
		    PCTRIE_LOCKED);
	}
	/*
	 * The child is still valid and we can not zero the
	 * pointer until all SMR references are gone.
	 */
	pctrie_node_put(node);
	*freenode = node;
	return (m);
}
793
/*
 * Prune all the leaves of 'node' before its first non-leaf child, make child
 * zero of 'node' point up to 'parent', make 'node' into 'parent' and that
 * non-leaf child into 'node'.  Repeat until a node has been stripped of all
 * children, and mark it for freeing, returning its parent.
 */
static struct pctrie_node *
pctrie_reclaim_prune(struct pctrie_node **pnode,
    struct pctrie_node *parent)
{
	struct pctrie_node *child, *node;
	int slot;

	node = *pnode;
	while (node->pn_popmap != 0) {
		/* Detach the lowest-numbered remaining child. */
		slot = ffs(node->pn_popmap) - 1;
		node->pn_popmap ^= 1 << slot;
		child = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_UNSERIALIZED);
		pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL,
		    PCTRIE_UNSERIALIZED);
		if (pctrie_isleaf(child))
			continue;
		/*
		 * Climb one level down the trie.  Stash the parent pointer
		 * in child slot 0 so pctrie_reclaim_resume() can recover it.
		 */
		pctrie_node_store(&node->pn_child[0], parent,
		    PCTRIE_UNSERIALIZED);
		parent = node;
		node = child;
	}
	/* 'node' is now childless: return it for freeing. */
	*pnode = parent;
	return (node);
}
826
/*
 * Recover the node parent from its first child and continue pruning.
 * The parent pointer was stashed in child slot 0 by pctrie_reclaim_prune().
 */
struct pctrie_node *
pctrie_reclaim_resume(struct pctrie_node **pnode)
{
	struct pctrie_node *parent, *node;

	node = *pnode;
	if (node == NULL)
		return (NULL);
	/* Climb one level up the trie. */
	parent = pctrie_node_load(&node->pn_child[0], NULL,
	    PCTRIE_UNSERIALIZED);
	pctrie_node_store(&node->pn_child[0], PCTRIE_NULL, PCTRIE_UNSERIALIZED);
	return (pctrie_reclaim_prune(pnode, parent));
}
844
/*
 * Find the trie root, and start pruning with a NULL parent.
 */
struct pctrie_node *
pctrie_reclaim_begin(struct pctrie_node **pnode,
    struct pctrie *ptree)
{
	struct pctrie_node *node;

	node = pctrie_root_load(ptree, NULL, PCTRIE_UNSERIALIZED);
	pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_UNSERIALIZED);
	/* A leaf root means there are no interior nodes to reclaim. */
	if (pctrie_isleaf(node))
		return (NULL);
	*pnode = node;
	return (pctrie_reclaim_prune(pnode, NULL));
}
861
/*
 * Replace an existing value in the trie with another one.
 * Panics if there is not an old value in the trie at the new value's index.
 */
uint64_t *
pctrie_replace(struct pctrie *ptree, uint64_t *newval)
{
	struct pctrie_node *leaf, *parent, *node;
	uint64_t *m;
	uint64_t index;
	int slot;

	leaf = pctrie_toleaf(newval);
	index = *newval;
	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
	parent = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if ((m = pctrie_toval(node)) != NULL && *m == index) {
				/*
				 * 'slot' was set by pctrie_keybarr() on the
				 * same iteration that set 'parent', so it
				 * indexes this leaf within 'parent' here.
				 */
				if (parent == NULL)
					ptree->pt_root = leaf;
				else
					pctrie_node_store(
					    &parent->pn_child[slot], leaf,
					    PCTRIE_LOCKED);
				return (m);
			}
			break;
		}
		if (pctrie_keybarr(node, index, &slot))
			break;
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	panic("%s: original replacing value not found", __func__);
}
899
#ifdef DDB
/*
 * Show details about the given node.
 */
DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
{
	struct pctrie_node *node, *tmp;
	int slot;
	pn_popmap_t popmap;

	if (!have_addr)
		return;
	node = (struct pctrie_node *)addr;
	db_printf("node %p, owner %jx, children popmap %04x, level %u:\n",
	    (void *)node, (uintmax_t)node->pn_owner, node->pn_popmap,
	    node->pn_clev / PCTRIE_WIDTH);
	/* Visit each valid child, clearing its popmap bit once printed. */
	for (popmap = node->pn_popmap; popmap != 0; popmap ^= 1 << slot) {
		slot = ffs(popmap) - 1;
		tmp = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_UNSERIALIZED);
		db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
		    slot, (void *)tmp,
		    pctrie_isleaf(tmp) ? pctrie_toval(tmp) : NULL,
		    node->pn_clev / PCTRIE_WIDTH);
	}
}
#endif /* DDB */
927