1 /*
2  * Elastic Binary Trees - macros and structures for Multi-Byte data nodes.
3  * Version 6.0.6
4  * (C) 2002-2011 - Willy Tarreau <w@1wt.eu>
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation, version 2.1
9  * exclusively.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #ifndef _EBMBTREE_H
22 #define _EBMBTREE_H
23 
24 #include <string.h>
25 #include "ebtree.h"
26 
27 /* Return the structure of type <type> whose member <member> points to <ptr> */
28 #define ebmb_entry(ptr, type, member) container_of(ptr, type, member)
29 
30 #define EBMB_ROOT	EB_ROOT
31 #define EBMB_TREE_HEAD	EB_TREE_HEAD
32 
33 /* This structure carries a node, a leaf, and a key. It must start with the
34  * eb_node so that it can be cast into an eb_node. We could also have put some
35  * sort of transparent union here to reduce the indirection level, but the fact
36  * is, the end user is not meant to manipulate internals, so this is pointless.
37  * The 'node.bit' value here works differently from scalar types, as it contains
38  * the number of identical bits between the two branches.
39  * Note that we take a great care of making sure the key is located exactly at
40  * the end of the struct even if that involves holes before it, so that it
41  * always aliases any external key a user would append after. This is why the
42  * key uses the same alignment as the struct.
43  */
44 struct ebmb_node {
45 	struct eb_node node; /* the tree node, must be at the beginning */
46 	ALWAYS_ALIGN(sizeof(void*));
47 	unsigned char key[0]; /* the key, its size depends on the application */
48 } ALIGNED(sizeof(void*));
49 
50 /*
51  * Exported functions and macros.
52  * Many of them are always inlined because they are extremely small, and
53  * are generally called at most once or twice in a program.
54  */
55 
56 /* Return leftmost node in the tree, or NULL if none */
ebmb_first(struct eb_root * root)57 static forceinline struct ebmb_node *ebmb_first(struct eb_root *root)
58 {
59 	return ebmb_entry(eb_first(root), struct ebmb_node, node);
60 }
61 
62 /* Return rightmost node in the tree, or NULL if none */
ebmb_last(struct eb_root * root)63 static forceinline struct ebmb_node *ebmb_last(struct eb_root *root)
64 {
65 	return ebmb_entry(eb_last(root), struct ebmb_node, node);
66 }
67 
68 /* Return next node in the tree, or NULL if none */
ebmb_next(struct ebmb_node * ebmb)69 static forceinline struct ebmb_node *ebmb_next(struct ebmb_node *ebmb)
70 {
71 	return ebmb_entry(eb_next(&ebmb->node), struct ebmb_node, node);
72 }
73 
74 /* Return previous node in the tree, or NULL if none */
ebmb_prev(struct ebmb_node * ebmb)75 static forceinline struct ebmb_node *ebmb_prev(struct ebmb_node *ebmb)
76 {
77 	return ebmb_entry(eb_prev(&ebmb->node), struct ebmb_node, node);
78 }
79 
80 /* Return next leaf node within a duplicate sub-tree, or NULL if none. */
ebmb_next_dup(struct ebmb_node * ebmb)81 static inline struct ebmb_node *ebmb_next_dup(struct ebmb_node *ebmb)
82 {
83 	return ebmb_entry(eb_next_dup(&ebmb->node), struct ebmb_node, node);
84 }
85 
86 /* Return previous leaf node within a duplicate sub-tree, or NULL if none. */
ebmb_prev_dup(struct ebmb_node * ebmb)87 static inline struct ebmb_node *ebmb_prev_dup(struct ebmb_node *ebmb)
88 {
89 	return ebmb_entry(eb_prev_dup(&ebmb->node), struct ebmb_node, node);
90 }
91 
92 /* Return next node in the tree, skipping duplicates, or NULL if none */
ebmb_next_unique(struct ebmb_node * ebmb)93 static forceinline struct ebmb_node *ebmb_next_unique(struct ebmb_node *ebmb)
94 {
95 	return ebmb_entry(eb_next_unique(&ebmb->node), struct ebmb_node, node);
96 }
97 
98 /* Return previous node in the tree, skipping duplicates, or NULL if none */
ebmb_prev_unique(struct ebmb_node * ebmb)99 static forceinline struct ebmb_node *ebmb_prev_unique(struct ebmb_node *ebmb)
100 {
101 	return ebmb_entry(eb_prev_unique(&ebmb->node), struct ebmb_node, node);
102 }
103 
104 /* Delete node from the tree if it was linked in. Mark the node unused. Note
105  * that this function relies on a non-inlined generic function: eb_delete.
106  */
ebmb_delete(struct ebmb_node * ebmb)107 static forceinline void ebmb_delete(struct ebmb_node *ebmb)
108 {
109 	eb_delete(&ebmb->node);
110 }
111 
112 /* The following functions are not inlined by default. They are declared
113  * in ebmbtree.c, which simply relies on their inline version.
114  */
115 struct ebmb_node *ebmb_lookup(struct eb_root *root, const void *x, unsigned int len);
116 struct ebmb_node *ebmb_insert(struct eb_root *root, struct ebmb_node *new, unsigned int len);
117 struct ebmb_node *ebmb_lookup_longest(struct eb_root *root, const void *x);
118 struct ebmb_node *ebmb_lookup_prefix(struct eb_root *root, const void *x, unsigned int pfx);
119 struct ebmb_node *ebmb_insert_prefix(struct eb_root *root, struct ebmb_node *new, unsigned int len);
120 
121 /* The following functions are less likely to be used directly, because their
122  * code is larger. The non-inlined version is preferred.
123  */
124 
125 /* Delete node from the tree if it was linked in. Mark the node unused. */
__ebmb_delete(struct ebmb_node * ebmb)126 static forceinline void __ebmb_delete(struct ebmb_node *ebmb)
127 {
128 	__eb_delete(&ebmb->node);
129 }
130 
131 /* Find the first occurrence of a key of a least <len> bytes matching <x> in the
132  * tree <root>. The caller is responsible for ensuring that <len> will not exceed
133  * the common parts between the tree's keys and <x>. In case of multiple matches,
134  * the leftmost node is returned. This means that this function can be used to
135  * lookup string keys by prefix if all keys in the tree are zero-terminated. If
136  * no match is found, NULL is returned. Returns first node if <len> is zero.
137  */
__ebmb_lookup(struct eb_root * root,const void * x,unsigned int len)138 static forceinline struct ebmb_node *__ebmb_lookup(struct eb_root *root, const void *x, unsigned int len)
139 {
140 	struct ebmb_node *node;
141 	eb_troot_t *troot;
142 	int pos, side;
143 	int node_bit;
144 
145 	troot = root->b[EB_LEFT];
146 	if (unlikely(troot == NULL))
147 		goto ret_null;
148 
149 	if (unlikely(len == 0))
150 		goto walk_down;
151 
152 	pos = 0;
153 	while (1) {
154 		if (eb_gettag(troot) == EB_LEAF) {
155 			node = container_of(eb_untag(troot, EB_LEAF),
156 					    struct ebmb_node, node.branches);
157 			if (eb_memcmp(node->key + pos, x, len) != 0)
158 				goto ret_null;
159 			else
160 				goto ret_node;
161 		}
162 		node = container_of(eb_untag(troot, EB_NODE),
163 				    struct ebmb_node, node.branches);
164 
165 		node_bit = node->node.bit;
166 		if (node_bit < 0) {
167 			/* We have a dup tree now. Either it's for the same
168 			 * value, and we walk down left, or it's a different
169 			 * one and we don't have our key.
170 			 */
171 			if (eb_memcmp(node->key + pos, x, len) != 0)
172 				goto ret_null;
173 			else
174 				goto walk_left;
175 		}
176 
177 		/* OK, normal data node, let's walk down. We check if all full
178 		 * bytes are equal, and we start from the last one we did not
179 		 * completely check. We stop as soon as we reach the last byte,
180 		 * because we must decide to go left/right or abort.
181 		 */
182 		node_bit = ~node_bit + (pos << 3) + 8; // = (pos<<3) + (7 - node_bit)
183 		if (node_bit < 0) {
184 			/* This surprising construction gives better performance
185 			 * because gcc does not try to reorder the loop. Tested to
186 			 * be fine with 2.95 to 4.2.
187 			 */
188 			while (1) {
189 				if (node->key[pos++] ^ *(unsigned char*)(x++))
190 					goto ret_null;  /* more than one full byte is different */
191 				if (--len == 0)
192 					goto walk_left; /* return first node if all bytes matched */
193 				node_bit += 8;
194 				if (node_bit >= 0)
195 					break;
196 			}
197 		}
198 
199 		/* here we know that only the last byte differs, so node_bit < 8.
200 		 * We have 2 possibilities :
201 		 *   - more than the last bit differs => return NULL
202 		 *   - walk down on side = (x[pos] >> node_bit) & 1
203 		 */
204 		side = *(unsigned char *)x >> node_bit;
205 		if (((node->key[pos] >> node_bit) ^ side) > 1)
206 			goto ret_null;
207 		side &= 1;
208 		troot = node->node.branches.b[side];
209 	}
210  walk_left:
211 	troot = node->node.branches.b[EB_LEFT];
212  walk_down:
213 	while (eb_gettag(troot) != EB_LEAF)
214 		troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT];
215 	node = container_of(eb_untag(troot, EB_LEAF),
216 			    struct ebmb_node, node.branches);
217  ret_node:
218 	return node;
219  ret_null:
220 	return NULL;
221 }
222 
223 /* Insert ebmb_node <new> into subtree starting at node root <root>.
224  * Only new->key needs be set with the key. The ebmb_node is returned.
225  * If root->b[EB_RGHT]==1, the tree may only contain unique keys. The
226  * len is specified in bytes. It is absolutely mandatory that this length
227  * is the same for all keys in the tree. This function cannot be used to
228  * insert strings.
229  */
230 static forceinline struct ebmb_node *
__ebmb_insert(struct eb_root * root,struct ebmb_node * new,unsigned int len)231 __ebmb_insert(struct eb_root *root, struct ebmb_node *new, unsigned int len)
232 {
233 	struct ebmb_node *old;
234 	unsigned int side;
235 	eb_troot_t *troot, **up_ptr;
236 	eb_troot_t *root_right;
237 	int diff;
238 	int bit;
239 	eb_troot_t *new_left, *new_rght;
240 	eb_troot_t *new_leaf;
241 	int old_node_bit;
242 
243 	side = EB_LEFT;
244 	troot = root->b[EB_LEFT];
245 	root_right = root->b[EB_RGHT];
246 	if (unlikely(troot == NULL)) {
247 		/* Tree is empty, insert the leaf part below the left branch */
248 		root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF);
249 		new->node.leaf_p = eb_dotag(root, EB_LEFT);
250 		new->node.node_p = NULL; /* node part unused */
251 		return new;
252 	}
253 
254 	/* The tree descent is fairly easy :
255 	 *  - first, check if we have reached a leaf node
256 	 *  - second, check if we have gone too far
257 	 *  - third, reiterate
258 	 * Everywhere, we use <new> for the node node we are inserting, <root>
259 	 * for the node we attach it to, and <old> for the node we are
260 	 * displacing below <new>. <troot> will always point to the future node
261 	 * (tagged with its type). <side> carries the side the node <new> is
262 	 * attached to below its parent, which is also where previous node
263 	 * was attached.
264 	 */
265 
266 	bit = 0;
267 	while (1) {
268 		if (unlikely(eb_gettag(troot) == EB_LEAF)) {
269 			/* insert above a leaf */
270 			old = container_of(eb_untag(troot, EB_LEAF),
271 					    struct ebmb_node, node.branches);
272 			new->node.node_p = old->node.leaf_p;
273 			up_ptr = &old->node.leaf_p;
274 			goto check_bit_and_break;
275 		}
276 
277 		/* OK we're walking down this link */
278 		old = container_of(eb_untag(troot, EB_NODE),
279 				   struct ebmb_node, node.branches);
280 		old_node_bit = old->node.bit;
281 
282 		if (unlikely(old->node.bit < 0)) {
283 			/* We're above a duplicate tree, so we must compare the whole value */
284 			new->node.node_p = old->node.node_p;
285 			up_ptr = &old->node.node_p;
286 		check_bit_and_break:
287 			bit = equal_bits(new->key, old->key, bit, len << 3);
288 			break;
289 		}
290 
291 		/* Stop going down when we don't have common bits anymore. We
292 		 * also stop in front of a duplicates tree because it means we
293 		 * have to insert above. Note: we can compare more bits than
294 		 * the current node's because as long as they are identical, we
295 		 * know we descend along the correct side.
296 		 */
297 
298 		bit = equal_bits(new->key, old->key, bit, old_node_bit);
299 		if (unlikely(bit < old_node_bit)) {
300 			/* The tree did not contain the key, so we insert <new> before the
301 			 * node <old>, and set ->bit to designate the lowest bit position in
302 			 * <new> which applies to ->branches.b[].
303 			 */
304 			new->node.node_p = old->node.node_p;
305 			up_ptr = &old->node.node_p;
306 			break;
307 		}
308 		/* we don't want to skip bits for further comparisons, so we must limit <bit>.
309 		 * However, since we're going down around <old_node_bit>, we know it will be
310 		 * properly matched, so we can skip this bit.
311 		 */
312 		bit = old_node_bit + 1;
313 
314 		/* walk down */
315 		root = &old->node.branches;
316 		side = old_node_bit & 7;
317 		side ^= 7;
318 		side = (new->key[old_node_bit >> 3] >> side) & 1;
319 		troot = root->b[side];
320 	}
321 
322 	new_left = eb_dotag(&new->node.branches, EB_LEFT);
323 	new_rght = eb_dotag(&new->node.branches, EB_RGHT);
324 	new_leaf = eb_dotag(&new->node.branches, EB_LEAF);
325 
326 	new->node.bit = bit;
327 
328 	/* Note: we can compare more bits than the current node's because as
329 	 * long as they are identical, we know we descend along the correct
330 	 * side. However we don't want to start to compare past the end.
331 	 */
332 	diff = 0;
333 	if (((unsigned)bit >> 3) < len)
334 		diff = cmp_bits(new->key, old->key, bit);
335 
336 	if (diff == 0) {
337 		new->node.bit = -1; /* mark as new dup tree, just in case */
338 
339 		if (likely(eb_gettag(root_right))) {
340 			/* we refuse to duplicate this key if the tree is
341 			 * tagged as containing only unique keys.
342 			 */
343 			return old;
344 		}
345 
346 		if (eb_gettag(troot) != EB_LEAF) {
347 			/* there was already a dup tree below */
348 			struct eb_node *ret;
349 			ret = eb_insert_dup(&old->node, &new->node);
350 			return container_of(ret, struct ebmb_node, node);
351 		}
352 		/* otherwise fall through */
353 	}
354 
355 	if (diff >= 0) {
356 		new->node.branches.b[EB_LEFT] = troot;
357 		new->node.branches.b[EB_RGHT] = new_leaf;
358 		new->node.leaf_p = new_rght;
359 		*up_ptr = new_left;
360 	}
361 	else if (diff < 0) {
362 		new->node.branches.b[EB_LEFT] = new_leaf;
363 		new->node.branches.b[EB_RGHT] = troot;
364 		new->node.leaf_p = new_left;
365 		*up_ptr = new_rght;
366 	}
367 
368 	/* Ok, now we are inserting <new> between <root> and <old>. <old>'s
369 	 * parent is already set to <new>, and the <root>'s branch is still in
370 	 * <side>. Update the root's leaf till we have it. Note that we can also
371 	 * find the side by checking the side of new->node.node_p.
372 	 */
373 
374 	root->b[side] = eb_dotag(&new->node.branches, EB_NODE);
375 	return new;
376 }
377 
378 
379 /* Find the first occurrence of the longest prefix matching a key <x> in the
380  * tree <root>. It's the caller's responsibility to ensure that key <x> is at
381  * least as long as the keys in the tree. Note that this can be ensured by
382  * having a byte at the end of <x> which cannot be part of any prefix, typically
383  * the trailing zero for a string. If none can be found, return NULL.
384  */
__ebmb_lookup_longest(struct eb_root * root,const void * x)385 static forceinline struct ebmb_node *__ebmb_lookup_longest(struct eb_root *root, const void *x)
386 {
387 	struct ebmb_node *node;
388 	eb_troot_t *troot, *cover;
389 	int pos, side;
390 	int node_bit;
391 
392 	troot = root->b[EB_LEFT];
393 	if (unlikely(troot == NULL))
394 		return NULL;
395 
396 	cover = NULL;
397 	pos = 0;
398 	while (1) {
399 		if ((eb_gettag(troot) == EB_LEAF)) {
400 			node = container_of(eb_untag(troot, EB_LEAF),
401 					    struct ebmb_node, node.branches);
402 			if (check_bits(x - pos, node->key, pos, node->node.pfx))
403 				goto not_found;
404 
405 			return node;
406 		}
407 		node = container_of(eb_untag(troot, EB_NODE),
408 				    struct ebmb_node, node.branches);
409 
410 		node_bit = node->node.bit;
411 		if (node_bit < 0) {
412 			/* We have a dup tree now. Either it's for the same
413 			 * value, and we walk down left, or it's a different
414 			 * one and we don't have our key.
415 			 */
416 			if (check_bits(x - pos, node->key, pos, node->node.pfx))
417 				goto not_found;
418 
419 			troot = node->node.branches.b[EB_LEFT];
420 			while (eb_gettag(troot) != EB_LEAF)
421 				troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT];
422 			node = container_of(eb_untag(troot, EB_LEAF),
423 					    struct ebmb_node, node.branches);
424 			return node;
425 		}
426 
427 		node_bit >>= 1; /* strip cover bit */
428 		node_bit = ~node_bit + (pos << 3) + 8; // = (pos<<3) + (7 - node_bit)
429 		if (node_bit < 0) {
430 			/* This uncommon construction gives better performance
431 			 * because gcc does not try to reorder the loop. Tested to
432 			 * be fine with 2.95 to 4.2.
433 			 */
434 			while (1) {
435 				x++; pos++;
436 				if (node->key[pos-1] ^ *(unsigned char*)(x-1))
437 					goto not_found; /* more than one full byte is different */
438 				node_bit += 8;
439 				if (node_bit >= 0)
440 					break;
441 			}
442 		}
443 
444 		/* here we know that only the last byte differs, so 0 <= node_bit <= 7.
445 		 * We have 2 possibilities :
446 		 *   - more than the last bit differs => data does not match
447 		 *   - walk down on side = (x[pos] >> node_bit) & 1
448 		 */
449 		side = *(unsigned char *)x >> node_bit;
450 		if (((node->key[pos] >> node_bit) ^ side) > 1)
451 			goto not_found;
452 
453 		if (!(node->node.bit & 1)) {
454 			/* This is a cover node, let's keep a reference to it
455 			 * for later. The covering subtree is on the left, and
456 			 * the covered subtree is on the right, so we have to
457 			 * walk down right.
458 			 */
459 			cover = node->node.branches.b[EB_LEFT];
460 			troot = node->node.branches.b[EB_RGHT];
461 			continue;
462 		}
463 		side &= 1;
464 		troot = node->node.branches.b[side];
465 	}
466 
467  not_found:
468 	/* Walk down last cover tree if it exists. It does not matter if cover is NULL */
469 	return ebmb_entry(eb_walk_down(cover, EB_LEFT), struct ebmb_node, node);
470 }
471 
472 
473 /* Find the first occurrence of a prefix matching a key <x> of <pfx> BITS in the
474  * tree <root>. It's the caller's responsibility to ensure that key <x> is at
475  * least as long as the keys in the tree. Note that this can be ensured by
476  * having a byte at the end of <x> which cannot be part of any prefix, typically
477  * the trailing zero for a string. If none can be found, return NULL.
478  */
__ebmb_lookup_prefix(struct eb_root * root,const void * x,unsigned int pfx)479 static forceinline struct ebmb_node *__ebmb_lookup_prefix(struct eb_root *root, const void *x, unsigned int pfx)
480 {
481 	struct ebmb_node *node;
482 	eb_troot_t *troot;
483 	int pos, side;
484 	int node_bit;
485 
486 	troot = root->b[EB_LEFT];
487 	if (unlikely(troot == NULL))
488 		return NULL;
489 
490 	pos = 0;
491 	while (1) {
492 		if ((eb_gettag(troot) == EB_LEAF)) {
493 			node = container_of(eb_untag(troot, EB_LEAF),
494 					    struct ebmb_node, node.branches);
495 			if (node->node.pfx != pfx)
496 				return NULL;
497 			if (check_bits(x - pos, node->key, pos, node->node.pfx))
498 				return NULL;
499 			return node;
500 		}
501 		node = container_of(eb_untag(troot, EB_NODE),
502 				    struct ebmb_node, node.branches);
503 
504 		node_bit = node->node.bit;
505 		if (node_bit < 0) {
506 			/* We have a dup tree now. Either it's for the same
507 			 * value, and we walk down left, or it's a different
508 			 * one and we don't have our key.
509 			 */
510 			if (node->node.pfx != pfx)
511 				return NULL;
512 			if (check_bits(x - pos, node->key, pos, node->node.pfx))
513 				return NULL;
514 
515 			troot = node->node.branches.b[EB_LEFT];
516 			while (eb_gettag(troot) != EB_LEAF)
517 				troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT];
518 			node = container_of(eb_untag(troot, EB_LEAF),
519 					    struct ebmb_node, node.branches);
520 			return node;
521 		}
522 
523 		node_bit >>= 1; /* strip cover bit */
524 		node_bit = ~node_bit + (pos << 3) + 8; // = (pos<<3) + (7 - node_bit)
525 		if (node_bit < 0) {
526 			/* This uncommon construction gives better performance
527 			 * because gcc does not try to reorder the loop. Tested to
528 			 * be fine with 2.95 to 4.2.
529 			 */
530 			while (1) {
531 				x++; pos++;
532 				if (node->key[pos-1] ^ *(unsigned char*)(x-1))
533 					return NULL; /* more than one full byte is different */
534 				node_bit += 8;
535 				if (node_bit >= 0)
536 					break;
537 			}
538 		}
539 
540 		/* here we know that only the last byte differs, so 0 <= node_bit <= 7.
541 		 * We have 2 possibilities :
542 		 *   - more than the last bit differs => data does not match
543 		 *   - walk down on side = (x[pos] >> node_bit) & 1
544 		 */
545 		side = *(unsigned char *)x >> node_bit;
546 		if (((node->key[pos] >> node_bit) ^ side) > 1)
547 			return NULL;
548 
549 		if (!(node->node.bit & 1)) {
550 			/* This is a cover node, it may be the entry we're
551 			 * looking for. We already know that it matches all the
552 			 * bits, let's compare prefixes and descend the cover
553 			 * subtree if they match.
554 			 */
555 			if ((unsigned short)node->node.bit >> 1 == pfx)
556 				troot = node->node.branches.b[EB_LEFT];
557 			else
558 				troot = node->node.branches.b[EB_RGHT];
559 			continue;
560 		}
561 		side &= 1;
562 		troot = node->node.branches.b[side];
563 	}
564 }
565 
566 
567 /* Insert ebmb_node <new> into a prefix subtree starting at node root <root>.
568  * Only new->key and new->pfx need be set with the key and its prefix length.
569  * Note that bits between <pfx> and <len> are theoretically ignored and should be
570  * zero, as it is not certain yet that they will always be ignored everywhere
571  * (eg in bit compare functions).
572  * The ebmb_node is returned.
573  * If root->b[EB_RGHT]==1, the tree may only contain unique keys. The
574  * len is specified in bytes.
575  */
576 static forceinline struct ebmb_node *
__ebmb_insert_prefix(struct eb_root * root,struct ebmb_node * new,unsigned int len)577 __ebmb_insert_prefix(struct eb_root *root, struct ebmb_node *new, unsigned int len)
578 {
579 	struct ebmb_node *old;
580 	unsigned int side;
581 	eb_troot_t *troot, **up_ptr;
582 	eb_troot_t *root_right;
583 	int diff;
584 	int bit;
585 	eb_troot_t *new_left, *new_rght;
586 	eb_troot_t *new_leaf;
587 	int old_node_bit;
588 
589 	side = EB_LEFT;
590 	troot = root->b[EB_LEFT];
591 	root_right = root->b[EB_RGHT];
592 	if (unlikely(troot == NULL)) {
593 		/* Tree is empty, insert the leaf part below the left branch */
594 		root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF);
595 		new->node.leaf_p = eb_dotag(root, EB_LEFT);
596 		new->node.node_p = NULL; /* node part unused */
597 		return new;
598 	}
599 
600 	len <<= 3;
601 	if (len > new->node.pfx)
602 		len = new->node.pfx;
603 
604 	/* The tree descent is fairly easy :
605 	 *  - first, check if we have reached a leaf node
606 	 *  - second, check if we have gone too far
607 	 *  - third, reiterate
608 	 * Everywhere, we use <new> for the node node we are inserting, <root>
609 	 * for the node we attach it to, and <old> for the node we are
610 	 * displacing below <new>. <troot> will always point to the future node
611 	 * (tagged with its type). <side> carries the side the node <new> is
612 	 * attached to below its parent, which is also where previous node
613 	 * was attached.
614 	 */
615 
616 	bit = 0;
617 	while (1) {
618 		if (unlikely(eb_gettag(troot) == EB_LEAF)) {
619 			/* Insert above a leaf. Note that this leaf could very
620 			 * well be part of a cover node.
621 			 */
622 			old = container_of(eb_untag(troot, EB_LEAF),
623 					    struct ebmb_node, node.branches);
624 			new->node.node_p = old->node.leaf_p;
625 			up_ptr = &old->node.leaf_p;
626 			goto check_bit_and_break;
627 		}
628 
629 		/* OK we're walking down this link */
630 		old = container_of(eb_untag(troot, EB_NODE),
631 				   struct ebmb_node, node.branches);
632 		old_node_bit = old->node.bit;
633 		/* Note that old_node_bit can be :
634 		 *   < 0    : dup tree
635 		 *   = 2N   : cover node for N bits
636 		 *   = 2N+1 : normal node at N bits
637 		 */
638 
639 		if (unlikely(old_node_bit < 0)) {
640 			/* We're above a duplicate tree, so we must compare the whole value */
641 			new->node.node_p = old->node.node_p;
642 			up_ptr = &old->node.node_p;
643 		check_bit_and_break:
644 			/* No need to compare everything if the leaves are shorter than the new one. */
645 			if (len > old->node.pfx)
646 				len = old->node.pfx;
647 			bit = equal_bits(new->key, old->key, bit, len);
648 			break;
649 		}
650 
651 		/* WARNING: for the two blocks below, <bit> is counted in half-bits */
652 
653 		bit = equal_bits(new->key, old->key, bit, old_node_bit >> 1);
654 		bit = (bit << 1) + 1; // assume comparisons with normal nodes
655 
656 		/* we must always check that our prefix is larger than the nodes
657 		 * we visit, otherwise we have to stop going down. The following
658 		 * test is able to stop before both normal and cover nodes.
659 		 */
660 		if (bit >= (new->node.pfx << 1) && (new->node.pfx << 1) < old_node_bit) {
661 			/* insert cover node here on the left */
662 			new->node.node_p = old->node.node_p;
663 			up_ptr = &old->node.node_p;
664 			new->node.bit = new->node.pfx << 1;
665 			diff = -1;
666 			goto insert_above;
667 		}
668 
669 		if (unlikely(bit < old_node_bit)) {
670 			/* The tree did not contain the key, so we insert <new> before the
671 			 * node <old>, and set ->bit to designate the lowest bit position in
672 			 * <new> which applies to ->branches.b[]. We know that the bit is not
673 			 * greater than the prefix length thanks to the test above.
674 			 */
675 			new->node.node_p = old->node.node_p;
676 			up_ptr = &old->node.node_p;
677 			new->node.bit = bit;
678 			diff = cmp_bits(new->key, old->key, bit >> 1);
679 			goto insert_above;
680 		}
681 
682 		if (!(old_node_bit & 1)) {
683 			/* if we encounter a cover node with our exact prefix length, it's
684 			 * necessarily the same value, so we insert there as a duplicate on
685 			 * the left. For that, we go down on the left and the leaf detection
686 			 * code will finish the job.
687 			 */
688 			if ((new->node.pfx << 1) == old_node_bit) {
689 				root = &old->node.branches;
690 				side = EB_LEFT;
691 				troot = root->b[side];
692 				continue;
693 			}
694 
695 			/* cover nodes are always walked through on the right */
696 			side = EB_RGHT;
697 			bit = old_node_bit >> 1; /* recheck that bit */
698 			root = &old->node.branches;
699 			troot = root->b[side];
700 			continue;
701 		}
702 
703 		/* we don't want to skip bits for further comparisons, so we must limit <bit>.
704 		 * However, since we're going down around <old_node_bit>, we know it will be
705 		 * properly matched, so we can skip this bit.
706 		 */
707 		old_node_bit >>= 1;
708 		bit = old_node_bit + 1;
709 
710 		/* walk down */
711 		root = &old->node.branches;
712 		side = old_node_bit & 7;
713 		side ^= 7;
714 		side = (new->key[old_node_bit >> 3] >> side) & 1;
715 		troot = root->b[side];
716 	}
717 
718 	/* Right here, we have 4 possibilities :
719 	 * - the tree does not contain any leaf matching the
720 	 *   key, and we have new->key < old->key. We insert
721 	 *   new above old, on the left ;
722 	 *
723 	 * - the tree does not contain any leaf matching the
724 	 *   key, and we have new->key > old->key. We insert
725 	 *   new above old, on the right ;
726 	 *
727 	 * - the tree does contain the key with the same prefix
728 	 *   length. We add the new key next to it as a first
729 	 *   duplicate (since it was alone).
730 	 *
731 	 * The last two cases can easily be partially merged.
732 	 *
733 	 * - the tree contains a leaf matching the key, we have
734 	 *   to insert above it as a cover node. The leaf with
735 	 *   the shortest prefix becomes the left subtree and
736 	 *   the leaf with the longest prefix becomes the right
737 	 *   one. The cover node gets the min of both prefixes
738 	 *   as its new bit.
739 	 */
740 
741 	/* first we want to ensure that we compare the correct bit, which means
742 	 * the largest common to both nodes.
743 	 */
744 	if (bit > new->node.pfx)
745 		bit = new->node.pfx;
746 	if (bit > old->node.pfx)
747 		bit = old->node.pfx;
748 
749 	new->node.bit = (bit << 1) + 1; /* assume normal node by default */
750 
751 	/* if one prefix is included in the second one, we don't compare bits
752 	 * because they won't necessarily match, we just proceed with a cover
753 	 * node insertion.
754 	 */
755 	diff = 0;
756 	if (bit < old->node.pfx && bit < new->node.pfx)
757 		diff = cmp_bits(new->key, old->key, bit);
758 
759 	if (diff == 0) {
760 		/* Both keys match. Either it's a duplicate entry or we have to
761 		 * put the shortest prefix left and the largest one right below
762 		 * a new cover node. By default, diff==0 means we'll be inserted
763 		 * on the right.
764 		 */
765 		new->node.bit--; /* anticipate cover node insertion */
766 		if (new->node.pfx == old->node.pfx) {
767 			new->node.bit = -1; /* mark as new dup tree, just in case */
768 
769 			if (unlikely(eb_gettag(root_right))) {
770 				/* we refuse to duplicate this key if the tree is
771 				 * tagged as containing only unique keys.
772 				 */
773 				return old;
774 			}
775 
776 			if (eb_gettag(troot) != EB_LEAF) {
777 				/* there was already a dup tree below */
778 				struct eb_node *ret;
779 				ret = eb_insert_dup(&old->node, &new->node);
780 				return container_of(ret, struct ebmb_node, node);
781 			}
782 			/* otherwise fall through to insert first duplicate */
783 		}
784 		/* otherwise we just rely on the tests below to select the right side */
785 		else if (new->node.pfx < old->node.pfx)
786 			diff = -1; /* force insertion to left side */
787 	}
788 
789  insert_above:
790 	new_left = eb_dotag(&new->node.branches, EB_LEFT);
791 	new_rght = eb_dotag(&new->node.branches, EB_RGHT);
792 	new_leaf = eb_dotag(&new->node.branches, EB_LEAF);
793 
794 	if (diff >= 0) {
795 		new->node.branches.b[EB_LEFT] = troot;
796 		new->node.branches.b[EB_RGHT] = new_leaf;
797 		new->node.leaf_p = new_rght;
798 		*up_ptr = new_left;
799 	}
800 	else {
801 		new->node.branches.b[EB_LEFT] = new_leaf;
802 		new->node.branches.b[EB_RGHT] = troot;
803 		new->node.leaf_p = new_left;
804 		*up_ptr = new_rght;
805 	}
806 
807 	root->b[side] = eb_dotag(&new->node.branches, EB_NODE);
808 	return new;
809 }
810 
811 
812 
813 #endif /* _EBMBTREE_H */
814 
815