/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012-2024 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
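
/*
 * Worked example with the default DXR_TRIE_BITS of 20: for the
 * destination 192.0.2.1 (0xc0000201), the upper 20 bits (0xc0000)
 * select the chunk via the two-stage trie, and the remaining low
 * 12 bits (0x201) are the key for the binary search among that
 * chunk's range fragments.
 */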

#include <sys/cdefs.h>
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define	DXR_TRIE_BITS		20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

#if DXR_TRIE_BITS > 16
#define	DXR_D			16
#else
#define	DXR_D			(DXR_TRIE_BITS - 1)
#endif

#define	D_TBL_SIZE		(1 << DXR_D)
#define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
#define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
#define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)

#define	DESC_BASE_BITS		22
#define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
#define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
#define	RTBL_SIZE_INCR		(BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define	FRAGS_MASK_SHORT	0
#endif
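
/*
 * Encoding of the direct_entry.fragments field: a value with all the
 * FRAGS_PREF_SHORT prefix bits set denotes a chunk of 16-bit (short)
 * range entries, with the entry-pair count in the low bits;
 * FRAGS_MARK_HIT means the chunk resolves to a single next hop stored
 * directly in the base field; FRAGS_MARK_XL means the fragment count
 * did not fit in the field and is stored in the first 32-bit word of
 * the chunk itself.
 */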
#define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
				 ~FRAGS_MASK_SHORT)
#define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
#define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)

#define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)

#define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)

#define	CHUNK_HASH_BITS		16
#define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
#define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)

#define	TRIE_HASH_BITS		16
#define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
#define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)

#define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)

#define	UNUSED_BUCKETS		8
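
/*
 * Unused range chunks are kept on free lists bucketed by size;
 * bucket 0 collects all chunks of UNUSED_BUCKETS entries or more.
 */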

/* Lookup structure elements */

struct direct_entry {
	uint32_t		fragments: DESC_FRAGMENTS_BITS,
				base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t		start: DXR_RANGE_SHIFT,
				nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t		start: DXR_RANGE_SHIFT - 8,
				nexthop: DXR_TRIE_BITS - 8;
};
#endif

/* Auxiliary structures */

struct heap_entry {
	uint32_t		start;
	uint32_t		end;
	uint32_t		preflen;
	uint32_t		nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc)	cd_all_le;
	LIST_ENTRY(chunk_desc)	cd_hash_le;
	uint32_t		cd_hash;
	uint32_t		cd_refcnt;
	uint32_t		cd_base;
	uint32_t		cd_cur_size;
	uint32_t		cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc)	td_all_le;
	LIST_ENTRY(trie_desc)	td_hash_le;
	uint32_t		td_hash;
	uint32_t		td_index;
	uint32_t		td_refcnt;
};

struct dxr_aux {
	/* Glue to external state */
	struct fib_data		*fd;
	uint32_t		fibnum;
	int			refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
	uint16_t		d_tbl[D_TBL_SIZE];
	struct direct_entry	*x_tbl;
	union {
		struct range_entry_long	re;
		uint32_t		fragments;
	}			*range_tbl;

	/* Auxiliary internal state */
	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc	*trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc)	all_chunks;
	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc)	all_trie;
	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
	struct sockaddr_in	dst;
	struct sockaddr_in	mask;
	struct heap_entry	heap[33];
	uint32_t		prefixes;
	uint32_t		updates_low;
	uint32_t		updates_high;
	uint32_t		unused_chunks_size;
	uint32_t		xtbl_size;
	uint32_t		all_trie_cnt;
	uint32_t		unused_trie_cnt;
	uint32_t		trie_rebuilt_prefixes;
	uint32_t		heap_index;
	uint32_t		d_bits;
	uint32_t		rtbl_size;
	uint32_t		rtbl_top;
	uint32_t		rtbl_work_frags;
	uint32_t		work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	void			*d;
	void			*x;
	void			*r;
	struct nhop_object	**nh_tbl;

	/* Glue to external state */
	struct dxr_aux		*aux;
	struct fib_data		*fd;
	struct epoch_context	epoch_ctx;
	uint32_t		fibnum;
	uint16_t		d_shift;
	uint16_t		x_shift;
	uint32_t		x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define	V_frag_limit	VNET(frag_limit)
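
/* frag_limit is expressed in hundredths of a percent, i.e. 100 = 1.00% */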

/* Binary search for a matching address range */
#define	DXR_LOOKUP_STAGE					\
	if (masked_dst < range[middle].start) {			\
		upperbound = middle;				\
		middle = (middle + lowerbound) / 2;		\
	} else if (masked_dst < range[middle + 1].start)	\
		return (range[middle].nexthop);			\
	else {							\
		lowerbound = middle + 1;			\
		middle = (upperbound + middle + 1) / 2;		\
	}							\
	if (upperbound == lowerbound)				\
		return (range[lowerbound].nexthop);
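
/*
 * Each DXR_LOOKUP_STAGE instance performs a single bisection step;
 * range_lookup() below expands it twice per loop iteration to unroll
 * the search.  The search terminates either on an exact interval hit
 * or once the two bounds converge.
 */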

static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

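		/*
		 * Short-format entries are half the size of long ones,
		 * so the count stored in the descriptor refers to entry
		 * pairs (the chunk is padded to an even entry count);
		 * * 2 + 1 below yields the last valid index.
		 */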
		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
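	/*
	 * In the XL format the fragment count is too large to fit in
	 * the descriptor, so it is stored in the first 32-bit word of
	 * the chunk itself, which the search must skip over.
	 */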
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}

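/*
 * Expand one specialized lookup path per supported value of d_bits, so
 * that the shift and mask amounts below become compile-time constants
 * on the hot path; dxr_lookup() further down is the generic fallback
 * parameterized at run time.
 */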
#define	DXR_LOOKUP_DEFINE(D)						\
	static int inline						\
	dxr_lookup_##D(struct dxr *dxr, uint32_t dst)			\
	{								\
		struct direct_entry de;					\
		uint16_t *dt = dxr->d;					\
		struct direct_entry *xt = dxr->x;			\
									\
		de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D)))\
		    + ((dst >> DXR_RANGE_SHIFT) &			\
		    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];	\
		if (__predict_true(de.fragments == FRAGS_MARK_HIT))	\
			return (de.base);				\
		return (range_lookup(dxr->r, de, dst));			\
	}								\
									\
	static struct nhop_object *					\
	dxr_fib_lookup_##D(void *algo_data,				\
	    const struct flm_lookup_key key, uint32_t scopeid __unused)\
	{								\
		struct dxr *dxr = algo_data;				\
									\
		return (dxr->nh_tbl[dxr_lookup_##D(dxr,			\
		    ntohl(key.addr4.s_addr))]);				\
	}

#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)

static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
	struct direct_entry de;
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;

	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);
	return (range_lookup(dxr->r, de, dst));
}

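/*
 * The heap[] array holds the chain of prefixes nesting around the
 * address currently being processed, ordered by ascending prefix
 * length.  initheap() primes heap[0] from the best match for the given
 * address, or with a zeroed catch-all entry when the lookup fails.
 */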
static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}

static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found. Find an empty one to recycle. */
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_size -= cdp->cd_max_size;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->unused_chunks_size += empty_cdp->cd_max_size;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
		    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return (1);
		}
	}

	return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	MPASS(cdp != NULL);
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_size += cdp->cd_max_size;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		MPASS(cdp == LIST_FIRST(&da->all_chunks));
		da->rtbl_top -= cdp->cd_max_size;
		da->unused_chunks_size -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return (tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	    td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return (-1);
		}
	}
	return (tp->td_index);
}

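/*
 * Drop a reference to the trie chunk at the given index.  The x_tbl
 * array is kept dense: only the descriptor with the highest td_index
 * may be freed outright; any other unreferenced descriptor is parked
 * on the unused_trie list (reusing its hash linkage) until the tail
 * of the array reaches it.
 */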
static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}

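/*
 * Insert a prefix into the heap, keeping the entries sorted by
 * ascending prefix length; an entry with a duplicate prefix length is
 * already on the heap and is silently ignored.
 */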
static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}

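/*
 * rib_walk_from() callback: visits routes in ascending address order
 * and emits range fragments for the current chunk, draining prefixes
 * off the heap as their covered intervals end, so that consecutive
 * intervals resolving to the same next hop collapse into a single
 * fragment.
 */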
static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1);	/* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0);	/* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}

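/*
 * Rebuild the range chunk covering a single direct-table slot: walk
 * all routes falling inside the chunk, then pick the most compact of
 * the direct-hit, short, long and XL encodings for the result.
 */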
static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		    &da->range_tbl[da->rtbl_top],
		    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}

static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
	uint32_t range_frag;
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	int32_t ti;	/* must be signed: trie_ref() returns -1 on failure */
	uint32_t trie_rebuild = 0, prev_size = 0;
	uint32_t trie_frag;

	MPASS(dxr->d == NULL);

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL) {
			FIB_PRINTF(LOG_NOTICE, dxr->fd,
			    "Unable to allocate DXR aux struct");
			return;
		}
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->fd = dxr->fd;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return;
		}
		range_rebuild = 1;
	}
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return;
		}
		trie_rebuild = 1;
	}

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high)
		range_rebuild = 1;

range_build:
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->unused_chunks_size = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}

	range_frag = 0;
	if (da->rtbl_top)
		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
	if (range_frag > V_frag_limit) {
		range_rebuild = 1;
		goto range_build;
	}

	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

	if (range_rebuild ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;

trie_build:
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		if (!range_rebuild)
			memset(da->updates_mask, 0xff,
			    sizeof(da->updates_mask));
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		if (ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	trie_frag = 0;
	if (da->all_trie_cnt)
		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
	if (trie_frag > V_frag_limit) {
		trie_rebuild = 1;
		goto trie_build;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

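	/*
	 * The D/X split search below decrements d_bits while the total
	 * lookup structure keeps shrinking, then steps back once when
	 * it starts to grow again; trie_rebuild == 2 marks the final
	 * pass.
	 */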
	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL) {
		FIB_PRINTF(LOG_NOTICE, da->fd,
		    "Unable to allocate DXR lookup table");
		return;
	}
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
	FIB_PRINTF(LOG_INFO, da->fd,
	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
	    trie_frag / 100, trie_frag % 100,
	    range_frag / 100, range_frag % 100);
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
}

/*
 * Glue functions for attaching to the FIB_ALGO infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;

	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL) {
		FIB_PRINTF(LOG_NOTICE, fd,
		    "Unable to allocate DXR container struct");
		return (FLM_REBUILD);
	}

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL &&
	    old_dxr->aux->fd == fd) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	free(dxr->d, M_DXRLPM);
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

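/*
 * Return the lookup function specialized for the current d_bits value,
 * falling back to the generic run-time parameterized variant.
 */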
static void *
choose_lookup_fn(struct dxr_aux *da)
{

	switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
	case 16:
		return (dxr_fib_lookup_16);
#endif
	case 15:
		return (dxr_fib_lookup_15);
	case 14:
		return (dxr_fib_lookup_14);
	case 13:
		return (dxr_fib_lookup_13);
	case 12:
		return (dxr_fib_lookup_12);
	case 11:
		return (dxr_fib_lookup_11);
	case 10:
		return (dxr_fib_lookup_10);
	case 9:
		return (dxr_fib_lookup_9);
	}
	return (dxr_fib_lookup);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL || dxr->d == NULL)
		return (FLM_REBUILD);

	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	dp->f = choose_lookup_fn(da);
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}

static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	MPASS(data != NULL);
	MPASS(q != NULL);
	MPASS(q->count < q->size);

	da = dxr->aux;
	MPASS(da != NULL);
	MPASS(da->fd == dxr->fd);
	MPASS(da->refcnt > 0);

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

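		/*
		 * Mark all direct-table chunks spanned by this prefix
		 * as dirty in updates_mask, so that the subsequent
		 * dxr_build() only rebuilds the affected chunks.
		 */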
		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX) {
		dxr_destroy(new_dxr);
		return (FLM_ERROR);
	}

	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = choose_lookup_fn(da);
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
	return (251);
}

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

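/*
 * Parse and report the fragmentation threshold as a percentage with up
 * to two decimal places, stored internally in hundredths of a percent
 * (e.g. "1.50%" corresponds to 150).
 */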
static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
	char buf[8];
	int error, new, i;

	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (!isdigit(*buf) && *buf != '.')
		return (EINVAL);
	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
		new = new * 10 + buf[i] - '0';
	new *= 100;
	if (buf[i++] == '.') {
		if (!isdigit(buf[i]))
			return (EINVAL);
		new += (buf[i++] - '0') * 10;
		if (isdigit(buf[i]))
			new += buf[i++] - '0';
	}
	if (new > 1000)
		return (EINVAL);
	V_frag_limit = new;
	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	return (0);
}

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");

static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return (0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);