/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012-2022 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure. More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
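/*
 * A worked example, assuming the default DXR_TRIE_BITS == 20: for the
 * key 198.51.100.1 (0xc6336401), the upper 20 bits (0xc6336) index the
 * trie, yielding a direct_entry; unless that entry is a direct hit, the
 * low 12 bits (0x401) are then matched against the sorted "start"
 * fields of the referenced range fragments to pick the nexthop index.
 */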

#include <sys/cdefs.h>
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define DXR_TRIE_BITS 20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

/* DXR2: two-stage primary trie, instead of a single direct lookup table */
#define DXR2

#if DXR_TRIE_BITS > 16
#define DXR_D 16
#else
#define DXR_D (DXR_TRIE_BITS - 1)
#endif

#define D_TBL_SIZE (1 << DXR_D)
#define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS)
#define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS)
#define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS)

#define DESC_BASE_BITS 22
#define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS)
#define BASE_MAX ((1 << DESC_BASE_BITS) - 1)
#define RTBL_SIZE_INCR (BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define FRAGS_MASK_SHORT 0
#endif
#define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \
    ~FRAGS_MASK_SHORT)
#define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1)
#define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2)

#define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL)

#define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1)
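
/*
 * The "fragments" field of a direct_entry distinguishes four cases:
 * a count (less one) of long-format fragments, a short-format count
 * tagged with the FRAGS_PREF_SHORT prefix, FRAGS_MARK_XL when the
 * count does not fit in the descriptor (it is then stored in the first
 * range_tbl element of the chunk), or FRAGS_MARK_HIT when the whole
 * chunk maps to a single nexthop and "base" holds that nexthop index
 * directly.
 */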

#define CHUNK_HASH_BITS 16
#define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS)
#define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1)

#define TRIE_HASH_BITS 16
#define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS)
#define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1)

#define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16)

#define UNUSED_BUCKETS 8

/* Lookup structure elements */

struct direct_entry {
	uint32_t	fragments: DESC_FRAGMENTS_BITS,
			base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t	start: DXR_RANGE_SHIFT,
			nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t	start: DXR_RANGE_SHIFT - 8,
			nexthop: DXR_TRIE_BITS - 8;
};
#endif
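
/*
 * Two short-format entries fit in the space of one long-format entry:
 * the range start sheds its low 8 bits and the nexthop field narrows
 * by 8 bits, so a chunk is eligible for the short encoding only when
 * every fragment begins on a 256-address boundary and all nexthop
 * indices fit in RE_SHORT_MAX_NH (see update_chunk()).
 */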

/* Auxiliary structures */

struct heap_entry {
	uint32_t start;
	uint32_t end;
	uint32_t preflen;
	uint32_t nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc) cd_all_le;
	LIST_ENTRY(chunk_desc) cd_hash_le;
	uint32_t cd_hash;
	uint32_t cd_refcnt;
	uint32_t cd_base;
	uint32_t cd_cur_size;
	uint32_t cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc) td_all_le;
	LIST_ENTRY(trie_desc) td_hash_le;
	uint32_t td_hash;
	uint32_t td_index;
	uint32_t td_refcnt;
};

struct dxr_aux {
	/* Glue to external state */
	struct fib_data *fd;
	uint32_t fibnum;
	int refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry direct_tbl[DIRECT_TBL_SIZE];
	uint16_t d_tbl[D_TBL_SIZE];
	struct direct_entry *x_tbl;
	union {
		struct range_entry_long re;
		uint32_t fragments;
	} *range_tbl;

	/* Auxiliary internal state */
	uint32_t updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc *trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc) all_chunks;
	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc) trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc) all_trie;
	LIST_HEAD(, trie_desc) unused_trie; /* abuses hash link entry */
	struct sockaddr_in dst;
	struct sockaddr_in mask;
	struct heap_entry heap[33];
	uint32_t prefixes;
	uint32_t updates_low;
	uint32_t updates_high;
	uint32_t unused_chunks_size;
	uint32_t xtbl_size;
	uint32_t all_trie_cnt;
	uint32_t unused_trie_cnt;
	uint32_t trie_rebuilt_prefixes;
	uint32_t heap_index;
	uint32_t d_bits;
	uint32_t rtbl_size;
	uint32_t rtbl_top;
	uint32_t rtbl_work_frags;
	uint32_t work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	void *d;
	void *x;
	void *r;
	struct nhop_object **nh_tbl;

	/* Glue to external state */
	struct dxr_aux *aux;
	struct fib_data *fd;
	struct epoch_context epoch_ctx;
	uint32_t fibnum;
	uint16_t d_shift;
	uint16_t x_shift;
	uint32_t x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define V_frag_limit VNET(frag_limit)

/* Binary search for a matching address range */
#define DXR_LOOKUP_STAGE \
	if (masked_dst < range[middle].start) { \
		upperbound = middle; \
		middle = (middle + lowerbound) / 2; \
	} else if (masked_dst < range[middle + 1].start) \
		return (range[middle].nexthop); \
	else { \
		lowerbound = middle + 1; \
		middle = (upperbound + middle + 1) / 2; \
	} \
	if (upperbound == lowerbound) \
		return (range[lowerbound].nexthop);

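/*
 * DXR_LOOKUP_STAGE is deliberately expanded twice per loop iteration
 * in the search loops below, trading a little code size for fewer
 * loop-control branches on the hot path.
 */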
static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

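		/*
		 * For short-format chunks the descriptor stores the
		 * fragment count halved: update_chunk() appends a
		 * duplicate of the last entry as padding, so probing
		 * one slot past the reconstructed upper bound is safe.
		 */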
		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}

#define DXR_LOOKUP_DEFINE(D) \
static int inline \
dxr_lookup_##D(struct dxr *dxr, uint32_t dst) \
{ \
	struct direct_entry de; \
	uint16_t *dt = dxr->d; \
	struct direct_entry *xt = dxr->x; \
\
	de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
	    + ((dst >> DXR_RANGE_SHIFT) & \
	    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))]; \
	if (__predict_true(de.fragments == FRAGS_MARK_HIT)) \
		return (de.base); \
	return (range_lookup(dxr->r, de, dst)); \
} \
\
static struct nhop_object * \
dxr_fib_lookup_##D(void *algo_data, \
    const struct flm_lookup_key key, uint32_t scopeid __unused) \
{ \
	struct dxr *dxr = algo_data; \
\
	return (dxr->nh_tbl[dxr_lookup_##D(dxr, \
	    ntohl(key.addr4.s_addr))]); \
}
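
/*
 * Instantiate lookup functions specialized for each supported d_bits
 * value, so that the shifts and masks on the hot path become
 * compile-time constants; choose_lookup_fn() picks the matching
 * variant once the lookup structure has been built.
 */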

#ifdef DXR2
#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)
#endif /* DXR2 */

static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
	struct direct_entry de;
#ifdef DXR2
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;

	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
#else /* !DXR2 */
	struct direct_entry *dt = dxr->d;

	de = dt[dst >> DXR_RANGE_SHIFT];
#endif /* !DXR2 */
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);
	return (range_lookup(dxr->r, de, dst));
}

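/*
 * The build-time "heap" is a small stack of up to 33 nested prefixes
 * (one per possible prefix length, including the default route),
 * ordered by increasing prefix length; its top is the most specific
 * prefix covering the current position in the address space sweep.
 */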
static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}

static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found. Find an empty one to recycle. */
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

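	/*
	 * Buckets 1..UNUSED_BUCKETS-1 of unused_chunks[] hold free
	 * blocks of exactly that many fragments, so a small block is
	 * found in constant time; bucket 0 collects all larger free
	 * blocks and is scanned above for the best (smallest) fit.
	 */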
	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_size -= cdp->cd_max_size;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->unused_chunks_size += empty_cdp->cd_max_size;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
		    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return (1);
		}
	}

	return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	MPASS(cdp != NULL);
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_size += cdp->cd_max_size;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		MPASS(cdp == LIST_FIRST(&da->all_chunks));
		da->rtbl_top -= cdp->cd_max_size;
		da->unused_chunks_size -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

#ifdef DXR2
static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return (tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	    td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return (-1);
		}
	}
	return (tp->td_index);
}

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}
#endif

static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}

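/*
 * Callback invoked by rib_walk_from() for each prefix in the chunk, in
 * address order. It emits range fragments covering the gap between the
 * previously visited prefix and this one by unwinding the heap of
 * enclosing prefixes, then pushes the new prefix onto the heap.
 */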
static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1); /* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0); /* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}

static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
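	/*
	 * Chunks with too many fragments to count in the descriptor
	 * fall back to the XL format: the fragment count is stored in
	 * an extra range_tbl element prepended to the chunk itself.
	 */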
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		    &da->range_tbl[da->rtbl_top],
		    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}

static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
	uint32_t range_frag;
#ifdef DXR2
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	uint32_t trie_rebuild = 0, prev_size = 0;
	uint32_t trie_frag;
	int ti;		/* must be signed: trie_ref() returns -1 on failure */
#endif

	MPASS(dxr->d == NULL);

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL) {
			FIB_PRINTF(LOG_NOTICE, dxr->fd,
			    "Unable to allocate DXR aux struct");
			return;
		}
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->fd = dxr->fd;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return;
		}
		range_rebuild = 1;
	}
#ifdef DXR2
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return;
		}
		trie_rebuild = 1;
	}
#endif

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high)
		range_rebuild = 1;

range_build:
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->unused_chunks_size = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}

	range_frag = 0;
	if (da->rtbl_top)
		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
	if (range_frag > V_frag_limit) {
		range_rebuild = 1;
		goto range_build;
	}

	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

#ifdef DXR2
	if (range_rebuild ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;

trie_build:
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		if (!range_rebuild)
			memset(da->updates_mask, 0xff,
			    sizeof(da->updates_mask));
	}

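	/*
	 * On a full rebuild, search for the most compact D/X split:
	 * keep rebuilding the trie with one bit fewer in the D table
	 * while the total lookup structure keeps shrinking, then step
	 * back to the best split found (trie_rebuild == 2 marks the
	 * final pass).
	 */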
dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		if (ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	trie_frag = 0;
	if (da->all_trie_cnt)
		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
	if (trie_frag > V_frag_limit) {
		trie_rebuild = 1;
		goto trie_build;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);
#else /* !DXR2 */
	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
	t2 = t1;
#endif

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL) {
		FIB_PRINTF(LOG_NOTICE, da->fd,
		    "Unable to allocate DXR lookup table");
		return;
	}
#ifdef DXR2
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
#else /* !DXR2 */
	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
#endif
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

#ifdef DXR2
	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
#else
	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
#endif
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
#ifdef DXR2
	FIB_PRINTF(LOG_INFO, da->fd,
	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
	    trie_frag / 100, trie_frag % 100,
	    range_frag / 100, range_frag % 100);
#else
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d%% range fragmentation",
	    range_frag / 100, range_frag % 100);
#endif
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#ifdef DXR2
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#endif
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
}

/*
 * Glue functions for attaching to the FIB_ALGO infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;

	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL) {
		FIB_PRINTF(LOG_NOTICE, fd,
		    "Unable to allocate DXR container struct");
		return (FLM_REBUILD);
	}

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	free(dxr->d, M_DXRLPM);
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

static void *
choose_lookup_fn(struct dxr_aux *da)
{

#ifdef DXR2
	switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
	case 16:
		return (dxr_fib_lookup_16);
#endif
	case 15:
		return (dxr_fib_lookup_15);
	case 14:
		return (dxr_fib_lookup_14);
	case 13:
		return (dxr_fib_lookup_13);
	case 12:
		return (dxr_fib_lookup_12);
	case 11:
		return (dxr_fib_lookup_11);
	case 10:
		return (dxr_fib_lookup_10);
	case 9:
		return (dxr_fib_lookup_9);
	}
#endif /* DXR2 */
	return (dxr_fib_lookup);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL || dxr->d == NULL)
		return (FLM_REBUILD);

	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	dp->f = choose_lookup_fn(da);
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}

static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	MPASS(data != NULL);
	MPASS(q != NULL);
	MPASS(q->count < q->size);

	da = dxr->aux;
	MPASS(da != NULL);
	MPASS(da->fd != NULL);
	MPASS(da->refcnt > 0);

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

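		/*
		 * Mark every direct-table chunk covered by this prefix
		 * as dirty: one bit per chunk in updates_mask, with a
		 * fast path for prefixes spanning whole 32-chunk words.
		 */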
		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = choose_lookup_fn(da);
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
	return (251);
}

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

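/*
 * Parse and report the fragmentation rebuild threshold as a percentage
 * with up to two decimal places; V_frag_limit stores the value in
 * hundredths of a percent.
 */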
static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
	char buf[8];
	int error, new, i;

	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (!isdigit(*buf) && *buf != '.')
		return (EINVAL);
	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
		new = new * 10 + buf[i] - '0';
	new *= 100;
	if (buf[i++] == '.') {
		if (!isdigit(buf[i]))
			return (EINVAL);
		new += (buf[i++] - '0') * 10;
		if (isdigit(buf[i]))
			new += buf[i++] - '0';
	}
	if (new > 1000)
		return (EINVAL);
	V_frag_limit = new;
	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	return (0);
}

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");

static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return (0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);