1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012-2021 Marko Zec 5 * Copyright (c) 2005, 2018 University of Zagreb 6 * Copyright (c) 2005 International Computer Science Institute 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup 32 * structures and a trivial search procedure. More significant bits of 33 * the search key are used to directly index a two-stage trie, while the 34 * remaining bits are used for finding the next hop in a sorted array. 35 * More details in: 36 * 37 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per 38 * second in software, ACM SIGCOMM Computer Communication Review, September 39 * 2012 40 * 41 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing 42 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split 43 */ 44 45 #include <sys/cdefs.h> 46 __FBSDID("$FreeBSD$"); 47 48 #include "opt_inet.h" 49 50 #include <sys/param.h> 51 #include <sys/kernel.h> 52 #include <sys/epoch.h> 53 #include <sys/malloc.h> 54 #include <sys/module.h> 55 #include <sys/socket.h> 56 #include <sys/sysctl.h> 57 #include <sys/syslog.h> 58 59 #include <vm/uma.h> 60 61 #include <netinet/in.h> 62 #include <netinet/in_fib.h> 63 64 #include <net/route.h> 65 #include <net/route/route_ctl.h> 66 #include <net/route/fib_algo.h> 67 68 #define DXR_TRIE_BITS 20 69 70 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24); 71 72 /* DXR2: two-stage primary trie, instead of a single direct lookup table */ 73 #define DXR2 74 75 #if DXR_TRIE_BITS > 16 76 #define DXR_D 16 77 #else 78 #define DXR_D (DXR_TRIE_BITS - 1) 79 #endif 80 #define DXR_X (DXR_TRIE_BITS - DXR_D) 81 82 #define D_TBL_SIZE (1 << DXR_D) 83 #define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS) 84 #define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS) 85 #define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS) 86 87 #define DESC_BASE_BITS 22 88 #define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS) 89 #define BASE_MAX ((1 << DESC_BASE_BITS) - 1) 90 #define RTBL_SIZE_INCR (BASE_MAX / 64) 91 92 #if DXR_TRIE_BITS < 24 93 #define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1) 94 #else 95 #define FRAGS_MASK_SHORT 0 96 #endif 97 #define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \ 98 ~FRAGS_MASK_SHORT) 99 #define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1) 100 #define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2) 101 102 #define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT) 103 #define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT) 104 #define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL) 105 106 #define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1) 107 108 #define CHUNK_HASH_BITS 16 109 #define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS) 110 #define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1) 111 112 #define TRIE_HASH_BITS 16 113 #define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS) 114 #define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1) 115 116 #define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16) 117 118 /* Lookup structure elements */ 119 120 struct direct_entry { 121 uint32_t fragments: DESC_FRAGMENTS_BITS, 122 base: DESC_BASE_BITS; 123 }; 124 125 struct range_entry_long { 126 uint32_t start: DXR_RANGE_SHIFT, 127 nexthop: DXR_TRIE_BITS; 128 }; 129 130 #if DXR_TRIE_BITS < 24 131 struct range_entry_short { 132 uint16_t start: DXR_RANGE_SHIFT - 8, 133 nexthop: DXR_TRIE_BITS - 8; 134 }; 135 #endif 136 137 /* Auxiliary structures */ 138 139 struct heap_entry { 140 uint32_t start; 141 uint32_t end; 142 uint32_t preflen; 143 uint32_t nexthop; 144 }; 145 146 struct chunk_desc { 147 LIST_ENTRY(chunk_desc) cd_all_le; 148 LIST_ENTRY(chunk_desc) cd_hash_le; 149 uint32_t cd_hash; 150 uint32_t cd_refcnt; 151 uint32_t cd_base; 152 uint32_t cd_cur_size; 153 uint32_t cd_max_size; 154 }; 155 156 struct trie_desc { 157 LIST_ENTRY(trie_desc) td_all_le; 158 LIST_ENTRY(trie_desc) td_hash_le; 159 uint32_t td_hash; 160 uint32_t td_index; 161 uint32_t td_refcnt; 162 }; 163 164 struct dxr_aux { 165 /* Glue to external state */ 166 struct fib_data *fd; 167 uint32_t fibnum; 168 int refcnt; 169 170 /* Auxiliary build-time tables */ 171 struct direct_entry direct_tbl[DIRECT_TBL_SIZE]; 172 uint16_t d_tbl[D_TBL_SIZE]; 173 struct direct_entry *x_tbl; 174 union { 175 struct range_entry_long re; 176 uint32_t fragments; 177 } *range_tbl; 178 179 /* Auxiliary internal state */ 180 uint32_t updates_mask[DIRECT_TBL_SIZE / 32]; 181 struct trie_desc *trietbl[D_TBL_SIZE]; 182 LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE]; 183 LIST_HEAD(, chunk_desc) all_chunks; 184 LIST_HEAD(, chunk_desc) unused_chunks; /* abuses hash link entry */ 185 LIST_HEAD(, trie_desc) trie_hashtbl[TRIE_HASH_SIZE]; 186 LIST_HEAD(, trie_desc) all_trie; 187 LIST_HEAD(, trie_desc) unused_trie; /* abuses hash link entry */ 188 struct sockaddr_in dst; 189 struct sockaddr_in mask; 190 struct heap_entry heap[33]; 191 uint32_t prefixes; 192 uint32_t updates_low; 193 uint32_t updates_high; 194 uint32_t all_chunks_cnt; 195 uint32_t unused_chunks_cnt; 196 uint32_t xtbl_size; 197 uint32_t all_trie_cnt; 198 uint32_t unused_trie_cnt; 199 uint32_t trie_rebuilt_prefixes; 200 uint32_t heap_index; 201 uint32_t d_bits; 202 uint32_t rtbl_size; 203 uint32_t rtbl_top; 204 uint32_t rtbl_work_frags; 205 uint32_t work_chunk; 206 }; 207 208 /* Main lookup structure container */ 209 210 struct dxr { 211 /* Lookup tables */ 212 uint16_t d_shift; 213 uint16_t x_shift; 214 uint32_t x_mask; 215 void *d; 216 void *x; 217 void *r; 218 struct nhop_object **nh_tbl; 219 220 /* Glue to external state */ 221 struct dxr_aux *aux; 222 struct fib_data *fd; 223 struct epoch_context epoch_ctx; 224 uint32_t fibnum; 225 }; 226 227 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM"); 228 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary"); 229 230 uma_zone_t chunk_zone; 231 uma_zone_t trie_zone; 232 233 SYSCTL_DECL(_net_route_algo); 234 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 235 "DXR tunables"); 236 237 VNET_DEFINE_STATIC(int, max_trie_holes) = 8; 238 #define V_max_trie_holes VNET(max_trie_holes) 239 SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes, 240 CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0, 241 "Trie fragmentation threshold before triggering a full rebuild"); 242 243 VNET_DEFINE_STATIC(int, max_range_holes) = 16; 244 #define V_max_range_holes VNET(max_range_holes) 245 SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes, 246 CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0, 247 "Range table fragmentation threshold before triggering a full rebuild"); 248 249 /* Binary search for a matching address range */ 250 #define DXR_LOOKUP_STAGE \ 251 if (masked_dst < range[middle].start) { \ 252 upperbound = middle; \ 253 middle = (middle + lowerbound) / 2; \ 254 } else if (masked_dst < range[middle + 1].start) \ 255 return (range[middle].nexthop); \ 256 else { \ 257 lowerbound = middle + 1; \ 258 middle = (upperbound + middle + 1) / 2; \ 259 } \ 260 if (upperbound == lowerbound) \ 261 return (range[lowerbound].nexthop); 262 263 static int 264 dxr_lookup(struct dxr *dxr, uint32_t dst) 265 { 266 #ifdef DXR2 267 uint16_t *dt = dxr->d; 268 struct direct_entry *xt = dxr->x; 269 int xi; 270 #else 271 struct direct_entry *dt = dxr->d; 272 #endif 273 struct direct_entry de; 274 struct range_entry_long *rt; 275 uint32_t base; 276 uint32_t upperbound; 277 uint32_t middle; 278 uint32_t lowerbound; 279 uint32_t masked_dst; 280 281 #ifdef DXR2 282 xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) + 283 ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask); 284 de = xt[xi]; 285 #else 286 de = dt[dst >> DXR_RANGE_SHIFT]; 287 #endif 288 289 if (__predict_true(de.fragments == FRAGS_MARK_HIT)) 290 return (de.base); 291 292 rt = dxr->r; 293 base = de.base; 294 lowerbound = 0; 295 masked_dst = dst & DXR_RANGE_MASK; 296 297 #if DXR_TRIE_BITS < 24 298 if (__predict_true(IS_SHORT_FORMAT(de.fragments))) { 299 upperbound = de.fragments & FRAGS_MASK_SHORT; 300 struct range_entry_short *range = 301 (struct range_entry_short *) &rt[base]; 302 303 masked_dst >>= 8; 304 middle = upperbound; 305 upperbound = upperbound * 2 + 1; 306 307 for (;;) { 308 DXR_LOOKUP_STAGE 309 DXR_LOOKUP_STAGE 310 } 311 } 312 #endif 313 314 upperbound = de.fragments; 315 middle = upperbound / 2; 316 struct range_entry_long *range = &rt[base]; 317 if (__predict_false(IS_XL_FORMAT(de.fragments))) { 318 upperbound = *((uint32_t *) range); 319 range++; 320 middle = upperbound / 2; 321 } 322 323 for (;;) { 324 DXR_LOOKUP_STAGE 325 DXR_LOOKUP_STAGE 326 } 327 } 328 329 static void 330 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk) 331 { 332 struct heap_entry *fhp = &da->heap[0]; 333 struct rtentry *rt; 334 struct route_nhop_data rnd; 335 336 da->heap_index = 0; 337 da->dst.sin_addr.s_addr = htonl(dst_u32); 338 rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED, 339 &rnd); 340 if (rt != NULL) { 341 struct in_addr addr; 342 uint32_t scopeid; 343 344 rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid); 345 fhp->start = ntohl(addr.s_addr); 346 fhp->end = fhp->start; 347 if (fhp->preflen < 32) 348 fhp->end |= (0xffffffffU >> fhp->preflen); 349 fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop); 350 } else { 351 fhp->preflen = fhp->nexthop = fhp->start = 0; 352 fhp->end = 0xffffffffU; 353 } 354 } 355 356 static uint32_t 357 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc) 358 { 359 360 if (IS_SHORT_FORMAT(fdesc->fragments)) 361 return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1); 362 else if (IS_XL_FORMAT(fdesc->fragments)) 363 return (da->range_tbl[fdesc->base].fragments + 2); 364 else /* if (IS_LONG_FORMAT(fdesc->fragments)) */ 365 return (fdesc->fragments + 1); 366 } 367 368 static uint32_t 369 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc) 370 { 371 uint32_t size = chunk_size(da, fdesc); 372 uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base]; 373 uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size]; 374 uint32_t hash = fdesc->fragments; 375 376 for (; p < l; p++) 377 hash = (hash << 7) + (hash >> 13) + *p; 378 379 return (hash + (hash >> 16)); 380 } 381 382 static int 383 chunk_ref(struct dxr_aux *da, uint32_t chunk) 384 { 385 struct direct_entry *fdesc = &da->direct_tbl[chunk]; 386 struct chunk_desc *cdp, *empty_cdp; 387 uint32_t base = fdesc->base; 388 uint32_t size = chunk_size(da, fdesc); 389 uint32_t hash = chunk_hash(da, fdesc); 390 391 /* Find an existing descriptor */ 392 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK], 393 cd_hash_le) { 394 if (cdp->cd_hash != hash || cdp->cd_cur_size != size || 395 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base], 396 sizeof(struct range_entry_long) * size)) 397 continue; 398 da->rtbl_top = fdesc->base; 399 fdesc->base = cdp->cd_base; 400 cdp->cd_refcnt++; 401 return (0); 402 } 403 404 /* No matching chunks found. Recycle an empty or allocate a new one */ 405 cdp = NULL; 406 LIST_FOREACH(empty_cdp, &da->unused_chunks, cd_hash_le) 407 if (empty_cdp->cd_max_size >= size && (cdp == NULL || 408 empty_cdp->cd_max_size < cdp->cd_max_size)) { 409 cdp = empty_cdp; 410 if (empty_cdp->cd_max_size == size) 411 break; 412 } 413 414 if (cdp != NULL) { 415 /* Copy from heap into the recycled chunk */ 416 bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base], 417 size * sizeof(struct range_entry_long)); 418 fdesc->base = cdp->cd_base; 419 da->rtbl_top -= size; 420 da->unused_chunks_cnt--; 421 if (cdp->cd_max_size > size) { 422 /* Split the range in two, need a new descriptor */ 423 empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT); 424 if (empty_cdp == NULL) 425 return (1); 426 empty_cdp->cd_cur_size = 0; 427 empty_cdp->cd_max_size = cdp->cd_max_size - size; 428 empty_cdp->cd_base = cdp->cd_base + size; 429 LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le); 430 LIST_INSERT_AFTER(cdp, empty_cdp, cd_hash_le); 431 da->all_chunks_cnt++; 432 da->unused_chunks_cnt++; 433 cdp->cd_max_size = size; 434 } 435 LIST_REMOVE(cdp, cd_hash_le); 436 } else { 437 /* Alloc a new descriptor at the top of the heap*/ 438 cdp = uma_zalloc(chunk_zone, M_NOWAIT); 439 if (cdp == NULL) 440 return (1); 441 cdp->cd_max_size = size; 442 cdp->cd_base = fdesc->base; 443 LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le); 444 da->all_chunks_cnt++; 445 KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top, 446 ("dxr: %s %d", __FUNCTION__, __LINE__)); 447 } 448 449 cdp->cd_hash = hash; 450 cdp->cd_refcnt = 1; 451 cdp->cd_cur_size = size; 452 LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp, 453 cd_hash_le); 454 if (da->rtbl_top >= da->rtbl_size) { 455 if (da->rtbl_top >= BASE_MAX) { 456 FIB_PRINTF(LOG_ERR, da->fd, 457 "structural limit exceeded at %d " 458 "range table elements", da->rtbl_top); 459 return (1); 460 } 461 da->rtbl_size += RTBL_SIZE_INCR; 462 if (da->rtbl_top >= BASE_MAX / 4) 463 FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%", 464 da->rtbl_top * 100 / BASE_MAX); 465 da->range_tbl = realloc(da->range_tbl, 466 sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT, 467 M_DXRAUX, M_NOWAIT); 468 if (da->range_tbl == NULL) 469 return (1); 470 } 471 472 return (0); 473 } 474 475 static void 476 chunk_unref(struct dxr_aux *da, uint32_t chunk) 477 { 478 struct direct_entry *fdesc = &da->direct_tbl[chunk]; 479 struct chunk_desc *cdp, *cdp2; 480 uint32_t base = fdesc->base; 481 uint32_t size = chunk_size(da, fdesc); 482 uint32_t hash = chunk_hash(da, fdesc); 483 484 /* Find the corresponding descriptor */ 485 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK], 486 cd_hash_le) 487 if (cdp->cd_hash == hash && cdp->cd_cur_size == size && 488 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base], 489 sizeof(struct range_entry_long) * size) == 0) 490 break; 491 492 KASSERT(cdp != NULL, ("dxr: dangling chunk")); 493 if (--cdp->cd_refcnt > 0) 494 return; 495 496 LIST_REMOVE(cdp, cd_hash_le); 497 da->unused_chunks_cnt++; 498 cdp->cd_cur_size = 0; 499 500 /* Attempt to merge with the preceding chunk, if empty */ 501 cdp2 = LIST_NEXT(cdp, cd_all_le); 502 if (cdp2 != NULL && cdp2->cd_cur_size == 0) { 503 KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base, 504 ("dxr: %s %d", __FUNCTION__, __LINE__)); 505 LIST_REMOVE(cdp, cd_all_le); 506 da->all_chunks_cnt--; 507 LIST_REMOVE(cdp2, cd_hash_le); 508 da->unused_chunks_cnt--; 509 cdp2->cd_max_size += cdp->cd_max_size; 510 uma_zfree(chunk_zone, cdp); 511 cdp = cdp2; 512 } 513 514 /* Attempt to merge with the subsequent chunk, if empty */ 515 cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le); 516 if (cdp2 != NULL && cdp2->cd_cur_size == 0) { 517 KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base, 518 ("dxr: %s %d", __FUNCTION__, __LINE__)); 519 LIST_REMOVE(cdp, cd_all_le); 520 da->all_chunks_cnt--; 521 LIST_REMOVE(cdp2, cd_hash_le); 522 da->unused_chunks_cnt--; 523 cdp2->cd_max_size += cdp->cd_max_size; 524 cdp2->cd_base = cdp->cd_base; 525 uma_zfree(chunk_zone, cdp); 526 cdp = cdp2; 527 } 528 529 if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) { 530 /* Free the chunk on the top of the range heap, trim the heap */ 531 KASSERT(cdp == LIST_FIRST(&da->all_chunks), 532 ("dxr: %s %d", __FUNCTION__, __LINE__)); 533 da->all_chunks_cnt--; 534 da->unused_chunks_cnt--; 535 da->rtbl_top -= cdp->cd_max_size; 536 LIST_REMOVE(cdp, cd_all_le); 537 uma_zfree(chunk_zone, cdp); 538 return; 539 } 540 541 LIST_INSERT_HEAD(&da->unused_chunks, cdp, cd_hash_le); 542 } 543 544 #ifdef DXR2 545 static uint32_t 546 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index) 547 { 548 uint32_t i, *val; 549 uint32_t hash = 0; 550 551 for (i = 0; i < (1 << dxr_x); i++) { 552 hash = (hash << 3) ^ (hash >> 3); 553 val = (uint32_t *) 554 (void *) &da->direct_tbl[(index << dxr_x) + i]; 555 hash += (*val << 5); 556 hash += (*val >> 5); 557 } 558 559 return (hash + (hash >> 16)); 560 } 561 562 static int 563 trie_ref(struct dxr_aux *da, uint32_t index) 564 { 565 struct trie_desc *tp; 566 uint32_t dxr_d = da->d_bits; 567 uint32_t dxr_x = DXR_TRIE_BITS - dxr_d; 568 uint32_t hash = trie_hash(da, dxr_x, index); 569 570 /* Find an existing descriptor */ 571 LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le) 572 if (tp->td_hash == hash && 573 memcmp(&da->direct_tbl[index << dxr_x], 574 &da->x_tbl[tp->td_index << dxr_x], 575 sizeof(*da->x_tbl) << dxr_x) == 0) { 576 tp->td_refcnt++; 577 da->trietbl[index] = tp; 578 return(tp->td_index); 579 } 580 581 tp = LIST_FIRST(&da->unused_trie); 582 if (tp != NULL) { 583 LIST_REMOVE(tp, td_hash_le); 584 da->unused_trie_cnt--; 585 } else { 586 tp = uma_zalloc(trie_zone, M_NOWAIT); 587 if (tp == NULL) 588 return (-1); 589 LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le); 590 tp->td_index = da->all_trie_cnt++; 591 } 592 593 tp->td_hash = hash; 594 tp->td_refcnt = 1; 595 LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp, 596 td_hash_le); 597 memcpy(&da->x_tbl[tp->td_index << dxr_x], 598 &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x); 599 da->trietbl[index] = tp; 600 if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) { 601 da->xtbl_size += XTBL_SIZE_INCR; 602 da->x_tbl = realloc(da->x_tbl, 603 sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT); 604 if (da->x_tbl == NULL) 605 return (-1); 606 } 607 return(tp->td_index); 608 } 609 610 static void 611 trie_unref(struct dxr_aux *da, uint32_t index) 612 { 613 struct trie_desc *tp = da->trietbl[index]; 614 615 if (tp == NULL) 616 return; 617 da->trietbl[index] = NULL; 618 if (--tp->td_refcnt > 0) 619 return; 620 621 LIST_REMOVE(tp, td_hash_le); 622 da->unused_trie_cnt++; 623 if (tp->td_index != da->all_trie_cnt - 1) { 624 LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le); 625 return; 626 } 627 628 do { 629 da->all_trie_cnt--; 630 da->unused_trie_cnt--; 631 LIST_REMOVE(tp, td_all_le); 632 uma_zfree(trie_zone, tp); 633 LIST_FOREACH(tp, &da->unused_trie, td_hash_le) 634 if (tp->td_index == da->all_trie_cnt - 1) { 635 LIST_REMOVE(tp, td_hash_le); 636 break; 637 } 638 } while (tp != NULL); 639 } 640 #endif 641 642 static void 643 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen, 644 uint32_t nh) 645 { 646 struct heap_entry *fhp; 647 int i; 648 649 for (i = da->heap_index; i >= 0; i--) { 650 if (preflen > da->heap[i].preflen) 651 break; 652 else if (preflen < da->heap[i].preflen) 653 da->heap[i + 1] = da->heap[i]; 654 else 655 return; 656 } 657 658 fhp = &da->heap[i + 1]; 659 fhp->preflen = preflen; 660 fhp->start = start; 661 fhp->end = end; 662 fhp->nexthop = nh; 663 da->heap_index++; 664 } 665 666 static int 667 dxr_walk(struct rtentry *rt, void *arg) 668 { 669 struct dxr_aux *da = arg; 670 uint32_t chunk = da->work_chunk; 671 uint32_t first = chunk << DXR_RANGE_SHIFT; 672 uint32_t last = first | DXR_RANGE_MASK; 673 struct range_entry_long *fp = 674 &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re; 675 struct heap_entry *fhp = &da->heap[da->heap_index]; 676 uint32_t preflen, nh, start, end, scopeid; 677 struct in_addr addr; 678 679 rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid); 680 start = ntohl(addr.s_addr); 681 if (start > last) 682 return (-1); /* Beyond chunk boundaries, we are done */ 683 if (start < first) 684 return (0); /* Skip this route */ 685 686 end = start; 687 if (preflen < 32) 688 end |= (0xffffffffU >> preflen); 689 nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt)); 690 691 if (start == fhp->start) 692 heap_inject(da, start, end, preflen, nh); 693 else { 694 /* start > fhp->start */ 695 while (start > fhp->end) { 696 uint32_t oend = fhp->end; 697 698 if (da->heap_index > 0) { 699 fhp--; 700 da->heap_index--; 701 } else 702 initheap(da, fhp->end + 1, chunk); 703 if (fhp->end > oend && fhp->nexthop != fp->nexthop) { 704 fp++; 705 da->rtbl_work_frags++; 706 fp->start = (oend + 1) & DXR_RANGE_MASK; 707 fp->nexthop = fhp->nexthop; 708 } 709 } 710 if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) && 711 nh != fp->nexthop) { 712 fp++; 713 da->rtbl_work_frags++; 714 fp->start = start & DXR_RANGE_MASK; 715 } else if (da->rtbl_work_frags) { 716 if ((--fp)->nexthop == nh) 717 da->rtbl_work_frags--; 718 else 719 fp++; 720 } 721 fp->nexthop = nh; 722 heap_inject(da, start, end, preflen, nh); 723 } 724 725 return (0); 726 } 727 728 static int 729 update_chunk(struct dxr_aux *da, uint32_t chunk) 730 { 731 struct range_entry_long *fp; 732 #if DXR_TRIE_BITS < 24 733 struct range_entry_short *fps; 734 uint32_t start, nh, i; 735 #endif 736 struct heap_entry *fhp; 737 uint32_t first = chunk << DXR_RANGE_SHIFT; 738 uint32_t last = first | DXR_RANGE_MASK; 739 740 if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT) 741 chunk_unref(da, chunk); 742 743 initheap(da, first, chunk); 744 745 fp = &da->range_tbl[da->rtbl_top].re; 746 da->rtbl_work_frags = 0; 747 fp->start = first & DXR_RANGE_MASK; 748 fp->nexthop = da->heap[0].nexthop; 749 750 da->dst.sin_addr.s_addr = htonl(first); 751 da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK); 752 753 da->work_chunk = chunk; 754 rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED, 755 (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask, 756 dxr_walk, da); 757 758 /* Flush any remaining objects on the heap */ 759 fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re; 760 fhp = &da->heap[da->heap_index]; 761 while (fhp->preflen > DXR_TRIE_BITS) { 762 uint32_t oend = fhp->end; 763 764 if (da->heap_index > 0) { 765 fhp--; 766 da->heap_index--; 767 } else 768 initheap(da, fhp->end + 1, chunk); 769 if (fhp->end > oend && fhp->nexthop != fp->nexthop) { 770 /* Have we crossed the upper chunk boundary? */ 771 if (oend >= last) 772 break; 773 fp++; 774 da->rtbl_work_frags++; 775 fp->start = (oend + 1) & DXR_RANGE_MASK; 776 fp->nexthop = fhp->nexthop; 777 } 778 } 779 780 /* Direct hit if the chunk contains only a single fragment */ 781 if (da->rtbl_work_frags == 0) { 782 da->direct_tbl[chunk].base = fp->nexthop; 783 da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT; 784 return (0); 785 } 786 787 da->direct_tbl[chunk].base = da->rtbl_top; 788 da->direct_tbl[chunk].fragments = da->rtbl_work_frags; 789 790 #if DXR_TRIE_BITS < 24 791 /* Check whether the chunk can be more compactly encoded */ 792 fp = &da->range_tbl[da->rtbl_top].re; 793 for (i = 0; i <= da->rtbl_work_frags; i++, fp++) 794 if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH) 795 break; 796 if (i == da->rtbl_work_frags + 1) { 797 fp = &da->range_tbl[da->rtbl_top].re; 798 fps = (void *) fp; 799 for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) { 800 start = fp->start; 801 nh = fp->nexthop; 802 fps->start = start >> 8; 803 fps->nexthop = nh; 804 } 805 fps->start = start >> 8; 806 fps->nexthop = nh; 807 da->rtbl_work_frags >>= 1; 808 da->direct_tbl[chunk].fragments = 809 da->rtbl_work_frags | FRAGS_PREF_SHORT; 810 } else 811 #endif 812 if (da->rtbl_work_frags >= FRAGS_MARK_HIT) { 813 da->direct_tbl[chunk].fragments = FRAGS_MARK_XL; 814 memmove(&da->range_tbl[da->rtbl_top + 1], 815 &da->range_tbl[da->rtbl_top], 816 (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl)); 817 da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags; 818 da->rtbl_work_frags++; 819 } 820 da->rtbl_top += (da->rtbl_work_frags + 1); 821 return (chunk_ref(da, chunk)); 822 } 823 824 static void 825 dxr_build(struct dxr *dxr) 826 { 827 struct dxr_aux *da = dxr->aux; 828 struct chunk_desc *cdp; 829 struct rib_rtable_info rinfo; 830 struct timeval t0, t1, t2, t3; 831 uint32_t r_size, dxr_tot_size; 832 uint32_t i, m, range_rebuild = 0; 833 #ifdef DXR2 834 struct trie_desc *tp; 835 uint32_t d_tbl_size, dxr_x, d_size, x_size; 836 uint32_t ti, trie_rebuild = 0, prev_size = 0; 837 #endif 838 839 KASSERT(dxr->d == NULL, ("dxr: d not free")); 840 841 if (da == NULL) { 842 da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT); 843 if (da == NULL) 844 return; 845 dxr->aux = da; 846 da->fibnum = dxr->fibnum; 847 da->refcnt = 1; 848 LIST_INIT(&da->all_chunks); 849 LIST_INIT(&da->all_trie); 850 da->rtbl_size = RTBL_SIZE_INCR; 851 da->range_tbl = NULL; 852 da->xtbl_size = XTBL_SIZE_INCR; 853 da->x_tbl = NULL; 854 bzero(&da->dst, sizeof(da->dst)); 855 bzero(&da->mask, sizeof(da->mask)); 856 da->dst.sin_len = sizeof(da->dst); 857 da->mask.sin_len = sizeof(da->mask); 858 da->dst.sin_family = AF_INET; 859 da->mask.sin_family = AF_INET; 860 } 861 if (da->range_tbl == NULL) { 862 da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size 863 + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT); 864 if (da->range_tbl == NULL) 865 return; 866 range_rebuild = 1; 867 } 868 #ifdef DXR2 869 if (da->x_tbl == NULL) { 870 da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size, 871 M_DXRAUX, M_NOWAIT); 872 if (da->x_tbl == NULL) 873 return; 874 trie_rebuild = 1; 875 } 876 #endif 877 da->fd = dxr->fd; 878 879 microuptime(&t0); 880 881 dxr->nh_tbl = fib_get_nhop_array(da->fd); 882 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); 883 884 if (da->updates_low > da->updates_high || 885 da->unused_chunks_cnt > V_max_range_holes) 886 range_rebuild = 1; 887 if (range_rebuild) { 888 /* Bulk cleanup */ 889 bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl)); 890 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) { 891 LIST_REMOVE(cdp, cd_all_le); 892 uma_zfree(chunk_zone, cdp); 893 } 894 LIST_INIT(&da->unused_chunks); 895 da->all_chunks_cnt = da->unused_chunks_cnt = 0; 896 da->rtbl_top = 0; 897 da->updates_low = 0; 898 da->updates_high = DIRECT_TBL_SIZE - 1; 899 memset(da->updates_mask, 0xff, sizeof(da->updates_mask)); 900 for (i = 0; i < DIRECT_TBL_SIZE; i++) { 901 da->direct_tbl[i].fragments = FRAGS_MARK_HIT; 902 da->direct_tbl[i].base = 0; 903 } 904 } 905 da->prefixes = rinfo.num_prefixes; 906 907 /* DXR: construct direct & range table */ 908 for (i = da->updates_low; i <= da->updates_high; i++) { 909 m = da->updates_mask[i >> 5] >> (i & 0x1f); 910 if (m == 0) 911 i |= 0x1f; 912 else if (m & 1 && update_chunk(da, i) != 0) 913 return; 914 } 915 r_size = sizeof(*da->range_tbl) * da->rtbl_top; 916 microuptime(&t1); 917 918 #ifdef DXR2 919 if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes || 920 abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1) 921 trie_rebuild = 1; 922 if (trie_rebuild) { 923 da->trie_rebuilt_prefixes = da->prefixes; 924 da->d_bits = DXR_D; 925 da->updates_low = 0; 926 da->updates_high = DIRECT_TBL_SIZE - 1; 927 } 928 929 dxr2_try_squeeze: 930 if (trie_rebuild) { 931 /* Bulk cleanup */ 932 bzero(da->trietbl, sizeof(da->trietbl)); 933 bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl)); 934 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) { 935 LIST_REMOVE(tp, td_all_le); 936 uma_zfree(trie_zone, tp); 937 } 938 LIST_INIT(&da->unused_trie); 939 da->all_trie_cnt = da->unused_trie_cnt = 0; 940 } 941 942 /* Populate d_tbl, x_tbl */ 943 dxr_x = DXR_TRIE_BITS - da->d_bits; 944 d_tbl_size = (1 << da->d_bits); 945 946 for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x; 947 i++) { 948 if (!trie_rebuild) { 949 m = 0; 950 for (int j = 0; j < (1 << dxr_x); j += 32) 951 m |= da->updates_mask[((i << dxr_x) + j) >> 5]; 952 if (m == 0) 953 continue; 954 trie_unref(da, i); 955 } 956 ti = trie_ref(da, i); 957 if (ti < 0) 958 return; 959 da->d_tbl[i] = ti; 960 } 961 962 d_size = sizeof(*da->d_tbl) * d_tbl_size; 963 x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size 964 * da->all_trie_cnt; 965 dxr_tot_size = d_size + x_size + r_size; 966 967 if (trie_rebuild == 1) { 968 /* Try to find a more compact D/X split */ 969 if (prev_size == 0 || dxr_tot_size <= prev_size) 970 da->d_bits--; 971 else { 972 da->d_bits++; 973 trie_rebuild = 2; 974 } 975 prev_size = dxr_tot_size; 976 goto dxr2_try_squeeze; 977 } 978 microuptime(&t2); 979 #else /* !DXR2 */ 980 dxr_tot_size = sizeof(da->direct_tbl) + r_size; 981 t2 = t1; 982 #endif 983 984 dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT); 985 if (dxr->d == NULL) 986 return; 987 #ifdef DXR2 988 memcpy(dxr->d, da->d_tbl, d_size); 989 dxr->x = ((char *) dxr->d) + d_size; 990 memcpy(dxr->x, da->x_tbl, x_size); 991 dxr->r = ((char *) dxr->x) + x_size; 992 dxr->d_shift = 32 - da->d_bits; 993 dxr->x_shift = dxr_x; 994 dxr->x_mask = 0xffffffffU >> (32 - dxr_x); 995 #else /* !DXR2 */ 996 memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl)); 997 dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl); 998 #endif 999 memcpy(dxr->r, da->range_tbl, r_size); 1000 1001 if (da->updates_low <= da->updates_high) 1002 bzero(&da->updates_mask[da->updates_low / 32], 1003 (da->updates_high - da->updates_low) / 8 + 1); 1004 da->updates_low = DIRECT_TBL_SIZE - 1; 1005 da->updates_high = 0; 1006 microuptime(&t3); 1007 1008 #ifdef DXR2 1009 FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)", 1010 da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops); 1011 #else 1012 FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)", 1013 DXR_D, rinfo.num_prefixes, rinfo.num_nhops); 1014 #endif 1015 i = dxr_tot_size * 100; 1016 if (rinfo.num_prefixes) 1017 i /= rinfo.num_prefixes; 1018 FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix", 1019 dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100, 1020 i / 100, i % 100); 1021 i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; 1022 FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms", 1023 range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); 1024 #ifdef DXR2 1025 i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec; 1026 FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms", 1027 trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); 1028 #endif 1029 i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec; 1030 FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms", 1031 i / 1000, i % 1000); 1032 FIB_PRINTF(LOG_INFO, da->fd, "range table: %d%%, %d chunks, %d holes", 1033 da->rtbl_top * 100 / BASE_MAX, da->all_chunks_cnt, 1034 da->unused_chunks_cnt); 1035 } 1036 1037 /* 1038 * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure. 1039 */ 1040 1041 static struct nhop_object * 1042 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key, 1043 uint32_t scopeid) 1044 { 1045 struct dxr *dxr = algo_data; 1046 uint32_t nh; 1047 1048 nh = dxr_lookup(dxr, ntohl(key.addr4.s_addr)); 1049 1050 return (dxr->nh_tbl[nh]); 1051 } 1052 1053 static enum flm_op_result 1054 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data) 1055 { 1056 struct dxr *old_dxr = old_data; 1057 struct dxr_aux *da = NULL; 1058 struct dxr *dxr; 1059 1060 dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT); 1061 if (dxr == NULL) 1062 return (FLM_REBUILD); 1063 1064 /* Check whether we may reuse the old auxiliary structures */ 1065 if (old_dxr != NULL && old_dxr->aux != NULL) { 1066 da = old_dxr->aux; 1067 atomic_add_int(&da->refcnt, 1); 1068 } 1069 1070 dxr->aux = da; 1071 dxr->d = NULL; 1072 dxr->fd = fd; 1073 dxr->fibnum = fibnum; 1074 *data = dxr; 1075 1076 return (FLM_SUCCESS); 1077 } 1078 1079 static void 1080 dxr_destroy(void *data) 1081 { 1082 struct dxr *dxr = data; 1083 struct dxr_aux *da; 1084 struct chunk_desc *cdp; 1085 struct trie_desc *tp; 1086 1087 if (dxr->d != NULL) 1088 free(dxr->d, M_DXRLPM); 1089 1090 da = dxr->aux; 1091 free(dxr, M_DXRAUX); 1092 1093 if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1) 1094 return; 1095 1096 /* Release auxiliary structures */ 1097 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) { 1098 LIST_REMOVE(cdp, cd_all_le); 1099 uma_zfree(chunk_zone, cdp); 1100 } 1101 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) { 1102 LIST_REMOVE(tp, td_all_le); 1103 uma_zfree(trie_zone, tp); 1104 } 1105 free(da->range_tbl, M_DXRAUX); 1106 free(da->x_tbl, M_DXRAUX); 1107 free(da, M_DXRAUX); 1108 } 1109 1110 static void 1111 epoch_dxr_destroy(epoch_context_t ctx) 1112 { 1113 struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx); 1114 1115 dxr_destroy(dxr); 1116 } 1117 1118 static enum flm_op_result 1119 dxr_dump_end(void *data, struct fib_dp *dp) 1120 { 1121 struct dxr *dxr = data; 1122 struct dxr_aux *da; 1123 1124 dxr_build(dxr); 1125 1126 da = dxr->aux; 1127 if (da == NULL) 1128 return (FLM_REBUILD); 1129 1130 /* Structural limit exceeded, hard error */ 1131 if (da->rtbl_top >= BASE_MAX) 1132 return (FLM_ERROR); 1133 1134 /* A malloc(,, M_NOWAIT) failed somewhere, retry later */ 1135 if (dxr->d == NULL) 1136 return (FLM_REBUILD); 1137 1138 dp->f = dxr_fib_lookup; 1139 dp->arg = dxr; 1140 1141 return (FLM_SUCCESS); 1142 } 1143 1144 static enum flm_op_result 1145 dxr_dump_rib_item(struct rtentry *rt, void *data) 1146 { 1147 1148 return (FLM_SUCCESS); 1149 } 1150 1151 static enum flm_op_result 1152 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc, 1153 void *data) 1154 { 1155 1156 return (FLM_BATCH); 1157 } 1158 1159 static enum flm_op_result 1160 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q, 1161 void *data) 1162 { 1163 struct dxr *dxr = data; 1164 struct dxr *new_dxr; 1165 struct dxr_aux *da; 1166 struct fib_dp new_dp; 1167 enum flm_op_result res; 1168 uint32_t ip, plen, hmask, start, end, i, ui; 1169 #ifdef INVARIANTS 1170 struct rib_rtable_info rinfo; 1171 int update_delta = 0; 1172 #endif 1173 1174 KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__)); 1175 KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__)); 1176 KASSERT(q->count < q->size, ("%s: q->count %d q->size %d", 1177 __FUNCTION__, q->count, q->size)); 1178 1179 da = dxr->aux; 1180 KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__)); 1181 1182 FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count); 1183 for (ui = 0; ui < q->count; ui++) { 1184 #ifdef INVARIANTS 1185 if (q->entries[ui].nh_new != NULL) 1186 update_delta++; 1187 if (q->entries[ui].nh_old != NULL) 1188 update_delta--; 1189 #endif 1190 plen = q->entries[ui].plen; 1191 ip = ntohl(q->entries[ui].addr4.s_addr); 1192 if (plen < 32) 1193 hmask = 0xffffffffU >> plen; 1194 else 1195 hmask = 0; 1196 start = (ip & ~hmask) >> DXR_RANGE_SHIFT; 1197 end = (ip | hmask) >> DXR_RANGE_SHIFT; 1198 1199 if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f) 1200 for (i = start >> 5; i <= end >> 5; i++) 1201 da->updates_mask[i] = 0xffffffffU; 1202 else 1203 for (i = start; i <= end; i++) 1204 da->updates_mask[i >> 5] |= (1 << (i & 0x1f)); 1205 if (start < da->updates_low) 1206 da->updates_low = start; 1207 if (end > da->updates_high) 1208 da->updates_high = end; 1209 } 1210 1211 #ifdef INVARIANTS 1212 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); 1213 KASSERT(da->prefixes + update_delta == rinfo.num_prefixes, 1214 ("%s: update count mismatch", __FUNCTION__)); 1215 #endif 1216 1217 res = dxr_init(0, dxr->fd, data, (void **) &new_dxr); 1218 if (res != FLM_SUCCESS) 1219 return (res); 1220 1221 dxr_build(new_dxr); 1222 1223 /* Structural limit exceeded, hard error */ 1224 if (da->rtbl_top >= BASE_MAX) { 1225 dxr_destroy(new_dxr); 1226 return (FLM_ERROR); 1227 } 1228 1229 /* A malloc(,, M_NOWAIT) failed somewhere, retry later */ 1230 if (new_dxr->d == NULL) { 1231 dxr_destroy(new_dxr); 1232 return (FLM_REBUILD); 1233 } 1234 1235 new_dp.f = dxr_fib_lookup; 1236 new_dp.arg = new_dxr; 1237 if (fib_set_datapath_ptr(dxr->fd, &new_dp)) { 1238 fib_set_algo_ptr(dxr->fd, new_dxr); 1239 fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx); 1240 return (FLM_SUCCESS); 1241 } 1242 1243 dxr_destroy(new_dxr); 1244 return (FLM_REBUILD); 1245 } 1246 1247 static uint8_t 1248 dxr_get_pref(const struct rib_rtable_info *rinfo) 1249 { 1250 1251 /* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */ 1252 return (251); 1253 } 1254 1255 static struct fib_lookup_module fib_dxr_mod = { 1256 .flm_name = "dxr", 1257 .flm_family = AF_INET, 1258 .flm_init_cb = dxr_init, 1259 .flm_destroy_cb = dxr_destroy, 1260 .flm_dump_rib_item_cb = dxr_dump_rib_item, 1261 .flm_dump_end_cb = dxr_dump_end, 1262 .flm_change_rib_item_cb = dxr_change_rib_item, 1263 .flm_change_rib_items_cb = dxr_change_rib_batch, 1264 .flm_get_pref = dxr_get_pref, 1265 }; 1266 1267 static int 1268 dxr_modevent(module_t mod, int type, void *unused) 1269 { 1270 int error; 1271 1272 switch (type) { 1273 case MOD_LOAD: 1274 chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc), 1275 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 1276 trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc), 1277 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 1278 fib_module_register(&fib_dxr_mod); 1279 return(0); 1280 case MOD_UNLOAD: 1281 error = fib_module_unregister(&fib_dxr_mod); 1282 if (error) 1283 return (error); 1284 uma_zdestroy(chunk_zone); 1285 uma_zdestroy(trie_zone); 1286 return(0); 1287 default: 1288 return(EOPNOTSUPP); 1289 } 1290 } 1291 1292 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0}; 1293 1294 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); 1295 MODULE_VERSION(fib_dxr, 1); 1296