1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 1990 Mentat Inc. 24 */ 25 26 /* 27 * This file contains routines that manipulate Internet Routing Entries (IREs). 28 */ 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/stropts.h> 32 #include <sys/ddi.h> 33 #include <sys/cmn_err.h> 34 35 #include <sys/systm.h> 36 #include <sys/param.h> 37 #include <sys/socket.h> 38 #include <net/if.h> 39 #include <net/route.h> 40 #include <netinet/in.h> 41 #include <net/if_dl.h> 42 #include <netinet/ip6.h> 43 #include <netinet/icmp6.h> 44 45 #include <inet/common.h> 46 #include <inet/mi.h> 47 #include <inet/ip.h> 48 #include <inet/ip6.h> 49 #include <inet/ip_ndp.h> 50 #include <inet/ip_if.h> 51 #include <inet/ip_ire.h> 52 #include <inet/ipclassifier.h> 53 #include <inet/nd.h> 54 #include <inet/tunables.h> 55 #include <sys/kmem.h> 56 #include <sys/zone.h> 57 58 #include <sys/tsol/label.h> 59 #include <sys/tsol/tnet.h> 60 61 #define IS_DEFAULT_ROUTE_V6(ire) \ 62 (((ire)->ire_type & IRE_DEFAULT) || \ 63 (((ire)->ire_type & IRE_INTERFACE) && \ 64 (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) 65 66 static ire_t ire_null; 67 68 static ire_t * 69 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 70 const in6_addr_t *gateway, int type, const ill_t *ill, 71 zoneid_t zoneid, const ts_label_t *tsl, int flags, 72 ip_stack_t *ipst); 73 74 /* 75 * Initialize the ire that is specific to IPv6 part and call 76 * ire_init_common to finish it. 77 * Returns zero or errno. 78 */ 79 int 80 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, 81 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, 82 zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 83 { 84 int error; 85 86 /* 87 * Reject IRE security attmakeribute creation/initialization 88 * if system is not running in Trusted mode. 89 */ 90 if (gc != NULL && !is_system_labeled()) 91 return (EINVAL); 92 93 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); 94 if (v6addr != NULL) 95 ire->ire_addr_v6 = *v6addr; 96 if (v6gateway != NULL) 97 ire->ire_gateway_addr_v6 = *v6gateway; 98 99 /* Make sure we don't have stray values in some fields */ 100 switch (type) { 101 case IRE_LOOPBACK: 102 case IRE_HOST: 103 case IRE_LOCAL: 104 case IRE_IF_CLONE: 105 ire->ire_mask_v6 = ipv6_all_ones; 106 ire->ire_masklen = IPV6_ABITS; 107 break; 108 case IRE_PREFIX: 109 case IRE_DEFAULT: 110 case IRE_IF_RESOLVER: 111 case IRE_IF_NORESOLVER: 112 if (v6mask != NULL) { 113 ire->ire_mask_v6 = *v6mask; 114 ire->ire_masklen = 115 ip_mask_to_plen_v6(&ire->ire_mask_v6); 116 } 117 break; 118 case IRE_MULTICAST: 119 case IRE_NOROUTE: 120 ASSERT(v6mask == NULL); 121 break; 122 default: 123 ASSERT(0); 124 return (EINVAL); 125 } 126 127 error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, 128 gc, ipst); 129 if (error != 0) 130 return (error); 131 132 /* Determine which function pointers to use */ 133 ire->ire_postfragfn = ip_xmit; /* Common case */ 134 135 switch (ire->ire_type) { 136 case IRE_LOCAL: 137 ire->ire_sendfn = ire_send_local_v6; 138 ire->ire_recvfn = ire_recv_local_v6; 139 ASSERT(ire->ire_ill != NULL); 140 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) 141 ire->ire_recvfn = ire_recv_noaccept_v6; 142 break; 143 case IRE_LOOPBACK: 144 ire->ire_sendfn = ire_send_local_v6; 145 ire->ire_recvfn = ire_recv_loopback_v6; 146 break; 147 case IRE_MULTICAST: 148 ire->ire_postfragfn = ip_postfrag_loopcheck; 149 ire->ire_sendfn = ire_send_multicast_v6; 150 ire->ire_recvfn = ire_recv_multicast_v6; 151 break; 152 default: 153 /* 154 * For IRE_IF_ALL and IRE_OFFLINK we forward received 155 * packets by default. 156 */ 157 ire->ire_sendfn = ire_send_wire_v6; 158 ire->ire_recvfn = ire_recv_forward_v6; 159 break; 160 } 161 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 162 ire->ire_sendfn = ire_send_noroute_v6; 163 ire->ire_recvfn = ire_recv_noroute_v6; 164 } else if (ire->ire_flags & RTF_MULTIRT) { 165 ire->ire_postfragfn = ip_postfrag_multirt_v6; 166 ire->ire_sendfn = ire_send_multirt_v6; 167 ire->ire_recvfn = ire_recv_multirt_v6; 168 } 169 ire->ire_nce_capable = ire_determine_nce_capable(ire); 170 return (0); 171 } 172 173 /* 174 * ire_create_v6 is called to allocate and initialize a new IRE. 175 * 176 * NOTE : This is called as writer sometimes though not required 177 * by this function. 178 */ 179 /* ARGSUSED */ 180 ire_t * 181 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, 182 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, 183 uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 184 { 185 ire_t *ire; 186 int error; 187 188 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); 189 190 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 191 if (ire == NULL) { 192 DTRACE_PROBE(kmem__cache__alloc); 193 return (NULL); 194 } 195 *ire = ire_null; 196 197 error = ire_init_v6(ire, v6addr, v6mask, v6gateway, 198 type, ill, zoneid, flags, gc, ipst); 199 200 if (error != 0) { 201 DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); 202 kmem_cache_free(ire_cache, ire); 203 return (NULL); 204 } 205 return (ire); 206 } 207 208 /* 209 * Find the ill matching a multicast group. 210 * Allows different routes for multicast addresses 211 * in the unicast routing table (akin to FF::0/8 but could be more specific) 212 * which point at different interfaces. This is used when IPV6_MULTICAST_IF 213 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't 214 * specify the interface to join on. 215 * 216 * Supports link-local addresses by using ire_route_recursive which follows 217 * the ill when recursing. 218 * 219 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 220 * and the MULTIRT property can be different for different groups, we 221 * extract RTF_MULTIRT from the special unicast route added for a group 222 * with CGTP and pass that back in the multirtp argument. 223 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 224 * We have a setsrcp argument for the same reason. 225 */ 226 ill_t * 227 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, 228 ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) 229 { 230 ire_t *ire; 231 ill_t *ill; 232 233 ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, 234 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 235 ASSERT(ire != NULL); 236 237 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 238 ire_refrele(ire); 239 return (NULL); 240 } 241 242 if (multirtp != NULL) 243 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 244 245 ill = ire_nexthop_ill(ire); 246 ire_refrele(ire); 247 return (ill); 248 } 249 250 /* 251 * This function takes a mask and returns number of bits set in the 252 * mask (the represented prefix length). Assumes a contiguous mask. 253 */ 254 int 255 ip_mask_to_plen_v6(const in6_addr_t *v6mask) 256 { 257 int bits; 258 int plen = IPV6_ABITS; 259 int i; 260 261 for (i = 3; i >= 0; i--) { 262 if (v6mask->s6_addr32[i] == 0) { 263 plen -= 32; 264 continue; 265 } 266 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 267 if (bits == 0) 268 break; 269 plen -= bits; 270 } 271 272 return (plen); 273 } 274 275 /* 276 * Convert a prefix length to the mask for that prefix. 277 * Returns the argument bitmask. 278 */ 279 in6_addr_t * 280 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) 281 { 282 uint32_t *ptr; 283 284 if (plen < 0 || plen > IPV6_ABITS) 285 return (NULL); 286 *bitmask = ipv6_all_zeros; 287 if (plen == 0) 288 return (bitmask); 289 290 ptr = (uint32_t *)bitmask; 291 while (plen > 32) { 292 *ptr++ = 0xffffffffU; 293 plen -= 32; 294 } 295 *ptr = htonl(0xffffffffU << (32 - plen)); 296 return (bitmask); 297 } 298 299 /* 300 * Add a fully initialized IPv6 IRE to the forwarding table. 301 * This returns NULL on failure, or a held IRE on success. 302 * Normally the returned IRE is the same as the argument. But a different 303 * IRE will be returned if the added IRE is deemed identical to an existing 304 * one. In that case ire_identical_ref will be increased. 305 * The caller always needs to do an ire_refrele() on the returned IRE. 306 */ 307 ire_t * 308 ire_add_v6(ire_t *ire) 309 { 310 ire_t *ire1; 311 int mask_table_index; 312 irb_t *irb_ptr; 313 ire_t **irep; 314 int match_flags; 315 int error; 316 ip_stack_t *ipst = ire->ire_ipst; 317 318 ASSERT(ire->ire_ipversion == IPV6_VERSION); 319 320 /* Make sure the address is properly masked. */ 321 V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); 322 323 mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); 324 if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { 325 irb_t *ptr; 326 int i; 327 328 ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * 329 sizeof (irb_t))); 330 if (ptr == NULL) { 331 ire_delete(ire); 332 return (NULL); 333 } 334 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 335 rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); 336 ptr[i].irb_ipst = ipst; 337 } 338 mutex_enter(&ipst->ips_ire_ft_init_lock); 339 if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == 340 NULL) { 341 ipst->ips_ip_forwarding_table_v6[mask_table_index] = 342 ptr; 343 mutex_exit(&ipst->ips_ire_ft_init_lock); 344 } else { 345 /* 346 * Some other thread won the race in 347 * initializing the forwarding table at the 348 * same index. 349 */ 350 mutex_exit(&ipst->ips_ire_ft_init_lock); 351 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 352 rw_destroy(&ptr[i].irb_lock); 353 } 354 mi_free(ptr); 355 } 356 } 357 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ 358 IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, 359 ipst->ips_ip6_ftable_hash_size)]); 360 361 match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 362 if (ire->ire_ill != NULL) 363 match_flags |= MATCH_IRE_ILL; 364 /* 365 * Start the atomic add of the ire. Grab the bucket lock and the 366 * ill lock. Check for condemned. 367 */ 368 error = ire_atomic_start(irb_ptr, ire); 369 if (error != 0) { 370 ire_delete(ire); 371 return (NULL); 372 } 373 374 /* 375 * If we are creating a hidden IRE, make sure we search for 376 * hidden IREs when searching for duplicates below. 377 * Otherwise, we might find an IRE on some other interface 378 * that's not marked hidden. 379 */ 380 if (ire->ire_testhidden) 381 match_flags |= MATCH_IRE_TESTHIDDEN; 382 383 /* 384 * Atomically check for duplicate and insert in the table. 385 */ 386 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 387 if (IRE_IS_CONDEMNED(ire1)) 388 continue; 389 /* 390 * Here we need an exact match on zoneid, i.e., 391 * ire_match_args doesn't fit. 392 */ 393 if (ire1->ire_zoneid != ire->ire_zoneid) 394 continue; 395 396 if (ire1->ire_type != ire->ire_type) 397 continue; 398 399 /* 400 * Note: We do not allow multiple routes that differ only 401 * in the gateway security attributes; such routes are 402 * considered duplicates. 403 * To change that we explicitly have to treat them as 404 * different here. 405 */ 406 if (ire_match_args_v6(ire1, &ire->ire_addr_v6, 407 &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, 408 ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, 409 match_flags)) { 410 /* 411 * Return the old ire after doing a REFHOLD. 412 * As most of the callers continue to use the IRE 413 * after adding, we return a held ire. This will 414 * avoid a lookup in the caller again. If the callers 415 * don't want to use it, they need to do a REFRELE. 416 * 417 * We only allow exactly one IRE_IF_CLONE for any dst, 418 * so, if the is an IF_CLONE, return the ire without 419 * an identical_ref, but with an ire_ref held. 420 */ 421 if (ire->ire_type != IRE_IF_CLONE) { 422 atomic_add_32(&ire1->ire_identical_ref, 1); 423 DTRACE_PROBE2(ire__add__exist, ire_t *, ire1, 424 ire_t *, ire); 425 } 426 ip1dbg(("found dup ire existing %p new %p", 427 (void *)ire1, (void *)ire)); 428 ire_refhold(ire1); 429 ire_atomic_end(irb_ptr, ire); 430 ire_delete(ire); 431 return (ire1); 432 } 433 } 434 435 /* 436 * Normally we do head insertion since most things do not care about 437 * the order of the IREs in the bucket. 438 * However, due to shared-IP zones (and restrict_interzone_loopback) 439 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 440 * address. For that reason we do tail insertion for IRE_IF_CLONE. 441 */ 442 irep = (ire_t **)irb_ptr; 443 if (ire->ire_type & IRE_IF_CLONE) { 444 while ((ire1 = *irep) != NULL) 445 irep = &ire1->ire_next; 446 } 447 /* Insert at *irep */ 448 ire1 = *irep; 449 if (ire1 != NULL) 450 ire1->ire_ptpn = &ire->ire_next; 451 ire->ire_next = ire1; 452 /* Link the new one in. */ 453 ire->ire_ptpn = irep; 454 /* 455 * ire_walk routines de-reference ire_next without holding 456 * a lock. Before we point to the new ire, we want to make 457 * sure the store that sets the ire_next of the new ire 458 * reaches global visibility, so that ire_walk routines 459 * don't see a truncated list of ires i.e if the ire_next 460 * of the new ire gets set after we do "*irep = ire" due 461 * to re-ordering, the ire_walk thread will see a NULL 462 * once it accesses the ire_next of the new ire. 463 * membar_producer() makes sure that the following store 464 * happens *after* all of the above stores. 465 */ 466 membar_producer(); 467 *irep = ire; 468 ire->ire_bucket = irb_ptr; 469 /* 470 * We return a bumped up IRE above. Keep it symmetrical 471 * so that the callers will always have to release. This 472 * helps the callers of this function because they continue 473 * to use the IRE after adding and hence they don't have to 474 * lookup again after we return the IRE. 475 * 476 * NOTE : We don't have to use atomics as this is appearing 477 * in the list for the first time and no one else can bump 478 * up the reference count on this yet. 479 */ 480 ire_refhold_locked(ire); 481 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); 482 irb_ptr->irb_ire_cnt++; 483 484 if (ire->ire_ill != NULL) { 485 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, 486 (char *), "ire", (void *), ire); 487 ire->ire_ill->ill_ire_cnt++; 488 ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 489 } 490 ire_atomic_end(irb_ptr, ire); 491 492 /* Make any caching of the IREs be notified or updated */ 493 ire_flush_cache_v6(ire, IRE_FLUSH_ADD); 494 495 return (ire); 496 } 497 498 /* 499 * Search for all HOST REDIRECT routes that are 500 * pointing at the specified gateway and 501 * delete them. This routine is called only 502 * when a default gateway is going away. 503 */ 504 static void 505 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) 506 { 507 irb_t *irb_ptr; 508 irb_t *irb; 509 ire_t *ire; 510 in6_addr_t gw_addr_v6; 511 int i; 512 513 /* get the hash table for HOST routes */ 514 irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)]; 515 if (irb_ptr == NULL) 516 return; 517 for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { 518 irb = &irb_ptr[i]; 519 irb_refhold(irb); 520 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 521 if (!(ire->ire_flags & RTF_DYNAMIC)) 522 continue; 523 mutex_enter(&ire->ire_lock); 524 gw_addr_v6 = ire->ire_gateway_addr_v6; 525 mutex_exit(&ire->ire_lock); 526 if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) 527 ire_delete(ire); 528 } 529 irb_refrele(irb); 530 } 531 } 532 533 /* 534 * Delete the specified IRE. 535 * All calls should use ire_delete(). 536 * Sometimes called as writer though not required by this function. 537 * 538 * NOTE : This function is called only if the ire was added 539 * in the list. 540 */ 541 void 542 ire_delete_v6(ire_t *ire) 543 { 544 in6_addr_t gw_addr_v6; 545 ip_stack_t *ipst = ire->ire_ipst; 546 547 /* 548 * Make sure ire_generation increases from ire_flush_cache happen 549 * after any lookup/reader has read ire_generation. 550 * Since the rw_enter makes us wait until any lookup/reader has 551 * completed we can exit the lock immediately. 552 */ 553 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 554 rw_exit(&ipst->ips_ip6_ire_head_lock); 555 556 ASSERT(ire->ire_refcnt >= 1); 557 ASSERT(ire->ire_ipversion == IPV6_VERSION); 558 559 ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); 560 561 if (ire->ire_type == IRE_DEFAULT) { 562 /* 563 * when a default gateway is going away 564 * delete all the host redirects pointing at that 565 * gateway. 566 */ 567 mutex_enter(&ire->ire_lock); 568 gw_addr_v6 = ire->ire_gateway_addr_v6; 569 mutex_exit(&ire->ire_lock); 570 ire_delete_host_redirects_v6(&gw_addr_v6, ipst); 571 } 572 573 /* 574 * If we are deleting an IRE_INTERFACE then we make sure we also 575 * delete any IRE_IF_CLONE that has been created from it. 576 * Those are always in ire_dep_children. 577 */ 578 if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) 579 ire_dep_delete_if_clone(ire); 580 581 /* Remove from parent dependencies and child */ 582 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 583 if (ire->ire_dep_parent != NULL) { 584 ire_dep_remove(ire); 585 } 586 while (ire->ire_dep_children != NULL) 587 ire_dep_remove(ire->ire_dep_children); 588 rw_exit(&ipst->ips_ire_dep_lock); 589 } 590 591 /* 592 * When an IRE is added or deleted this routine is called to make sure 593 * any caching of IRE information is notified or updated. 594 * 595 * The flag argument indicates if the flush request is due to addition 596 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 597 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 598 */ 599 void 600 ire_flush_cache_v6(ire_t *ire, int flag) 601 { 602 ip_stack_t *ipst = ire->ire_ipst; 603 604 /* 605 * IRE_IF_CLONE ire's don't provide any new information 606 * than the parent from which they are cloned, so don't 607 * perturb the generation numbers. 608 */ 609 if (ire->ire_type & IRE_IF_CLONE) 610 return; 611 612 /* 613 * Ensure that an ire_add during a lookup serializes the updates of 614 * the generation numbers under ire_head_lock so that the lookup gets 615 * either the old ire and old generation number, or a new ire and new 616 * generation number. 617 */ 618 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 619 620 /* 621 * If a route was just added, we need to notify everybody that 622 * has cached an IRE_NOROUTE since there might now be a better 623 * route for them. 624 */ 625 if (flag == IRE_FLUSH_ADD) { 626 ire_increment_generation(ipst->ips_ire_reject_v6); 627 ire_increment_generation(ipst->ips_ire_blackhole_v6); 628 } 629 630 /* Adding a default can't otherwise provide a better route */ 631 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 632 rw_exit(&ipst->ips_ip6_ire_head_lock); 633 return; 634 } 635 636 switch (flag) { 637 case IRE_FLUSH_DELETE: 638 case IRE_FLUSH_GWCHANGE: 639 /* 640 * Update ire_generation for all ire_dep_children chains 641 * starting with this IRE 642 */ 643 ire_dep_incr_generation(ire); 644 break; 645 case IRE_FLUSH_ADD: { 646 in6_addr_t addr; 647 in6_addr_t mask; 648 ip_stack_t *ipst = ire->ire_ipst; 649 uint_t masklen; 650 651 /* 652 * Find an IRE which is a shorter match than the ire to be added 653 * For any such IRE (which we repeat) we update the 654 * ire_generation the same way as in the delete case. 655 */ 656 addr = ire->ire_addr_v6; 657 mask = ire->ire_mask_v6; 658 masklen = ip_mask_to_plen_v6(&mask); 659 660 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, 661 ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 662 while (ire != NULL) { 663 /* We need to handle all in the same bucket */ 664 irb_increment_generation(ire->ire_bucket); 665 666 mask = ire->ire_mask_v6; 667 ASSERT(masklen > ip_mask_to_plen_v6(&mask)); 668 masklen = ip_mask_to_plen_v6(&mask); 669 ire_refrele(ire); 670 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, 671 NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 672 } 673 } 674 break; 675 } 676 rw_exit(&ipst->ips_ip6_ire_head_lock); 677 } 678 679 /* 680 * Matches the arguments passed with the values in the ire. 681 * 682 * Note: for match types that match using "ill" passed in, ill 683 * must be checked for non-NULL before calling this routine. 684 */ 685 boolean_t 686 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, 687 const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, 688 const ts_label_t *tsl, int match_flags) 689 { 690 in6_addr_t gw_addr_v6; 691 ill_t *ire_ill = NULL, *dst_ill; 692 ip_stack_t *ipst = ire->ire_ipst; 693 694 ASSERT(ire->ire_ipversion == IPV6_VERSION); 695 ASSERT(addr != NULL); 696 ASSERT(mask != NULL); 697 ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); 698 ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) || 699 (ill != NULL && ill->ill_isv6)); 700 701 /* 702 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it 703 * is in fact hidden, to ensure the caller gets the right one. 704 */ 705 if (ire->ire_testhidden) { 706 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) 707 return (B_FALSE); 708 } 709 710 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 711 ire->ire_zoneid != ALL_ZONES) { 712 /* 713 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid 714 * does not match that of ire_zoneid, a failure to 715 * match is reported at this point. Otherwise, since some IREs 716 * that are available in the global zone can be used in local 717 * zones, additional checks need to be performed: 718 * 719 * IRE_LOOPBACK 720 * entries should never be matched in this situation. 721 * Each zone has its own IRE_LOOPBACK. 722 * 723 * IRE_LOCAL 724 * We allow them for any zoneid. ire_route_recursive 725 * does additional checks when 726 * ip_restrict_interzone_loopback is set. 727 * 728 * If ill_usesrc_ifindex is set 729 * Then we check if the zone has a valid source address 730 * on the usesrc ill. 731 * 732 * If ire_ill is set, then check that the zone has an ipif 733 * on that ill. 734 * 735 * Outside of this function (in ire_round_robin) we check 736 * that any IRE_OFFLINK has a gateway that reachable from the 737 * zone when we have multiple choices (ECMP). 738 */ 739 if (match_flags & MATCH_IRE_ZONEONLY) 740 return (B_FALSE); 741 if (ire->ire_type & IRE_LOOPBACK) 742 return (B_FALSE); 743 744 if (ire->ire_type & IRE_LOCAL) 745 goto matchit; 746 747 /* 748 * The normal case of IRE_ONLINK has a matching zoneid. 749 * Here we handle the case when shared-IP zones have been 750 * configured with IP addresses on vniN. In that case it 751 * is ok for traffic from a zone to use IRE_ONLINK routes 752 * if the ill has a usesrc pointing at vniN 753 * Applies to IRE_INTERFACE. 754 */ 755 dst_ill = ire->ire_ill; 756 if (ire->ire_type & IRE_ONLINK) { 757 uint_t ifindex; 758 759 /* 760 * Note there is no IRE_INTERFACE on vniN thus 761 * can't do an IRE lookup for a matching route. 762 */ 763 ifindex = dst_ill->ill_usesrc_ifindex; 764 if (ifindex == 0) 765 return (B_FALSE); 766 767 /* 768 * If there is a usable source address in the 769 * zone, then it's ok to return this IRE_INTERFACE 770 */ 771 if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 772 zoneid, ipst)) { 773 ip3dbg(("ire_match_args: no usrsrc for zone" 774 " dst_ill %p\n", (void *)dst_ill)); 775 return (B_FALSE); 776 } 777 } 778 /* 779 * For example, with 780 * route add 11.0.0.0 gw1 -ifp bge0 781 * route add 11.0.0.0 gw2 -ifp bge1 782 * this code would differentiate based on 783 * where the sending zone has addresses. 784 * Only if the zone has an address on bge0 can it use the first 785 * route. It isn't clear if this behavior is documented 786 * anywhere. 787 */ 788 if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 789 ipif_t *tipif; 790 791 mutex_enter(&dst_ill->ill_lock); 792 for (tipif = dst_ill->ill_ipif; 793 tipif != NULL; tipif = tipif->ipif_next) { 794 if (!IPIF_IS_CONDEMNED(tipif) && 795 (tipif->ipif_flags & IPIF_UP) && 796 (tipif->ipif_zoneid == zoneid || 797 tipif->ipif_zoneid == ALL_ZONES)) 798 break; 799 } 800 mutex_exit(&dst_ill->ill_lock); 801 if (tipif == NULL) 802 return (B_FALSE); 803 } 804 } 805 806 matchit: 807 ire_ill = ire->ire_ill; 808 if (match_flags & MATCH_IRE_GW) { 809 mutex_enter(&ire->ire_lock); 810 gw_addr_v6 = ire->ire_gateway_addr_v6; 811 mutex_exit(&ire->ire_lock); 812 } 813 if (match_flags & MATCH_IRE_ILL) { 814 815 /* 816 * If asked to match an ill, we *must* match 817 * on the ire_ill for ipmp test addresses, or 818 * any of the ill in the group for data addresses. 819 * If we don't, we may as well fail. 820 * However, we need an exception for IRE_LOCALs to ensure 821 * we loopback packets even sent to test addresses on different 822 * interfaces in the group. 823 */ 824 if ((match_flags & MATCH_IRE_TESTHIDDEN) && 825 !(ire->ire_type & IRE_LOCAL)) { 826 if (ire->ire_ill != ill) 827 return (B_FALSE); 828 } else { 829 match_flags &= ~MATCH_IRE_TESTHIDDEN; 830 /* 831 * We know that ill is not NULL, but ire_ill could be 832 * NULL 833 */ 834 if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) 835 return (B_FALSE); 836 } 837 } 838 if (match_flags & MATCH_IRE_SRC_ILL) { 839 if (ire_ill == NULL) 840 return (B_FALSE); 841 if (!IS_ON_SAME_LAN(ill, ire_ill)) { 842 if (ire_ill->ill_usesrc_ifindex == 0 || 843 (ire_ill->ill_usesrc_ifindex != 844 ill->ill_phyint->phyint_ifindex)) 845 return (B_FALSE); 846 } 847 } 848 849 /* No ire_addr_v6 bits set past the mask */ 850 ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, 851 ire->ire_addr_v6)); 852 if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) && 853 ((!(match_flags & MATCH_IRE_GW)) || 854 ((!(match_flags & MATCH_IRE_DIRECT)) || 855 !(ire->ire_flags & RTF_INDIRECT)) && 856 IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) && 857 ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && 858 ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && 859 ((!(match_flags & MATCH_IRE_MASK)) || 860 (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) && 861 ((!(match_flags & MATCH_IRE_SECATTR)) || 862 (!is_system_labeled()) || 863 (tsol_ire_match_gwattr(ire, tsl) == 0))) { 864 /* We found the matched IRE */ 865 return (B_TRUE); 866 } 867 return (B_FALSE); 868 } 869 870 /* 871 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified 872 * gateway address. If ill is non-NULL we also match on it. 873 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 874 */ 875 boolean_t 876 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill, 877 const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) 878 { 879 ire_t *ire; 880 uint_t match_flags; 881 882 if (lock_held) 883 ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock)); 884 else 885 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 886 887 match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; 888 if (ill != NULL) 889 match_flags |= MATCH_IRE_ILL; 890 891 ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros, 892 &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags, 893 ipst); 894 895 if (!lock_held) 896 rw_exit(&ipst->ips_ip6_ire_head_lock); 897 if (ire != NULL) { 898 ire_refrele(ire); 899 return (B_TRUE); 900 } else { 901 return (B_FALSE); 902 } 903 } 904 905 /* 906 * Lookup a route in forwarding table. 907 * specific lookup is indicated by passing the 908 * required parameters and indicating the 909 * match required in flag field. 910 * 911 * Supports link-local addresses by following the ipif/ill when recursing. 912 */ 913 ire_t * 914 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, 915 const in6_addr_t *gateway, int type, const ill_t *ill, 916 zoneid_t zoneid, const ts_label_t *tsl, int flags, 917 uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 918 { 919 ire_t *ire = NULL; 920 921 ASSERT(addr != NULL); 922 ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL); 923 ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL); 924 ASSERT(ill == NULL || ill->ill_isv6); 925 926 ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); 927 928 /* 929 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL 930 * or MATCH_IRE_SRC_ILL is set. 931 */ 932 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL)) 933 return (NULL); 934 935 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 936 ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid, 937 tsl, flags, ipst); 938 if (ire == NULL) { 939 rw_exit(&ipst->ips_ip6_ire_head_lock); 940 return (NULL); 941 } 942 943 /* 944 * round-robin only if we have more than one route in the bucket. 945 * ips_ip_ecmp_behavior controls when we do ECMP 946 * 2: always 947 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 948 * 0: never 949 * 950 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 951 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 952 * and the IRE_INTERFACESs are likely to be shorter matches. 953 */ 954 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 955 if (ipst->ips_ip_ecmp_behavior == 2 || 956 (ipst->ips_ip_ecmp_behavior == 1 && 957 IS_DEFAULT_ROUTE_V6(ire))) { 958 ire_t *next_ire; 959 ire_ftable_args_t margs; 960 961 bzero(&margs, sizeof (margs)); 962 margs.ift_addr_v6 = *addr; 963 if (mask != NULL) 964 margs.ift_mask_v6 = *mask; 965 if (gateway != NULL) 966 margs.ift_gateway_v6 = *gateway; 967 margs.ift_type = type; 968 margs.ift_ill = ill; 969 margs.ift_zoneid = zoneid; 970 margs.ift_tsl = tsl; 971 margs.ift_flags = flags; 972 973 next_ire = ire_round_robin(ire->ire_bucket, &margs, 974 xmit_hint, ire, ipst); 975 if (next_ire == NULL) { 976 /* keep ire if next_ire is null */ 977 goto done; 978 } 979 ire_refrele(ire); 980 ire = next_ire; 981 } 982 } 983 984 done: 985 /* Return generation before dropping lock */ 986 if (generationp != NULL) 987 *generationp = ire->ire_generation; 988 989 rw_exit(&ipst->ips_ip6_ire_head_lock); 990 991 /* 992 * For shared-IP zones we need additional checks to what was 993 * done in ire_match_args to make sure IRE_LOCALs are handled. 994 * 995 * When ip_restrict_interzone_loopback is set, then 996 * we ensure that IRE_LOCAL are only used for loopback 997 * between zones when the logical "Ethernet" would 998 * have looped them back. That is, if in the absense of 999 * the IRE_LOCAL we would have sent to packet out the 1000 * same ill. 1001 */ 1002 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 1003 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 1004 ipst->ips_ip_restrict_interzone_loopback) { 1005 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 1006 ASSERT(ire != NULL); 1007 } 1008 1009 return (ire); 1010 } 1011 1012 /* 1013 * Look up a single ire. The caller holds either the read or write lock. 1014 */ 1015 ire_t * 1016 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 1017 const in6_addr_t *gateway, int type, const ill_t *ill, 1018 zoneid_t zoneid, const ts_label_t *tsl, int flags, 1019 ip_stack_t *ipst) 1020 { 1021 irb_t *irb_ptr; 1022 ire_t *ire = NULL; 1023 int i; 1024 1025 ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock)); 1026 1027 /* 1028 * If the mask is known, the lookup 1029 * is simple, if the mask is not known 1030 * we need to search. 1031 */ 1032 if (flags & MATCH_IRE_MASK) { 1033 uint_t masklen; 1034 1035 masklen = ip_mask_to_plen_v6(mask); 1036 if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) { 1037 return (NULL); 1038 } 1039 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][ 1040 IRE_ADDR_MASK_HASH_V6(*addr, *mask, 1041 ipst->ips_ip6_ftable_hash_size)]); 1042 rw_enter(&irb_ptr->irb_lock, RW_READER); 1043 for (ire = irb_ptr->irb_ire; ire != NULL; 1044 ire = ire->ire_next) { 1045 if (IRE_IS_CONDEMNED(ire)) 1046 continue; 1047 if (ire_match_args_v6(ire, addr, mask, gateway, type, 1048 ill, zoneid, tsl, flags)) 1049 goto found_ire; 1050 } 1051 rw_exit(&irb_ptr->irb_lock); 1052 } else { 1053 uint_t masklen; 1054 1055 /* 1056 * In this case we don't know the mask, we need to 1057 * search the table assuming different mask sizes. 1058 */ 1059 if (flags & MATCH_IRE_SHORTERMASK) { 1060 masklen = ip_mask_to_plen_v6(mask); 1061 if (masklen == 0) { 1062 /* Nothing shorter than zero */ 1063 return (NULL); 1064 } 1065 masklen--; 1066 } else { 1067 masklen = IP6_MASK_TABLE_SIZE - 1; 1068 } 1069 1070 for (i = masklen; i >= 0; i--) { 1071 in6_addr_t tmpmask; 1072 1073 if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) 1074 continue; 1075 (void) ip_plen_to_mask_v6(i, &tmpmask); 1076 irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][ 1077 IRE_ADDR_MASK_HASH_V6(*addr, tmpmask, 1078 ipst->ips_ip6_ftable_hash_size)]; 1079 rw_enter(&irb_ptr->irb_lock, RW_READER); 1080 for (ire = irb_ptr->irb_ire; ire != NULL; 1081 ire = ire->ire_next) { 1082 if (IRE_IS_CONDEMNED(ire)) 1083 continue; 1084 if (ire_match_args_v6(ire, addr, 1085 &ire->ire_mask_v6, gateway, type, ill, 1086 zoneid, tsl, flags)) 1087 goto found_ire; 1088 } 1089 rw_exit(&irb_ptr->irb_lock); 1090 } 1091 } 1092 ASSERT(ire == NULL); 1093 ip1dbg(("ire_ftable_lookup_v6: returning NULL ire")); 1094 return (NULL); 1095 1096 found_ire: 1097 ire_refhold(ire); 1098 rw_exit(&irb_ptr->irb_lock); 1099 return (ire); 1100 } 1101 1102 1103 /* 1104 * This function is called by 1105 * ip_input/ire_route_recursive when doing a route lookup on only the 1106 * destination address. 1107 * 1108 * The optimizations of this function over ire_ftable_lookup are: 1109 * o removing unnecessary flag matching 1110 * o doing longest prefix match instead of overloading it further 1111 * with the unnecessary "best_prefix_match" 1112 * 1113 * If no route is found we return IRE_NOROUTE. 1114 */ 1115 ire_t * 1116 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, 1117 ip_stack_t *ipst, uint_t *generationp) 1118 { 1119 ire_t *ire; 1120 1121 ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, 1122 MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); 1123 if (ire == NULL) { 1124 ire = ire_reject(ipst, B_TRUE); 1125 if (generationp != NULL) 1126 *generationp = IRE_GENERATION_VERIFY; 1127 } 1128 /* ftable_lookup did round robin */ 1129 return (ire); 1130 } 1131 1132 ire_t * 1133 ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src, 1134 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, 1135 int *errorp, boolean_t *multirtp) 1136 { 1137 ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); 1138 1139 return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp, 1140 multirtp)); 1141 } 1142 1143 /* 1144 * Recursively look for a route to the destination. Can also match on 1145 * the zoneid, ill, and label. Used for the data paths. See also 1146 * ire_route_recursive_dstonly. 1147 * 1148 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1149 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1150 * forwarding. 1151 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1152 * resolve the gateway. 1153 * 1154 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1155 * instead. 1156 * 1157 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1158 * is an error. 1159 * Allow at most one RTF_INDIRECT. 1160 */ 1161 ire_t * 1162 ire_route_recursive_impl_v6(ire_t *ire, 1163 const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, 1164 zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1165 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1166 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1167 { 1168 int i, j; 1169 in6_addr_t v6nexthop = *nexthop; 1170 ire_t *ires[MAX_IRE_RECURSION]; 1171 uint_t generation; 1172 uint_t generations[MAX_IRE_RECURSION]; 1173 boolean_t need_refrele = B_FALSE; 1174 boolean_t invalidate = B_FALSE; 1175 ill_t *ill = NULL; 1176 uint_t maskoff = (IRE_LOCAL|IRE_LOOPBACK); 1177 1178 if (setsrcp != NULL) 1179 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1180 if (gwattrp != NULL) 1181 ASSERT(*gwattrp == NULL); 1182 1183 /* 1184 * We iterate up to three times to resolve a route, even though 1185 * we have four slots in the array. The extra slot is for an 1186 * IRE_IF_CLONE we might need to create. 1187 */ 1188 i = 0; 1189 while (i < MAX_IRE_RECURSION - 1) { 1190 /* ire_ftable_lookup handles round-robin/ECMP */ 1191 if (ire == NULL) { 1192 ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, 1193 (ill != NULL ? ill : ill_arg), zoneid, tsl, 1194 match_args, xmit_hint, ipst, &generation); 1195 } else { 1196 /* Caller passed it; extra hold since we will rele */ 1197 ire_refhold(ire); 1198 if (generationp != NULL) 1199 generation = *generationp; 1200 else 1201 generation = IRE_GENERATION_VERIFY; 1202 } 1203 1204 if (ire == NULL) { 1205 if (i > 0 && (irr_flags & IRR_INCOMPLETE)) { 1206 ire = ires[0]; 1207 ire_refhold(ire); 1208 } else { 1209 ire = ire_reject(ipst, B_TRUE); 1210 } 1211 goto error; 1212 } 1213 1214 /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1215 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1216 goto error; 1217 1218 ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1219 /* 1220 * Verify that the IRE_IF_CLONE has a consistent generation 1221 * number. 1222 */ 1223 if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) { 1224 ire_refrele(ire); 1225 ire = NULL; 1226 continue; 1227 } 1228 1229 /* 1230 * Don't allow anything unusual past the first iteration. 1231 * After the first lookup, we should no longer look for 1232 * (IRE_LOCAL|IRE_LOOPBACK) or RTF_INDIRECT routes. 1233 * 1234 * In addition, after we have found a direct IRE_OFFLINK, 1235 * we should only look for interface or clone routes. 1236 */ 1237 match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */ 1238 if ((ire->ire_type & IRE_OFFLINK) && 1239 !(ire->ire_flags & RTF_INDIRECT)) { 1240 ire_type = IRE_IF_ALL; 1241 } else { 1242 if (!(match_args & MATCH_IRE_TYPE)) 1243 ire_type = (IRE_OFFLINK|IRE_ONLINK); 1244 ire_type &= ~maskoff; /* no more LOCAL, LOOPBACK */ 1245 } 1246 match_args |= MATCH_IRE_TYPE; 1247 /* We have a usable IRE */ 1248 ires[i] = ire; 1249 generations[i] = generation; 1250 i++; 1251 1252 /* The first RTF_SETSRC address is passed back if setsrcp */ 1253 if ((ire->ire_flags & RTF_SETSRC) && 1254 setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { 1255 ASSERT(!IN6_IS_ADDR_UNSPECIFIED( 1256 &ire->ire_setsrc_addr_v6)); 1257 *setsrcp = ire->ire_setsrc_addr_v6; 1258 } 1259 1260 /* The first ire_gw_secattr is passed back if gwattrp */ 1261 if (ire->ire_gw_secattr != NULL && 1262 gwattrp != NULL && *gwattrp == NULL) 1263 *gwattrp = ire->ire_gw_secattr; 1264 1265 /* 1266 * Check if we have a short-cut pointer to an IRE for this 1267 * destination, and that the cached dependency isn't stale. 1268 * In that case we've rejoined an existing tree towards a 1269 * parent, thus we don't need to continue the loop to 1270 * discover the rest of the tree. 1271 */ 1272 mutex_enter(&ire->ire_lock); 1273 if (ire->ire_dep_parent != NULL && 1274 ire->ire_dep_parent->ire_generation == 1275 ire->ire_dep_parent_generation) { 1276 mutex_exit(&ire->ire_lock); 1277 ire = NULL; 1278 goto done; 1279 } 1280 mutex_exit(&ire->ire_lock); 1281 1282 /* 1283 * If this type should have an ire_nce_cache (even if it 1284 * doesn't yet have one) then we are done. Includes 1285 * IRE_INTERFACE with a full 128 bit mask. 1286 */ 1287 if (ire->ire_nce_capable) { 1288 ire = NULL; 1289 goto done; 1290 } 1291 ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1292 /* 1293 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1294 * particular destination 1295 */ 1296 if (ire->ire_type & IRE_INTERFACE) { 1297 ire_t *clone; 1298 1299 ASSERT(ire->ire_masklen != IPV6_ABITS); 1300 1301 /* 1302 * In the case of ip_input and ILLF_FORWARDING not 1303 * being set, and in the case of RTM_GET, there is 1304 * no point in allocating an IRE_IF_CLONE. We return 1305 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can 1306 * result in a ire_dep_parent which is IRE_IF_* 1307 * without an IRE_IF_CLONE. 1308 * We recover from that when we need to send packets 1309 * by ensuring that the generations become 1310 * IRE_GENERATION_VERIFY in this case. 1311 */ 1312 if (!(irr_flags & IRR_ALLOCATE)) { 1313 invalidate = B_TRUE; 1314 ire = NULL; 1315 goto done; 1316 } 1317 1318 clone = ire_create_if_clone(ire, &v6nexthop, 1319 &generation); 1320 if (clone == NULL) { 1321 /* 1322 * Temporary failure - no memory. 1323 * Don't want caller to cache IRE_NOROUTE. 1324 */ 1325 invalidate = B_TRUE; 1326 ire = ire_blackhole(ipst, B_TRUE); 1327 goto error; 1328 } 1329 /* 1330 * Make clone next to last entry and the 1331 * IRE_INTERFACE the last in the dependency 1332 * chain since the clone depends on the 1333 * IRE_INTERFACE. 1334 */ 1335 ASSERT(i >= 1); 1336 ASSERT(i < MAX_IRE_RECURSION); 1337 1338 ires[i] = ires[i-1]; 1339 generations[i] = generations[i-1]; 1340 ires[i-1] = clone; 1341 generations[i-1] = generation; 1342 i++; 1343 1344 ire = NULL; 1345 goto done; 1346 } 1347 1348 /* 1349 * We only match on the type and optionally ILL when 1350 * recursing. The type match is used by some callers 1351 * to exclude certain types (such as IRE_IF_CLONE or 1352 * IRE_LOCAL|IRE_LOOPBACK). 1353 * 1354 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof' 1355 * ire->ire_ill, and we want to find the IRE_INTERFACE for 1356 * ire_ill, so we set ill to the ire_ill 1357 */ 1358 match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT); 1359 v6nexthop = ire->ire_gateway_addr_v6; 1360 if (ill == NULL && ire->ire_ill != NULL) { 1361 ill = ire->ire_ill; 1362 need_refrele = B_TRUE; 1363 ill_refhold(ill); 1364 match_args |= MATCH_IRE_ILL; 1365 } 1366 ire = NULL; 1367 } 1368 ASSERT(ire == NULL); 1369 ire = ire_reject(ipst, B_TRUE); 1370 1371 error: 1372 ASSERT(ire != NULL); 1373 if (need_refrele) 1374 ill_refrele(ill); 1375 1376 /* 1377 * In the case of MULTIRT we want to try a different IRE the next 1378 * time. We let the next packet retry in that case. 1379 */ 1380 if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1381 (void) ire_no_good(ires[0]); 1382 1383 cleanup: 1384 /* cleanup ires[i] */ 1385 ire_dep_unbuild(ires, i); 1386 for (j = 0; j < i; j++) 1387 ire_refrele(ires[j]); 1388 1389 ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1390 (irr_flags & IRR_INCOMPLETE)); 1391 /* 1392 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1393 * ip_select_route since the reject or lack of memory might be gone. 1394 */ 1395 if (generationp != NULL) 1396 *generationp = IRE_GENERATION_VERIFY; 1397 return (ire); 1398 1399 done: 1400 ASSERT(ire == NULL); 1401 if (need_refrele) 1402 ill_refrele(ill); 1403 1404 /* Build dependencies */ 1405 if (i > 1 && !ire_dep_build(ires, generations, i)) { 1406 /* Something in chain was condemned; tear it apart */ 1407 ire = ire_blackhole(ipst, B_TRUE); 1408 goto cleanup; 1409 } 1410 1411 /* 1412 * Release all refholds except the one for ires[0] that we 1413 * will return to the caller. 1414 */ 1415 for (j = 1; j < i; j++) 1416 ire_refrele(ires[j]); 1417 1418 if (invalidate) { 1419 /* 1420 * Since we needed to allocate but couldn't we need to make 1421 * sure that the dependency chain is rebuilt the next time. 1422 */ 1423 ire_dep_invalidate_generations(ires[0]); 1424 generation = IRE_GENERATION_VERIFY; 1425 } else { 1426 /* 1427 * IREs can have been added or deleted while we did the 1428 * recursive lookup and we can't catch those until we've built 1429 * the dependencies. We verify the stored 1430 * ire_dep_parent_generation to catch any such changes and 1431 * return IRE_GENERATION_VERIFY (which will cause 1432 * ip_select_route to be called again so we can redo the 1433 * recursive lookup next time we send a packet. 1434 */ 1435 if (ires[0]->ire_dep_parent == NULL) 1436 generation = ires[0]->ire_generation; 1437 else 1438 generation = ire_dep_validate_generations(ires[0]); 1439 if (generations[0] != ires[0]->ire_generation) { 1440 /* Something changed at the top */ 1441 generation = IRE_GENERATION_VERIFY; 1442 } 1443 } 1444 if (generationp != NULL) 1445 *generationp = generation; 1446 1447 return (ires[0]); 1448 } 1449 1450 ire_t * 1451 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, 1452 const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1453 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1454 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1455 { 1456 return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, 1457 zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp, 1458 gwattrp, generationp)); 1459 } 1460 1461 /* 1462 * Recursively look for a route to the destination. 1463 * We only handle a destination match here, yet we have the same arguments 1464 * as the full match to allow function pointers to select between the two. 1465 * 1466 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1467 * instead. 1468 * 1469 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1470 * is an error. 1471 * Allow at most one RTF_INDIRECT. 1472 */ 1473 ire_t * 1474 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags, 1475 uint32_t xmit_hint, ip_stack_t *ipst) 1476 { 1477 ire_t *ire; 1478 ire_t *ire1; 1479 uint_t generation; 1480 1481 /* ire_ftable_lookup handles round-robin/ECMP */ 1482 ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, 1483 &generation); 1484 ASSERT(ire != NULL); 1485 1486 /* 1487 * If the IRE has a current cached parent we know that the whole 1488 * parent chain is current, hence we don't need to discover and 1489 * build any dependencies by doing a recursive lookup. 1490 */ 1491 mutex_enter(&ire->ire_lock); 1492 if (ire->ire_dep_parent != NULL) { 1493 if (ire->ire_dep_parent->ire_generation == 1494 ire->ire_dep_parent_generation) { 1495 mutex_exit(&ire->ire_lock); 1496 return (ire); 1497 } 1498 mutex_exit(&ire->ire_lock); 1499 } else { 1500 mutex_exit(&ire->ire_lock); 1501 /* 1502 * If this type should have an ire_nce_cache (even if it 1503 * doesn't yet have one) then we are done. Includes 1504 * IRE_INTERFACE with a full 128 bit mask. 1505 */ 1506 if (ire->ire_nce_capable) 1507 return (ire); 1508 } 1509 1510 /* 1511 * Fallback to loop in the normal code starting with the ire 1512 * we found. Normally this would return the same ire. 1513 */ 1514 ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, 1515 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1516 &generation); 1517 ire_refrele(ire); 1518 return (ire1); 1519 } 1520