1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/sysmacros.h> 28 #include <sys/kmem.h> 29 #include <sys/ksynch.h> 30 #include <sys/systm.h> 31 #include <sys/socket.h> 32 #include <sys/disp.h> 33 #include <sys/taskq.h> 34 #include <sys/cmn_err.h> 35 #include <sys/strsun.h> 36 #include <sys/sdt.h> 37 #include <sys/atomic.h> 38 #include <netinet/in.h> 39 #include <inet/ip.h> 40 #include <inet/ip6.h> 41 #include <inet/tcp.h> 42 #include <inet/udp_impl.h> 43 #include <inet/kstatcom.h> 44 45 #include <inet/ilb_ip.h> 46 #include "ilb_alg.h" 47 #include "ilb_nat.h" 48 #include "ilb_conn.h" 49 50 /* ILB kmem cache flag */ 51 int ilb_kmem_flags = 0; 52 53 /* 54 * The default size for the different hash tables. Global for all stacks. 55 * But each stack has its own table, just that their sizes are the same. 56 */ 57 static size_t ilb_rule_hash_size = 2048; 58 59 static size_t ilb_conn_hash_size = 262144; 60 61 static size_t ilb_sticky_hash_size = 262144; 62 63 /* This should be a prime number. 
*/ 64 static size_t ilb_nat_src_hash_size = 97; 65 66 /* Default NAT cache entry expiry time. */ 67 static uint32_t ilb_conn_tcp_expiry = 120; 68 static uint32_t ilb_conn_udp_expiry = 60; 69 70 /* Default sticky entry expiry time. */ 71 static uint32_t ilb_sticky_expiry = 60; 72 73 /* addr is assumed to be a uint8_t * to an ipaddr_t. */ 74 #define ILB_RULE_HASH(addr, hash_size) \ 75 ((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \ 76 *(addr)) & ((hash_size) - 1)) 77 78 /* 79 * Note on ILB delayed processing 80 * 81 * To avoid in line removal on some of the data structures, such as rules, 82 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq. 83 * There are three types of ILB taskq: 84 * 85 * 1. rule handling: created at stack initialialization time, ilb_stack_init() 86 * 2. conn hash handling: created at conn hash initialization time, 87 * ilb_conn_hash_init() 88 * 3. sticky hash handling: created at sticky hash initialization time, 89 * ilb_sticky_hash_init() 90 * 91 * The rule taskq is for processing rule and server removal. When a user 92 * land rule/server removal request comes in, a taskq is dispatched after 93 * removing the rule/server from all related hashes. This taskq will wait 94 * until all references to the rule/server are gone before removing it. 95 * So the user land thread requesting the removal does not need to wait 96 * for the removal completion. 97 * 98 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and 99 * ilb_sticky_hash table entry removal. There are ilb_conn_timer_size timers 100 * and ilb_sticky_timer_size timers running for ilb_conn_hash and 101 * ilb_sticky_hash cleanup respectively. Each timer is responsible for one 102 * portion (same size) of the hash table. When a timer fires, it dispatches 103 * a conn hash taskq to clean up its portion of the table. This avoids in 104 * line processing of the removal. 
105 * 106 * There is another delayed processing, the clean up of NAT source address 107 * table. We just use the timer to directly handle it instead of using 108 * a taskq. The reason is that the table is small so it is OK to use the 109 * timer. 110 */ 111 112 /* ILB rule taskq constants. */ 113 #define ILB_RULE_TASKQ_NUM_THR 20 114 115 /* Argument passed to ILB rule taskq routines. */ 116 typedef struct { 117 ilb_stack_t *ilbs; 118 ilb_rule_t *rule; 119 } ilb_rule_tq_t; 120 121 /* kstat handling routines. */ 122 static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *); 123 static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *); 124 static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *); 125 static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *, 126 ilb_server_t *); 127 128 /* Rule hash handling routines. */ 129 static void ilb_rule_hash_init(ilb_stack_t *); 130 static void ilb_rule_hash_fini(ilb_stack_t *); 131 static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *); 132 static void ilb_rule_hash_del(ilb_rule_t *); 133 static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *, 134 in_port_t, zoneid_t, uint32_t, boolean_t *); 135 136 static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *); 137 static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *); 138 static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *); 139 static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *, 140 int *); 141 static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int, 142 int, in_port_t, in_port_t, const in6_addr_t *); 143 144 /* Back end server handling routines. */ 145 static void ilb_server_free(ilb_server_t *); 146 147 /* Network stack handling routines. */ 148 static void *ilb_stack_init(netstackid_t, netstack_t *); 149 static void ilb_stack_shutdown(netstackid_t, void *); 150 static void ilb_stack_fini(netstackid_t, void *); 151 152 /* Sticky connection handling routines. 
*/ 153 static void ilb_rule_sticky_init(ilb_rule_t *); 154 static void ilb_rule_sticky_fini(ilb_rule_t *); 155 156 /* Handy macro to check for unspecified address. */ 157 #define IS_ADDR_UNSPEC(addr) \ 158 (IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) : \ 159 IN6_IS_ADDR_UNSPECIFIED(addr)) 160 161 /* 162 * Global kstat instance counter. When a rule is created, its kstat instance 163 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is 164 * incremented. 165 */ 166 static uint_t ilb_kstat_instance = 0; 167 168 /* 169 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME. 170 * A rule's kstat has ILB_RULE_KS_CNAME class name. 171 */ 172 #define ILB_G_KS_NAME "global" 173 #define ILB_G_KS_CNAME "kstat" 174 #define ILB_RULE_KS_CNAME "rulestat" 175 176 static kstat_t * 177 ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs) 178 { 179 kstat_t *ksp; 180 ilb_g_kstat_t template = { 181 { "num_rules", KSTAT_DATA_UINT64, 0 }, 182 { "ip_frag_in", KSTAT_DATA_UINT64, 0 }, 183 { "ip_frag_dropped", KSTAT_DATA_UINT64, 0 } 184 }; 185 186 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME, 187 ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t), 188 KSTAT_FLAG_VIRTUAL, stackid); 189 if (ksp == NULL) 190 return (NULL); 191 bcopy(&template, ilbs->ilbs_kstat, sizeof (template)); 192 ksp->ks_data = ilbs->ilbs_kstat; 193 ksp->ks_private = (void *)(uintptr_t)stackid; 194 195 kstat_install(ksp); 196 return (ksp); 197 } 198 199 static void 200 ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs) 201 { 202 if (ilbs->ilbs_ksp != NULL) { 203 ASSERT(stackid == (netstackid_t)(uintptr_t) 204 ilbs->ilbs_ksp->ks_private); 205 kstat_delete_netstack(ilbs->ilbs_ksp, stackid); 206 ilbs->ilbs_ksp = NULL; 207 } 208 } 209 210 static kstat_t * 211 ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule) 212 { 213 kstat_t *ksp; 214 ilb_rule_kstat_t template = { 215 { "num_servers", KSTAT_DATA_UINT64, 0 }, 216 { 
"bytes_not_processed", KSTAT_DATA_UINT64, 0 }, 217 { "pkt_not_processed", KSTAT_DATA_UINT64, 0 }, 218 { "bytes_dropped", KSTAT_DATA_UINT64, 0 }, 219 { "pkt_dropped", KSTAT_DATA_UINT64, 0 }, 220 { "nomem_bytes_dropped", KSTAT_DATA_UINT64, 0 }, 221 { "nomem_pkt_dropped", KSTAT_DATA_UINT64, 0 }, 222 { "noport_bytes_dropped", KSTAT_DATA_UINT64, 0 }, 223 { "noport_pkt_dropped", KSTAT_DATA_UINT64, 0 }, 224 { "icmp_echo_processed", KSTAT_DATA_UINT64, 0 }, 225 { "icmp_dropped", KSTAT_DATA_UINT64, 0 }, 226 { "icmp_too_big_processed", KSTAT_DATA_UINT64, 0 }, 227 { "icmp_too_big_dropped", KSTAT_DATA_UINT64, 0 } 228 }; 229 230 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance, 231 rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED, 232 NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid); 233 if (ksp == NULL) 234 return (NULL); 235 236 bcopy(&template, &rule->ir_kstat, sizeof (template)); 237 ksp->ks_data = &rule->ir_kstat; 238 ksp->ks_private = (void *)(uintptr_t)stackid; 239 240 kstat_install(ksp); 241 return (ksp); 242 } 243 244 static kstat_t * 245 ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule, 246 ilb_server_t *server) 247 { 248 kstat_t *ksp; 249 ilb_server_kstat_t template = { 250 { "bytes_processed", KSTAT_DATA_UINT64, 0 }, 251 { "pkt_processed", KSTAT_DATA_UINT64, 0 }, 252 { "ip_address", KSTAT_DATA_STRING, 0 } 253 }; 254 char cname_buf[KSTAT_STRLEN]; 255 256 /* 7 is "-sstat" */ 257 ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN); 258 (void) sprintf(cname_buf, "%s-sstat", rule->ir_name); 259 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance, 260 server->iser_name, cname_buf, KSTAT_TYPE_NAMED, 261 NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid); 262 if (ksp == NULL) 263 return (NULL); 264 265 bcopy(&template, &server->iser_kstat, sizeof (template)); 266 ksp->ks_data = &server->iser_kstat; 267 ksp->ks_private = (void *)(uintptr_t)stackid; 268 269 
kstat_named_setstr(&server->iser_kstat.ip_address, 270 server->iser_ip_addr); 271 /* We never change the IP address */ 272 ksp->ks_data_size += strlen(server->iser_ip_addr) + 1; 273 274 kstat_install(ksp); 275 return (ksp); 276 } 277 278 /* Initialize the rule hash table. */ 279 static void 280 ilb_rule_hash_init(ilb_stack_t *ilbs) 281 { 282 int i; 283 284 /* 285 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to 286 * the next power of 2. 287 */ 288 if (!ISP2(ilbs->ilbs_rule_hash_size)) { 289 for (i = 0; i < 31; i++) { 290 if (ilbs->ilbs_rule_hash_size < (1 << i)) 291 break; 292 } 293 ilbs->ilbs_rule_hash_size = 1 << i; 294 } 295 ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) * 296 ilbs->ilbs_rule_hash_size, KM_SLEEP); 297 for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) { 298 mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL, 299 MUTEX_DEFAULT, NULL); 300 } 301 } 302 303 /* Clean up the rule hash table. */ 304 static void 305 ilb_rule_hash_fini(ilb_stack_t *ilbs) 306 { 307 if (ilbs->ilbs_g_hash == NULL) 308 return; 309 kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) * 310 ilbs->ilbs_rule_hash_size); 311 } 312 313 /* Add a rule to the rule hash table. */ 314 static void 315 ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr) 316 { 317 int i; 318 319 i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3], 320 ilbs->ilbs_rule_hash_size); 321 DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i); 322 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 323 rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule; 324 if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL) 325 ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule; 326 rule->ir_hash_prev = NULL; 327 ilbs->ilbs_g_hash[i].ilb_hash_rule = rule; 328 329 rule->ir_hash = &ilbs->ilbs_g_hash[i]; 330 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 331 } 332 333 /* 334 * Remove a rule from the rule hash table. Note that the rule is not freed 335 * in this routine. 
 */
static void
ilb_rule_hash_del(ilb_rule_t *rule)
{
	mutex_enter(&rule->ir_hash->ilb_hash_lock);
	if (rule->ir_hash->ilb_hash_rule == rule) {
		/* The rule is at the head of its bucket list. */
		rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
		if (rule->ir_hash_next != NULL)
			rule->ir_hash_next->ir_hash_prev = NULL;
	} else {
		/* Unlink from the middle/tail of the doubly linked list. */
		if (rule->ir_hash_prev != NULL)
			rule->ir_hash_prev->ir_hash_next =
			    rule->ir_hash_next;
		if (rule->ir_hash_next != NULL) {
			rule->ir_hash_next->ir_hash_prev =
			    rule->ir_hash_prev;
		}
	}
	mutex_exit(&rule->ir_hash->ilb_hash_lock);

	/* Clear the linkage so a stale pointer cannot be followed. */
	rule->ir_hash_next = NULL;
	rule->ir_hash_prev = NULL;
	rule->ir_hash = NULL;
}

/*
 * Given the info of a packet, look for a match in the rule hash table.
 *
 * On success a reference (ir_refcnt) is placed on the returned rule; the
 * caller must ILB_RULE_REFRELE() it.  NULL is returned when there is no
 * usable rule; *busy is set to B_TRUE when the matching rule exists but is
 * marked ILB_RULE_BUSY (the packet is accounted as dropped in that case).
 * l3 is IPPROTO_IP/IPPROTO_IPV6 and port is in network byte order.
 */
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
    in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
	int i;
	ilb_rule_t *rule;
	ipaddr_t v4_addr;

	*busy = B_FALSE;
	IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
	i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
	port = ntohs(port);

	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		/* Single port rules compare min_port only. */
		if (!rule->ir_port_range) {
			if (rule->ir_min_port != port)
				continue;
		} else {
			if (port < rule->ir_min_port ||
			    port > rule->ir_max_port) {
				continue;
			}
		}
		if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
		    rule->ir_zoneid != zoneid) {
			continue;
		}

		/* An unspecified rule target matches any destination. */
		if (l3 == IPPROTO_IP) {
			if (rule->ir_target_v4 != INADDR_ANY &&
			    rule->ir_target_v4 != v4_addr) {
				continue;
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
			    !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
				continue;
			}
		}

		/*
		 * Just update the stats if the rule is disabled.
		 */
		mutex_enter(&rule->ir_lock);
		if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
			ILB_R_KSTAT(rule, pkt_not_processed);
			ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
			mutex_exit(&rule->ir_lock);
			rule = NULL;
			break;
		} else if (rule->ir_flags & ILB_RULE_BUSY) {
			/*
			 * If we are busy...
			 *
			 * XXX we should have a queue to postpone the
			 * packet processing.  But this requires a
			 * mechanism in IP to re-start the packet
			 * processing.  So for now, just drop the packet.
			 */
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
			mutex_exit(&rule->ir_lock);
			*busy = B_TRUE;
			rule = NULL;
			break;
		} else {
			/* Hand a hold on the rule back to the caller. */
			rule->ir_refcnt++;
			ASSERT(rule->ir_refcnt != 1);
			mutex_exit(&rule->ir_lock);
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (rule);
}

/*
 * Add a rule to the global rule list.  This list is for finding all rules
 * in an IP stack.  The caller is assumed to hold the ilbs_g_lock.
 */
static void
ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	rule->ir_next = ilbs->ilbs_rule_head;
	ilbs->ilbs_rule_head = rule;
	ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
}

/* The caller is assumed to hold the ilbs_g_lock.
*/ 456 static void 457 ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule) 458 { 459 ilb_rule_t *tmp_rule; 460 ilb_rule_t *prev_rule; 461 462 ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); 463 prev_rule = NULL; 464 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 465 prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) { 466 if (tmp_rule == rule) 467 break; 468 } 469 if (tmp_rule == NULL) { 470 mutex_exit(&ilbs->ilbs_g_lock); 471 return; 472 } 473 if (prev_rule == NULL) 474 ilbs->ilbs_rule_head = tmp_rule->ir_next; 475 else 476 prev_rule->ir_next = tmp_rule->ir_next; 477 ILB_KSTAT_UPDATE(ilbs, num_rules, -1); 478 } 479 480 /* 481 * Helper routine to calculate how many source addresses are in a given 482 * range. 483 */ 484 static int64_t 485 num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2) 486 { 487 int64_t ret; 488 uint32_t addr1, addr2; 489 490 /* 491 * Here we assume that the max number of NAT source cannot be 492 * large such that the most significant 2 s6_addr32 must be 493 * equal. 494 */ 495 addr1 = ntohl(a1->s6_addr32[3]); 496 addr2 = ntohl(a2->s6_addr32[3]); 497 if (a1->s6_addr32[0] != a2->s6_addr32[0] || 498 a1->s6_addr32[1] != a2->s6_addr32[1] || 499 a1->s6_addr32[2] > a2->s6_addr32[2] || 500 (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) { 501 return (-1); 502 } 503 if (a1->s6_addr32[2] == a2->s6_addr32[2]) { 504 return (addr2 - addr1 + 1); 505 } else { 506 ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2])); 507 ret <<= 32; 508 ret = ret + addr1 - addr2; 509 return (ret + 1); 510 } 511 } 512 513 /* 514 * Add an ILB rule. 515 */ 516 int 517 ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd) 518 { 519 ilb_rule_t *rule; 520 netstackid_t stackid; 521 int ret; 522 in_port_t min_port, max_port; 523 int64_t num_src; 524 525 /* Sanity checks. */ 526 if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6) 527 return (EINVAL); 528 529 /* Need to support SCTP... 
*/ 530 if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP) 531 return (EINVAL); 532 533 /* For full NAT, the NAT source must be supplied. */ 534 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 535 if (IS_ADDR_UNSPEC(&cmd->nat_src_start) || 536 IS_ADDR_UNSPEC(&cmd->nat_src_end)) { 537 return (EINVAL); 538 } 539 } 540 541 /* Check invalid mask */ 542 if ((cmd->flags & ILB_RULE_STICKY) && 543 IS_ADDR_UNSPEC(&cmd->sticky_mask)) { 544 return (EINVAL); 545 } 546 547 /* Port is passed in network byte order. */ 548 min_port = ntohs(cmd->min_port); 549 max_port = ntohs(cmd->max_port); 550 if (min_port > max_port) 551 return (EINVAL); 552 553 /* min_port == 0 means "all ports". Make it so */ 554 if (min_port == 0) { 555 min_port = 1; 556 max_port = 65535; 557 } 558 559 /* Funny address checking. */ 560 if (cmd->ip_ver == IPPROTO_IP) { 561 in_addr_t v4_addr1, v4_addr2; 562 563 v4_addr1 = cmd->vip.s6_addr32[3]; 564 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET || 565 CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST || 566 v4_addr1 == INADDR_ANY || 567 !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) { 568 return (EINVAL); 569 } 570 571 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 572 v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]); 573 v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]); 574 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET || 575 (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET || 576 v4_addr1 == INADDR_BROADCAST || 577 v4_addr2 == INADDR_BROADCAST || 578 v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY || 579 CLASSD(v4_addr1) || CLASSD(v4_addr2) || 580 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) || 581 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) { 582 return (EINVAL); 583 } 584 585 num_src = v4_addr2 - v4_addr1 + 1; 586 if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC) 587 return (EINVAL); 588 } 589 } else { 590 if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) || 591 IN6_IS_ADDR_MULTICAST(&cmd->vip) || 592 IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) || 593 IN6_IS_ADDR_V4MAPPED(&cmd->vip)) { 594 return (EINVAL); 595 
} 596 597 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 598 if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) || 599 IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) || 600 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) || 601 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) || 602 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) || 603 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) || 604 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) || 605 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) { 606 return (EINVAL); 607 } 608 609 if ((num_src = num_nat_src_v6(&cmd->nat_src_start, 610 &cmd->nat_src_end)) < 0 || 611 num_src > ILB_MAX_NAT_SRC) { 612 return (EINVAL); 613 } 614 } 615 } 616 617 mutex_enter(&ilbs->ilbs_g_lock); 618 if (ilbs->ilbs_g_hash == NULL) 619 ilb_rule_hash_init(ilbs); 620 if (ilbs->ilbs_c2s_conn_hash == NULL) { 621 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); 622 ilb_conn_hash_init(ilbs); 623 ilb_nat_src_init(ilbs); 624 } 625 626 /* Make sure that the new rule does not duplicate an existing one. */ 627 if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto, 628 min_port, max_port, &cmd->vip)) { 629 mutex_exit(&ilbs->ilbs_g_lock); 630 return (EEXIST); 631 } 632 633 rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP); 634 if (rule == NULL) { 635 mutex_exit(&ilbs->ilbs_g_lock); 636 return (ENOMEM); 637 } 638 639 /* ir_name is all 0 to begin with */ 640 (void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1); 641 642 rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance); 643 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; 644 if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) { 645 ret = ENOMEM; 646 goto error; 647 } 648 649 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 650 rule->ir_nat_src_start = cmd->nat_src_start; 651 rule->ir_nat_src_end = cmd->nat_src_end; 652 } 653 654 rule->ir_ipver = cmd->ip_ver; 655 rule->ir_proto = cmd->proto; 656 rule->ir_topo = cmd->topo; 657 658 rule->ir_min_port = min_port; 659 rule->ir_max_port = max_port; 660 if (rule->ir_min_port 
!= rule->ir_max_port) 661 rule->ir_port_range = B_TRUE; 662 else 663 rule->ir_port_range = B_FALSE; 664 665 rule->ir_zoneid = zoneid; 666 667 rule->ir_target_v6 = cmd->vip; 668 rule->ir_servers = NULL; 669 670 /* 671 * The default connection drain timeout is indefinite (value 0), 672 * meaning we will wait for all connections to finish. So we 673 * can assign cmd->conn_drain_timeout to it directly. 674 */ 675 rule->ir_conn_drain_timeout = cmd->conn_drain_timeout; 676 if (cmd->nat_expiry != 0) { 677 rule->ir_nat_expiry = cmd->nat_expiry; 678 } else { 679 switch (rule->ir_proto) { 680 case IPPROTO_TCP: 681 rule->ir_nat_expiry = ilb_conn_tcp_expiry; 682 break; 683 case IPPROTO_UDP: 684 rule->ir_nat_expiry = ilb_conn_udp_expiry; 685 break; 686 default: 687 cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p", 688 (void *)rule); 689 break; 690 } 691 } 692 if (cmd->sticky_expiry != 0) 693 rule->ir_sticky_expiry = cmd->sticky_expiry; 694 else 695 rule->ir_sticky_expiry = ilb_sticky_expiry; 696 697 if (cmd->flags & ILB_RULE_STICKY) { 698 rule->ir_flags |= ILB_RULE_STICKY; 699 rule->ir_sticky_mask = cmd->sticky_mask; 700 if (ilbs->ilbs_sticky_hash == NULL) 701 ilb_sticky_hash_init(ilbs); 702 } 703 if (cmd->flags & ILB_RULE_ENABLED) 704 rule->ir_flags |= ILB_RULE_ENABLED; 705 706 mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL); 707 cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL); 708 709 rule->ir_refcnt = 1; 710 711 switch (cmd->algo) { 712 case ILB_ALG_IMPL_ROUNDROBIN: 713 if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) { 714 ret = ENOMEM; 715 goto error; 716 } 717 rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN; 718 break; 719 case ILB_ALG_IMPL_HASH_IP: 720 case ILB_ALG_IMPL_HASH_IP_SPORT: 721 case ILB_ALG_IMPL_HASH_IP_VIP: 722 if ((rule->ir_alg = ilb_alg_hash_init(rule, 723 &cmd->algo)) == NULL) { 724 ret = ENOMEM; 725 goto error; 726 } 727 rule->ir_alg_type = cmd->algo; 728 break; 729 default: 730 ret = EINVAL; 731 goto error; 732 } 733 734 /* Add it to the 
global list and hash array at the end. */ 735 ilb_rule_g_add(ilbs, rule); 736 ilb_rule_hash_add(ilbs, rule, &cmd->vip); 737 738 mutex_exit(&ilbs->ilbs_g_lock); 739 740 return (0); 741 742 error: 743 mutex_exit(&ilbs->ilbs_g_lock); 744 if (rule->ir_ksp != NULL) { 745 /* stackid must be initialized if ir_ksp != NULL */ 746 kstat_delete_netstack(rule->ir_ksp, stackid); 747 } 748 kmem_free(rule, sizeof (ilb_rule_t)); 749 return (ret); 750 } 751 752 /* 753 * The final part in deleting a rule. Either called directly or by the 754 * taskq dispatched. 755 */ 756 static void 757 ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule) 758 { 759 netstackid_t stackid; 760 ilb_server_t *server; 761 762 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; 763 764 /* 765 * Let the algorithm know that the rule is going away. The 766 * algorithm fini routine will free all its resources with this 767 * rule. 768 */ 769 tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg); 770 771 while ((server = tmp_rule->ir_servers) != NULL) { 772 mutex_enter(&server->iser_lock); 773 ilb_destroy_nat_src(&server->iser_nat_src); 774 if (tmp_rule->ir_conn_drain_timeout != 0) { 775 /* 776 * The garbage collection thread checks this value 777 * without grabing a lock. So we need to use 778 * atomic_swap_64() to make sure that the value seen 779 * by gc thread is intact. 780 */ 781 (void) atomic_swap_64( 782 (uint64_t *)&server->iser_die_time, 783 ddi_get_lbolt64() + 784 SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout)); 785 } 786 while (server->iser_refcnt > 1) 787 cv_wait(&server->iser_cv, &server->iser_lock); 788 tmp_rule->ir_servers = server->iser_next; 789 kstat_delete_netstack(server->iser_ksp, stackid); 790 kmem_free(server, sizeof (ilb_server_t)); 791 } 792 793 ASSERT(tmp_rule->ir_ksp != NULL); 794 kstat_delete_netstack(tmp_rule->ir_ksp, stackid); 795 796 kmem_free(tmp_rule, sizeof (ilb_rule_t)); 797 } 798 799 /* The routine executed by the delayed rule taskq. 
 */
static void
ilb_rule_del_tq(void *arg)
{
	ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
	ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;

	/*
	 * Wait until we hold the last reference (the one taken at rule
	 * creation) before tearing the rule down.
	 *
	 * NOTE(review): ir_lock is still held when ilb_rule_del_common()
	 * frees the rule below; there is no matching mutex_exit() or
	 * mutex_destroy().  Presumably safe because refcnt == 1 means no
	 * other thread can touch the lock — confirm.
	 */
	mutex_enter(&rule->ir_lock);
	while (rule->ir_refcnt > 1)
		cv_wait(&rule->ir_cv, &rule->ir_lock);
	ilb_rule_del_common(ilbs, rule);
	kmem_free(arg, sizeof (ilb_rule_tq_t));
}

/*
 * Routine to delete a rule.  Returns 0 on success, an errno value
 * (ENOENT, EINPROGRESS) from the rule lookup otherwise.
 */
int
ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
{
	ilb_rule_t *tmp_rule;
	ilb_rule_tq_t *arg;
	int err;

	mutex_enter(&ilbs->ilbs_g_lock);
	if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
	    &err)) == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (err);
	}

	/*
	 * First remove the rule from the hash array and the global list so
	 * that no one can find this rule any more.
	 */
	ilb_rule_hash_del(tmp_rule);
	ilb_rule_g_del(ilbs, tmp_rule);
	mutex_exit(&ilbs->ilbs_g_lock);
	/* Drop the hold taken by ilb_find_rule_locked(). */
	ILB_RULE_REFRELE(tmp_rule);

	/*
	 * Now no one can find this rule, we can remove it once all
	 * references to it are dropped and all references to the list
	 * of servers are dropped.  So dispatch a task to finish the deletion.
	 * We do this instead of letting the last one referencing the
	 * rule do it.  The reason is that the last one may be the
	 * interrupt thread.  We want to minimize the work it needs to
	 * do.  Rule deletion is not a critical task so it can be delayed.
	 */
	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
	arg->ilbs = ilbs;
	arg->rule = tmp_rule;
	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
	    TQ_SLEEP);

	return (0);
}

/*
 * Given an IP address, check to see if there is a rule using this
 * as the VIP.  It can be used to check if we need to drop a fragment.
858 */ 859 boolean_t 860 ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule) 861 { 862 int i; 863 ilb_rule_t *rule; 864 boolean_t ret = B_FALSE; 865 866 i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3], 867 ilbs->ilbs_rule_hash_size); 868 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 869 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL; 870 rule = rule->ir_hash_next) { 871 if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) { 872 mutex_enter(&rule->ir_lock); 873 if (rule->ir_flags & ILB_RULE_BUSY) { 874 mutex_exit(&rule->ir_lock); 875 break; 876 } 877 if (ret_rule != NULL) { 878 rule->ir_refcnt++; 879 mutex_exit(&rule->ir_lock); 880 *ret_rule = rule; 881 } else { 882 mutex_exit(&rule->ir_lock); 883 } 884 ret = B_TRUE; 885 break; 886 } 887 } 888 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 889 return (ret); 890 } 891 892 boolean_t 893 ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule) 894 { 895 int i; 896 ilb_rule_t *rule; 897 boolean_t ret = B_FALSE; 898 899 i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size); 900 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 901 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL; 902 rule = rule->ir_hash_next) { 903 if (rule->ir_target_v6.s6_addr32[3] == addr) { 904 mutex_enter(&rule->ir_lock); 905 if (rule->ir_flags & ILB_RULE_BUSY) { 906 mutex_exit(&rule->ir_lock); 907 break; 908 } 909 if (ret_rule != NULL) { 910 rule->ir_refcnt++; 911 mutex_exit(&rule->ir_lock); 912 *ret_rule = rule; 913 } else { 914 mutex_exit(&rule->ir_lock); 915 } 916 ret = B_TRUE; 917 break; 918 } 919 } 920 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 921 return (ret); 922 } 923 924 static ilb_rule_t * 925 ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 926 int *err) 927 { 928 ilb_rule_t *tmp_rule; 929 930 ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); 931 932 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 933 tmp_rule = 
tmp_rule->ir_next) { 934 if (tmp_rule->ir_zoneid != zoneid) 935 continue; 936 if (strcasecmp(tmp_rule->ir_name, name) == 0) { 937 mutex_enter(&tmp_rule->ir_lock); 938 if (tmp_rule->ir_flags & ILB_RULE_BUSY) { 939 mutex_exit(&tmp_rule->ir_lock); 940 *err = EINPROGRESS; 941 return (NULL); 942 } 943 tmp_rule->ir_refcnt++; 944 mutex_exit(&tmp_rule->ir_lock); 945 *err = 0; 946 return (tmp_rule); 947 } 948 } 949 *err = ENOENT; 950 return (NULL); 951 } 952 953 /* To find a rule with a given name and zone in the global rule list. */ 954 ilb_rule_t * 955 ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 956 int *err) 957 { 958 ilb_rule_t *tmp_rule; 959 960 mutex_enter(&ilbs->ilbs_g_lock); 961 tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err); 962 mutex_exit(&ilbs->ilbs_g_lock); 963 return (tmp_rule); 964 } 965 966 /* Try to match the given packet info and zone ID with a rule. */ 967 static boolean_t 968 ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3, 969 int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr) 970 { 971 ilb_rule_t *tmp_rule; 972 973 ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); 974 975 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 976 tmp_rule = tmp_rule->ir_next) { 977 if (tmp_rule->ir_zoneid != zoneid) 978 continue; 979 980 /* 981 * We don't allow the same name in different rules even if all 982 * the other rule components are different. 983 */ 984 if (strcasecmp(tmp_rule->ir_name, name) == 0) 985 return (B_TRUE); 986 987 if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4) 988 continue; 989 990 /* 991 * ir_min_port and ir_max_port are the same if ir_port_range 992 * is false. In this case, if the ir_min|max_port (same) is 993 * outside of the given port range, it is OK. In other cases, 994 * check if min and max port are outside a rule's range. 
995 */ 996 if (tmp_rule->ir_max_port < min_port || 997 tmp_rule->ir_min_port > max_port) { 998 continue; 999 } 1000 1001 /* 1002 * If l3 is IPv4, the addr passed in is assumed to be 1003 * mapped address. 1004 */ 1005 if (V6_OR_V4_INADDR_ANY(*addr) || 1006 V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) || 1007 IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) { 1008 return (B_TRUE); 1009 } 1010 } 1011 return (B_FALSE); 1012 } 1013 1014 int 1015 ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid, 1016 const char *rule_name, ilb_rule_t *in_rule) 1017 { 1018 ilb_rule_t *rule; 1019 int err; 1020 1021 ASSERT((in_rule == NULL && rule_name != NULL) || 1022 (in_rule != NULL && rule_name == NULL)); 1023 if ((rule = in_rule) == NULL) { 1024 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, 1025 &err)) == NULL) { 1026 return (err); 1027 } 1028 } 1029 mutex_enter(&rule->ir_lock); 1030 rule->ir_flags |= ILB_RULE_ENABLED; 1031 mutex_exit(&rule->ir_lock); 1032 1033 /* Only refrele if the rule is passed in. */ 1034 if (in_rule == NULL) 1035 ILB_RULE_REFRELE(rule); 1036 return (0); 1037 } 1038 1039 int 1040 ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid, 1041 const char *rule_name, ilb_rule_t *in_rule) 1042 { 1043 ilb_rule_t *rule; 1044 int err; 1045 1046 ASSERT((in_rule == NULL && rule_name != NULL) || 1047 (in_rule != NULL && rule_name == NULL)); 1048 if ((rule = in_rule) == NULL) { 1049 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, 1050 &err)) == NULL) { 1051 return (err); 1052 } 1053 } 1054 mutex_enter(&rule->ir_lock); 1055 rule->ir_flags &= ~ILB_RULE_ENABLED; 1056 mutex_exit(&rule->ir_lock); 1057 1058 /* Only refrele if the rule is passed in. */ 1059 if (in_rule == NULL) 1060 ILB_RULE_REFRELE(rule); 1061 return (0); 1062 } 1063 1064 /* 1065 * XXX We should probably have a walker function to walk all rules. For 1066 * now, just add a simple loop for enable/disable/del. 
1067 */ 1068 void 1069 ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid) 1070 { 1071 ilb_rule_t *rule; 1072 1073 mutex_enter(&ilbs->ilbs_g_lock); 1074 for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) { 1075 if (rule->ir_zoneid != zoneid) 1076 continue; 1077 /* 1078 * No need to hold the rule as we are holding the global 1079 * lock so it won't go away. Ignore the return value here 1080 * as the rule is provided so the call cannot fail. 1081 */ 1082 (void) ilb_rule_enable(ilbs, zoneid, NULL, rule); 1083 } 1084 mutex_exit(&ilbs->ilbs_g_lock); 1085 } 1086 1087 void 1088 ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid) 1089 { 1090 ilb_rule_t *rule; 1091 1092 mutex_enter(&ilbs->ilbs_g_lock); 1093 for (rule = ilbs->ilbs_rule_head; rule != NULL; 1094 rule = rule->ir_next) { 1095 if (rule->ir_zoneid != zoneid) 1096 continue; 1097 (void) ilb_rule_disable(ilbs, zoneid, NULL, rule); 1098 } 1099 mutex_exit(&ilbs->ilbs_g_lock); 1100 } 1101 1102 void 1103 ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid) 1104 { 1105 ilb_rule_t *rule; 1106 ilb_rule_tq_t *arg; 1107 1108 mutex_enter(&ilbs->ilbs_g_lock); 1109 while ((rule = ilbs->ilbs_rule_head) != NULL) { 1110 if (rule->ir_zoneid != zoneid) 1111 continue; 1112 ilb_rule_hash_del(rule); 1113 ilb_rule_g_del(ilbs, rule); 1114 mutex_exit(&ilbs->ilbs_g_lock); 1115 1116 arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP); 1117 arg->ilbs = ilbs; 1118 arg->rule = rule; 1119 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, 1120 arg, TQ_SLEEP); 1121 1122 mutex_enter(&ilbs->ilbs_g_lock); 1123 } 1124 mutex_exit(&ilbs->ilbs_g_lock); 1125 } 1126 1127 /* 1128 * This is just an optimization, so don't grab the global lock. The 1129 * worst case is that we missed a couple packets. 
 */
boolean_t
ilb_has_rules(ilb_stack_t *ilbs)
{
	return (ilbs->ilbs_rule_head != NULL);
}


/*
 * Common code for enabling/disabling a back end server of a rule.  The
 * rule is identified either by a held rule pointer or by name (exactly
 * one of the two); the server is identified by its address.
 */
static int
ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
{
	ilb_server_t *tmp_server;
	int ret;

	ASSERT((rule == NULL && rule_name != NULL) ||
	    (rule != NULL && rule_name == NULL));

	if (rule == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &ret)) == NULL) {
			return (ret);
		}
	}

	/* Once we get a hold on the rule, no server can be added/deleted. */
	for (tmp_server = rule->ir_servers; tmp_server != NULL;
	    tmp_server = tmp_server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
			break;
	}
	if (tmp_server == NULL) {
		ret = ENOENT;
		goto done;
	}

	if (enable) {
		/* The algorithm may veto the state change (nonzero ret). */
		ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_TRUE;
			/* Re-enabled: cancel any pending drain deadline. */
			tmp_server->iser_die_time = 0;
		}
	} else {
		ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_FALSE;
			/*
			 * Give existing conns until now + drain timeout
			 * before they are killed; atomic since iser_die_time
			 * is read without the lock on the data path.
			 */
			if (rule->ir_conn_drain_timeout != 0) {
				(void) atomic_swap_64(
				    (uint64_t *)&tmp_server->iser_die_time,
				    ddi_get_lbolt64() + SEC_TO_TICK(
				    rule->ir_conn_drain_timeout));
			}
		}
	}

done:
	/* Drop the hold taken by ilb_find_rule() in the by-name case. */
	if (rule_name != NULL)
		ILB_RULE_REFRELE(rule);
	return (ret);
}

/* Enable a back end server of a rule. */
int
ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
}

/* Disable a back end server of a rule. */
int
ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
}

/*
 * Add a back end server to a rule.  If the address is IPv4, it is assumed
 * to be passed in as a mapped address.
 */
int
ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
{
	ilb_server_t *server;
	netstackid_t stackid;
	int ret = 0;
	in_port_t min_port, max_port;
	in_port_t range;

	/* Port is passed in network byte order. */
	min_port = ntohs(info->min_port);
	max_port = ntohs(info->max_port);
	if (min_port > max_port)
		return (EINVAL);

	/* min_port == 0 means "all ports". Make it so */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}
	range = max_port - min_port;

	mutex_enter(&rule->ir_lock);
	/* If someone is already doing server add/del, sleeps and wait. */
	while (rule->ir_flags & ILB_RULE_BUSY) {
		/* cv_wait_sig() returns 0 when interrupted by a signal. */
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			return (EINTR);
		}
	}

	/*
	 * Set the rule to be busy to make sure that no new packet can
	 * use this rule.
	 */
	rule->ir_flags |= ILB_RULE_BUSY;

	/*
	 * Now wait for all other guys to finish their work.  Refcnt 2
	 * accounts for our own hold plus the list's hold; anything above
	 * that is a packet in flight using the rule.
	 */
	while (rule->ir_refcnt > 2) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			ret = EINTR;
			goto end;
		}
	}
	mutex_exit(&rule->ir_lock);

	/* Sanity checks: address family must match the rule's. */
	if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IP) ||
	    (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IPV6)) {
		ret = EINVAL;
		goto end;
	}

	/*
	 * Check for valid port range.
	 *
	 * For DSR, there can be no port shifting.  Hence the server
	 * specification must be the same as the rule's.
	 *
	 * For half-NAT/NAT, the range must either be 0 (port collapsing) or
	 * it must be equal to the same value as the rule port range.
	 *
	 */
	if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
		if (rule->ir_max_port != max_port ||
		    rule->ir_min_port != min_port) {
			ret = EINVAL;
			goto end;
		}
	} else {
		if ((range != rule->ir_max_port - rule->ir_min_port) &&
		    range != 0) {
			ret = EINVAL;
			goto end;
		}
	}

	/* Check for duplicate (by address or case-insensitive name). */
	for (server = rule->ir_servers; server != NULL;
	    server = server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
		    strcasecmp(server->iser_name, info->name) == 0) {
			break;
		}
	}
	if (server != NULL) {
		ret = EEXIST;
		goto end;
	}

	if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
		ret = ENOMEM;
		goto end;
	}

	/*
	 * NAMESZ - 1 so the zalloc'ed buffer stays NUL-terminated even for
	 * a maximum-length name.
	 */
	(void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
	(void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
	    sizeof (server->iser_ip_addr));
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
	if (server->iser_ksp == NULL) {
		kmem_free(server, sizeof (ilb_server_t));
		ret = EINVAL;
		goto end;
	}

	server->iser_stackid = stackid;
	server->iser_addr_v6 = info->addr;
	server->iser_min_port = min_port;
	server->iser_max_port = max_port;
	if (min_port != max_port)
		server->iser_port_range = B_TRUE;
	else
		server->iser_port_range = B_FALSE;

	/*
	 * If the rule uses NAT, find/create the NAT source entry to use
	 * for this server.
	 */
	if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
		in_port_t port;

		/*
		 * If the server uses a port range, our port allocation
		 * scheme needs to treat it as a wildcard.  Refer to the
		 * comments in ilb_nat.c about the scheme.
		 */
		if (server->iser_port_range)
			port = 0;
		else
			port = server->iser_min_port;

		if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
		    &server->iser_addr_v6, port, &rule->ir_nat_src_start,
		    num_nat_src_v6(&rule->ir_nat_src_start,
		    &rule->ir_nat_src_end))) != 0) {
			kstat_delete_netstack(server->iser_ksp, stackid);
			kmem_free(server, sizeof (ilb_server_t));
			goto end;
		}
	}

	/*
	 * The iser_lock is only used to protect iser_refcnt.  All the other
	 * fields in ilb_server_t should not change, except for iser_enabled.
	 * The worst thing that can happen if iser_enabled is messed up is
	 * that one or two packets may not be load balanced to a server
	 * correctly.
	 */
	server->iser_refcnt = 1;
	server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
	    B_FALSE;
	mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);

	/* Let the load balancing algorithm know about the addition. */
	ASSERT(rule->ir_alg != NULL);
	if ((ret = rule->ir_alg->ilb_alg_server_add(server,
	    rule->ir_alg->ilb_alg_data)) != 0) {
		kstat_delete_netstack(server->iser_ksp, stackid);
		kmem_free(server, sizeof (ilb_server_t));
		goto end;
	}

	/*
	 * No need to hold ir_lock since no other thread should manipulate
	 * the following fields until ILB_RULE_BUSY is cleared.
	 */
	if (rule->ir_servers == NULL) {
		server->iser_next = NULL;
	} else {
		server->iser_next = rule->ir_servers;
	}
	rule->ir_servers = server;
	ILB_R_KSTAT(rule, num_servers);

end:
	/* Clear BUSY and wake up anybody waiting in the loops above. */
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_BUSY;
	cv_signal(&rule->ir_cv);
	mutex_exit(&rule->ir_lock);
	return (ret);
}

/* The routine executed by the delayed rule processing taskq.
 */
static void
ilb_server_del_tq(void *arg)
{
	ilb_server_t *server = (ilb_server_t *)arg;

	/*
	 * Wait until we are the only reference holder, then free the
	 * server.  The lock is freed together with the structure, hence
	 * no mutex_exit()/mutex_destroy() before kmem_free().
	 */
	mutex_enter(&server->iser_lock);
	while (server->iser_refcnt > 1)
		cv_wait(&server->iser_cv, &server->iser_lock);
	kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
	kmem_free(server, sizeof (ilb_server_t));
}

/*
 * Delete a back end server from a rule.  If the address is IPv4, it is assumed
 * to be passed in as a mapped address.
 */
int
ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	ilb_server_t *server;
	ilb_server_t *prev_server;
	int ret = 0;

	ASSERT((rule == NULL && rule_name != NULL) ||
	    (rule != NULL && rule_name == NULL));
	if (rule == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &ret)) == NULL) {
			return (ret);
		}
	}

	mutex_enter(&rule->ir_lock);
	/* If someone is already doing server add/del, sleeps and wait. */
	while (rule->ir_flags & ILB_RULE_BUSY) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			/*
			 * Interrupted: drop the hold we took via
			 * ilb_find_rule() and wake a waiter if the count
			 * has come down far enough.
			 */
			if (rule_name != NULL) {
				if (--rule->ir_refcnt <= 2)
					cv_signal(&rule->ir_cv);
			}
			mutex_exit(&rule->ir_lock);
			return (EINTR);
		}
	}
	/*
	 * Set the rule to be busy to make sure that no new packet can
	 * use this rule.
	 */
	rule->ir_flags |= ILB_RULE_BUSY;

	/* Now wait for all other guys to finish their work. */
	while (rule->ir_refcnt > 2) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			ret = EINTR;
			goto end;
		}
	}
	mutex_exit(&rule->ir_lock);

	/* Find the server (and its predecessor) by address. */
	prev_server = NULL;
	for (server = rule->ir_servers; server != NULL;
	    prev_server = server, server = server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
			break;
	}
	if (server == NULL) {
		ret = ENOENT;
		goto end;
	}

	/*
	 * Let the load balancing algorithm know about the removal.
	 * The algorithm may disallow the removal...
	 */
	if ((ret = rule->ir_alg->ilb_alg_server_del(server,
	    rule->ir_alg->ilb_alg_data)) != 0) {
		goto end;
	}

	/* Unlink from the rule's server list. */
	if (prev_server == NULL)
		rule->ir_servers = server->iser_next;
	else
		prev_server->iser_next = server->iser_next;

	ILB_R_KSTAT_UPDATE(rule, num_servers, -1);

	/*
	 * Mark the server as disabled so that if there is any sticky cache
	 * using this server around, it won't be used.
	 */
	server->iser_enabled = B_FALSE;

	mutex_enter(&server->iser_lock);

	/*
	 * De-allocate the NAT source array.  The indiviual ilb_nat_src_entry_t
	 * may not go away if there is still a conn using it.  The NAT source
	 * timer will do the garbage collection.
	 */
	ilb_destroy_nat_src(&server->iser_nat_src);

	/* If there is a hard limit on when a server should die, set it. */
	if (rule->ir_conn_drain_timeout != 0) {
		(void) atomic_swap_64((uint64_t *)&server->iser_die_time,
		    ddi_get_lbolt64() +
		    SEC_TO_TICK(rule->ir_conn_drain_timeout));
	}

	/*
	 * Other references (conns) remain: defer the final free to the
	 * rule taskq, which waits for them to drain.  Otherwise free now.
	 */
	if (server->iser_refcnt > 1) {
		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
		    server, TQ_SLEEP);
		mutex_exit(&server->iser_lock);
	} else {
		kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
		kmem_free(server, sizeof (ilb_server_t));
	}

end:
	/* Clear BUSY, drop the by-name hold, and wake waiters. */
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_BUSY;
	if (rule_name != NULL)
		rule->ir_refcnt--;
	cv_signal(&rule->ir_cv);
	mutex_exit(&rule->ir_lock);
	return (ret);
}

/*
 * First check if the destination of the ICMP message matches a VIP of
 * a rule.  If it does not, just return ILB_PASSED.
 *
 * If the destination matches a VIP:
 *
 * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
 * server.
 *
 * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload
 * and see which back end server we should send this message to.  And we
 * need to do NAT on both the payload message and the outside IP packet.
 *
 * For other ICMP messages, drop them.
 */
/* ARGSUSED */
static int
ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
    icmph_t *icmph, ipaddr_t *lb_dst)
{
	ipaddr_t vip;
	ilb_rule_t *rule;
	in6_addr_t addr6;

	if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
		return (ILB_PASSED);

	/* Make sure the whole ICMP header is pulled up in this mblk. */
	if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmph->icmph_type) {
	case ICMP_ECHO_REQUEST:
		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		/*
		 * Turn the request into a reply in place: swap src/dst,
		 * reset TTL and recompute the ICMP checksum.
		 */
		icmph->icmph_type = ICMP_ECHO_REPLY;
		icmph->icmph_checksum = 0;
		icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
		ipha->ipha_ttl =
		    ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
		*lb_dst = ipha->ipha_src;
		vip = ipha->ipha_dst;
		ipha->ipha_dst = ipha->ipha_src;
		ipha->ipha_src = vip;
		return (ILB_BALANCED);
	case ICMP_DEST_UNREACHABLE: {
		int ret;

		if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
			ILB_R_KSTAT(rule, icmp_dropped);
			ILB_RULE_REFRELE(rule);
			return (ILB_DROPPED);
		}
		/*
		 * Find the conn of the embedded packet; addr6 is only
		 * filled in on success, so *lb_dst is only meaningful
		 * when ILB_BALANCED is returned.
		 */
		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
		    &addr6)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}

/*
 * IPv6 counterpart of ilb_icmp_v4(): handle ICMPv6 sent to a VIP.
 */
/* ARGSUSED */
static int
ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
    icmp6_t *icmp6, in6_addr_t *lb_dst)
{
	ilb_rule_t *rule;

	if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
		return (ILB_PASSED);

	/* Make sure the whole ICMPv6 header is pulled up in this mblk. */
	if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_ECHO_REQUEST: {
		int hdr_len;

		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		/*
		 * Turn the request into a reply in place.  Seed the
		 * checksum field with the payload length, then fold in
		 * the pseudo header and message via IP_CSUM().
		 */
		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
		icmp6->icmp6_cksum = ip6h->ip6_plen;
		hdr_len = (char *)icmp6 - (char *)ip6h;
		icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
		    ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
		ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
		ip6h->ip6_hops =
		    ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
		*lb_dst = ip6h->ip6_src;
		ip6h->ip6_src = ip6h->ip6_dst;
		ip6h->ip6_dst = *lb_dst;
		return (ILB_BALANCED);
	}
	case ICMP6_PACKET_TOO_BIG: {
		int ret;

		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
		    lb_dst)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}

/*
 * Common routine to check an incoming packet and decide what to do with it.
 * called by ilb_check_v4|v6.
 */
static int
ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
    in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
    in6_addr_t *lb_dst)
{
	in_port_t sport, dport;
	tcpha_t *tcph;
	udpha_t *udph;
	ilb_rule_t *rule;
	ilb_server_t *server;
	boolean_t balanced;
	struct ilb_sticky_s *s = NULL;
	int ret;
	uint32_t ip_sum, tp_sum;
	ilb_nat_info_t info;
	uint16_t nat_src_idx;
	boolean_t busy;

	/*
	 * We don't really need to switch here since both protocols's
	 * ports are at the same offset.  Just prepare for future protocol
	 * specific processing.
	 */
	switch (l4) {
	case IPPROTO_TCP:
		if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
			return (ILB_DROPPED);
		tcph = (tcpha_t *)tph;
		sport = tcph->tha_lport;
		dport = tcph->tha_fport;
		break;
	case IPPROTO_UDP:
		if (tph + sizeof (udpha_t) > mp->b_wptr)
			return (ILB_DROPPED);
		udph = (udpha_t *)tph;
		sport = udph->uha_src_port;
		dport = udph->uha_dst_port;
		break;
	default:
		return (ILB_PASSED);
	}

	/* Fast path, there is an existing conn. */
	if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
	    pkt_len, lb_dst)) {
		return (ILB_BALANCED);
	}

	/*
	 * If there is no existing connection for the incoming packet, check
	 * to see if the packet matches a rule.  If not, just let IP decide
	 * what to do with it.
	 *
	 * Note: a reply from back end server should not match a rule.  A
	 * reply should match one existing conn.
	 */
	rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
	    pkt_len, &busy);
	if (rule == NULL) {
		/* If the rule is busy, just drop the packet. */
		if (busy)
			return (ILB_DROPPED);
		else
			return (ILB_PASSED);
	}

	/*
	 * The packet matches a rule, use the rule load balance algorithm
	 * to find a server.
	 */
	balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
	    rule->ir_alg->ilb_alg_data, &server);
	/*
	 * This can only happen if there is no server in a rule or all
	 * the servers are currently disabled.
	 */
	if (!balanced)
		goto no_server;

	/*
	 * If the rule is sticky enabled, we need to check the sticky table.
	 * If there is a sticky entry for the client, use the previous server
	 * instead of the one found above (note that both can be the same).
	 * If there is no entry for that client, add an entry to the sticky
	 * table.  Both the find and add are done in ilb_sticky_find_add()
	 * to avoid checking for duplicate when adding an entry.
	 */
	if (rule->ir_flags & ILB_RULE_STICKY) {
		in6_addr_t addr;

		V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
		if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
		    &s, &nat_src_idx)) == NULL) {
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			goto no_server;
		}
	}

	/*
	 * We are holding a reference on the rule, so the server
	 * cannot go away.
	 */
	*lb_dst = server->iser_addr_v6;
	ILB_S_KSTAT(server, pkt_processed);
	ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);

	switch (rule->ir_topo) {
	case ILB_TOPO_IMPL_NAT: {
		ilb_nat_src_entry_t *src_ent;
		uint16_t *src_idx;

		/*
		 * We create a cache even if it is not a SYN segment.
		 * The server should return a RST.  When we see the
		 * RST, we will destroy this cache.  But by having
		 * a cache, we know how to NAT the returned RST.
		 */
		info.vip = *dst;
		info.dport = dport;
		info.src = *src;
		info.sport = sport;

		/* If stickiness is enabled, use the same source address */
		if (s != NULL)
			src_idx = &nat_src_idx;
		else
			src_idx = NULL;

		if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
		    &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
			/* Out of NAT source ports: drop and undo sticky. */
			if (s != NULL)
				ilb_sticky_refrele(s);
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, noport_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		info.src_ent = src_ent;
		info.nat_dst = server->iser_addr_v6;
		/* Shift the port only when both rule and server use ranges. */
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		/*
		 * If ilb_conn_add() fails, it will release the reference on
		 * sticky info and de-allocate the NAT source port allocated
		 * above.
		 */
		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
		ret = ILB_BALANCED;
		break;
	}
	case ILB_TOPO_IMPL_HALF_NAT:
		info.vip = *dst;
		info.nat_dst = server->iser_addr_v6;
		info.dport = dport;
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);

		ret = ILB_BALANCED;
		break;
	case ILB_TOPO_IMPL_DSR:
		/*
		 * By decrementing the sticky refcnt, the period of
		 * stickiness (life time of ilb_sticky_t) will be
		 * from now to (now + default expiry time).
		 */
		if (s != NULL)
			ilb_sticky_refrele(s);
		ret = ILB_BALANCED;
		break;
	default:
		cmn_err(CE_PANIC, "data corruption unknown topology: %p",
		    (void *) rule);
		break;
	}
	ILB_RULE_REFRELE(rule);
	return (ret);

no_server:
	/* This can only happen if there is no server available. */
	ILB_R_KSTAT(rule, pkt_dropped);
	ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
	ILB_RULE_REFRELE(rule);
	return (ILB_DROPPED);
}

/*
 * IPv4 entry point: map addresses to v4-mapped v6 form, punt ICMP to
 * ilb_icmp_v4(), otherwise run the common ilb_check().
 */
int
ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
    uint8_t *tph, ipaddr_t *lb_dst)
{
	in6_addr_t v6_src, v6_dst, v6_lb_dst;
	int ret;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMP) {
		return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
		    lb_dst));
	}

	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
	ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
	    tph, ntohs(ipha->ipha_length), &v6_lb_dst);
	if (ret == ILB_BALANCED)
		IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
	return (ret);
}

/*
 * IPv6 entry point: punt ICMPv6 to ilb_icmp_v6(), otherwise run the
 * common ilb_check() with the total packet length.
 */
int
ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
    uint8_t *tph, in6_addr_t *lb_dst)
{
	uint32_t pkt_len;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMPV6) {
		return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
		    lb_dst));
	}

	pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
	return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
	    IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
}

/* Count the rules belonging to the given zone. */
void
ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
{
	ilb_rule_t *tmp_rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	*num_rules = 0;
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid == zoneid)
			*num_rules += 1;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

/* Report the number of servers of the named rule via its kstat. */
int
ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    uint32_t *num_servers)
{
	ilb_rule_t *rule;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) ==
NULL) 1945 return (err); 1946 *num_servers = rule->ir_kstat.num_servers.value.ui64; 1947 ILB_RULE_REFRELE(rule); 1948 return (0); 1949 } 1950 1951 int 1952 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 1953 ilb_server_info_t *servers, uint32_t *num_servers) 1954 { 1955 ilb_rule_t *rule; 1956 ilb_server_t *server; 1957 size_t cnt; 1958 int err; 1959 1960 if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL) 1961 return (err); 1962 for (server = rule->ir_servers, cnt = *num_servers; 1963 server != NULL && cnt > 0; 1964 server = server->iser_next, cnt--, servers++) { 1965 (void) memcpy(servers->name, server->iser_name, 1966 ILB_SERVER_NAMESZ); 1967 servers->addr = server->iser_addr_v6; 1968 servers->min_port = htons(server->iser_min_port); 1969 servers->max_port = htons(server->iser_max_port); 1970 servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0; 1971 servers->err = 0; 1972 } 1973 ILB_RULE_REFRELE(rule); 1974 *num_servers -= cnt; 1975 1976 return (0); 1977 } 1978 1979 void 1980 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names, 1981 char *buf) 1982 { 1983 ilb_rule_t *tmp_rule; 1984 int cnt; 1985 1986 if (*num_names == 0) 1987 return; 1988 1989 mutex_enter(&ilbs->ilbs_g_lock); 1990 for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 1991 tmp_rule = tmp_rule->ir_next) { 1992 if (tmp_rule->ir_zoneid != zoneid) 1993 continue; 1994 1995 (void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ); 1996 buf += ILB_RULE_NAMESZ; 1997 if (++cnt == *num_names) 1998 break; 1999 } 2000 mutex_exit(&ilbs->ilbs_g_lock); 2001 *num_names = cnt; 2002 } 2003 2004 int 2005 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd) 2006 { 2007 ilb_rule_t *rule; 2008 int err; 2009 2010 if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) { 2011 return (err); 2012 } 2013 2014 /* 2015 * Except the enabled flags, none of the following will change 2016 * in the life time of a rule. 
So we don't hold the mutex when 2017 * reading them. The worst is to report a wrong enabled flags. 2018 */ 2019 cmd->ip_ver = rule->ir_ipver; 2020 cmd->proto = rule->ir_proto; 2021 cmd->min_port = htons(rule->ir_min_port); 2022 cmd->max_port = htons(rule->ir_max_port); 2023 2024 cmd->vip = rule->ir_target_v6; 2025 cmd->algo = rule->ir_alg_type; 2026 cmd->topo = rule->ir_topo; 2027 2028 cmd->nat_src_start = rule->ir_nat_src_start; 2029 cmd->nat_src_end = rule->ir_nat_src_end; 2030 2031 cmd->conn_drain_timeout = rule->ir_conn_drain_timeout; 2032 cmd->nat_expiry = rule->ir_nat_expiry; 2033 cmd->sticky_expiry = rule->ir_sticky_expiry; 2034 2035 cmd->flags = 0; 2036 if (rule->ir_flags & ILB_RULE_ENABLED) 2037 cmd->flags |= ILB_RULE_ENABLED; 2038 if (rule->ir_flags & ILB_RULE_STICKY) { 2039 cmd->flags |= ILB_RULE_STICKY; 2040 cmd->sticky_mask = rule->ir_sticky_mask; 2041 } 2042 2043 ILB_RULE_REFRELE(rule); 2044 return (0); 2045 } 2046 2047 static void * 2048 ilb_stack_init(netstackid_t stackid, netstack_t *ns) 2049 { 2050 ilb_stack_t *ilbs; 2051 char tq_name[TASKQ_NAMELEN]; 2052 2053 ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP); 2054 ilbs->ilbs_netstack = ns; 2055 2056 ilbs->ilbs_rule_head = NULL; 2057 ilbs->ilbs_g_hash = NULL; 2058 mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL); 2059 2060 ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP); 2061 if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) { 2062 kmem_free(ilbs, sizeof (ilb_stack_t)); 2063 return (NULL); 2064 } 2065 2066 /* 2067 * ilbs_conn/sticky_hash related info is initialized in 2068 * ilb_conn/sticky_hash_init(). 
 */
	ilbs->ilbs_conn_taskq = NULL;
	ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
	ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
	ilbs->ilbs_c2s_conn_hash = NULL;
	ilbs->ilbs_s2c_conn_hash = NULL;
	ilbs->ilbs_conn_timer_list = NULL;

	ilbs->ilbs_sticky_hash = NULL;
	ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
	ilbs->ilbs_sticky_timer_list = NULL;
	ilbs->ilbs_sticky_taskq = NULL;

	/* The allocation is done later when there is a rule using NAT mode. */
	ilbs->ilbs_nat_src = NULL;
	ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
	mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
	ilbs->ilbs_nat_src_tid = 0;

	/* For listing the conn hash table */
	mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_conn_list_busy = B_FALSE;
	ilbs->ilbs_conn_list_cur = 0;
	ilbs->ilbs_conn_list_connp = NULL;

	/* For listing the sticky hash table */
	mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_sticky_list_busy = B_FALSE;
	ilbs->ilbs_sticky_list_cur = 0;
	ilbs->ilbs_sticky_list_curp = NULL;

	/* Per-stack taskq for delayed rule/server removal. */
	(void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
	    (void *)ns);
	ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
	    minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);

	return (ilbs);
}

/*
 * Netstack shutdown: tear down the hashes and delete all rules
 * synchronously (ilb_rule_del_common(), not the taskq, since the stack
 * is going away).
 */
/* ARGSUSED */
static void
ilb_stack_shutdown(netstackid_t stackid, void *arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
	ilb_rule_t *tmp_rule;

	ilb_sticky_hash_fini(ilbs);
	ilb_conn_hash_fini(ilbs);
	mutex_enter(&ilbs->ilbs_g_lock);
	while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
		ilb_rule_hash_del(tmp_rule);
		ilb_rule_g_del(ilbs, tmp_rule);
		/* Drop the lock: ilb_rule_del_common() may block. */
		mutex_exit(&ilbs->ilbs_g_lock);
		ilb_rule_del_common(ilbs, tmp_rule);
		mutex_enter(&ilbs->ilbs_g_lock);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
	if (ilbs->ilbs_nat_src != NULL)
		ilb_nat_src_fini(ilbs);
}

/* Netstack destructor: free everything ilb_stack_init() allocated. */
static void
ilb_stack_fini(netstackid_t stackid, void * arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;

	ilb_rule_hash_fini(ilbs);
	taskq_destroy(ilbs->ilbs_rule_taskq);
	ilb_kstat_g_fini(stackid, ilbs);
	kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
	kmem_free(ilbs, sizeof (ilb_stack_t));
}

/* Module init: hook ILB into the netstack framework. */
void
ilb_ddi_g_init(void)
{
	netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
	    ilb_stack_fini);
}

/* Module fini: unhook from netstack and destroy the kmem caches. */
void
ilb_ddi_g_destroy(void)
{
	netstack_unregister(NS_ILB);
	ilb_conn_cache_fini();
	ilb_sticky_cache_fini();
}