xref: /illumos-gate/usr/src/uts/common/inet/ilb/ilb.c (revision f3041bfa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/sysmacros.h>
28 #include <sys/kmem.h>
29 #include <sys/ksynch.h>
30 #include <sys/systm.h>
31 #include <sys/socket.h>
32 #include <sys/disp.h>
33 #include <sys/taskq.h>
34 #include <sys/cmn_err.h>
35 #include <sys/strsun.h>
36 #include <sys/sdt.h>
37 #include <sys/atomic.h>
38 #include <netinet/in.h>
39 #include <inet/ip.h>
40 #include <inet/ip6.h>
41 #include <inet/tcp.h>
42 #include <inet/udp_impl.h>
43 #include <inet/kstatcom.h>
44 
45 #include <inet/ilb_ip.h>
46 #include "ilb_alg.h"
47 #include "ilb_nat.h"
48 #include "ilb_conn.h"
49 
/*
 * ILB kmem cache flag.
 * NOTE(review): not referenced in this part of the file; presumably used as
 * the flags argument when the ILB kmem caches are created -- confirm.
 */
int ilb_kmem_flags = 0;

/*
 * The default size for the different hash tables.  Global for all stacks.
 * But each stack has its own table, just that their sizes are the same.
 *
 * The rule hash index is computed by masking with (size - 1), see
 * ILB_RULE_HASH(), so the size actually used for the rule hash has to be a
 * power of 2.
 */
static size_t ilb_rule_hash_size = 2048;

static size_t ilb_conn_hash_size = 262144;

static size_t ilb_sticky_hash_size = 262144;

/* This should be a prime number. */
static size_t ilb_nat_src_hash_size = 97;

/*
 * Default NAT cache entry expiry time.  ilb_rule_add() uses these, chosen
 * by the rule's protocol, when the rule command leaves nat_expiry as 0.
 * NOTE(review): the unit is presumably seconds -- confirm.
 */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;

/*
 * Default sticky entry expiry time.  Used by ilb_rule_add() when the rule
 * command leaves sticky_expiry as 0.
 */
static uint32_t ilb_sticky_expiry = 60;
72 
/*
 * Hash the four bytes at addr into a rule hash bucket index.  addr is
 * assumed to be a uint8_t * to an ipaddr_t.  The bytes are combined as a
 * base 31 polynomial in which the byte at the lowest address has the
 * smallest weight, and the result is masked with (hash_size - 1), so
 * hash_size has to be a power of 2.  Note that addr is evaluated more than
 * once.
 */
#define	ILB_RULE_HASH(addr, hash_size) \
	(((((addr)[3] * 31 + (addr)[2]) * 31 + (addr)[1]) * 31 + (addr)[0]) & \
	((hash_size) - 1))
77 
78 /*
79  * Note on ILB delayed processing
80  *
81  * To avoid in line removal on some of the data structures, such as rules,
82  * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
83  * There are three types of ILB taskq:
84  *
 * 1. rule handling: created at stack initialization time, ilb_stack_init()
86  * 2. conn hash handling: created at conn hash initialization time,
87  *                        ilb_conn_hash_init()
88  * 3. sticky hash handling: created at sticky hash initialization time,
89  *                          ilb_sticky_hash_init()
90  *
91  * The rule taskq is for processing rule and server removal.  When a user
92  * land rule/server removal request comes in, a taskq is dispatched after
93  * removing the rule/server from all related hashes.  This taskq will wait
94  * until all references to the rule/server are gone before removing it.
95  * So the user land thread requesting the removal does not need to wait
96  * for the removal completion.
97  *
98  * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
99  * ilb_sticky_hash table entry removal.  There are ilb_conn_timer_size timers
100  * and ilb_sticky_timer_size timers running for ilb_conn_hash and
101  * ilb_sticky_hash cleanup respectively.   Each timer is responsible for one
102  * portion (same size) of the hash table.  When a timer fires, it dispatches
103  * a conn hash taskq to clean up its portion of the table.  This avoids in
104  * line processing of the removal.
105  *
106  * There is another delayed processing, the clean up of NAT source address
107  * table.  We just use the timer to directly handle it instead of using
108  * a taskq.  The reason is that the table is small so it is OK to use the
109  * timer.
110  */
111 
/*
 * ILB rule taskq constants.
 * NOTE(review): presumably the number of threads given to the rule taskq
 * when it is created; the creation call is not in this part of the file.
 */
#define	ILB_RULE_TASKQ_NUM_THR	20

/*
 * Argument passed to ILB rule taskq routines.  ilb_rule_del() allocates one
 * of these for each dispatched deletion and ilb_rule_del_tq() frees it.
 */
typedef	struct {
	ilb_stack_t	*ilbs;	/* stack handed on to ilb_rule_del_common() */
	ilb_rule_t	*rule;	/* rule to be deleted */
} ilb_rule_tq_t;
120 
121 /* kstat handling routines. */
122 static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
123 static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
124 static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
125 static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
126     ilb_server_t *);
127 
128 /* Rule hash handling routines. */
129 static void ilb_rule_hash_init(ilb_stack_t *);
130 static void ilb_rule_hash_fini(ilb_stack_t *);
131 static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
132 static void ilb_rule_hash_del(ilb_rule_t *);
133 static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
134     in_port_t, zoneid_t, uint32_t, boolean_t *);
135 
136 static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
137 static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
138 static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *);
139 static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
140     int *);
141 static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
142     int, in_port_t, in_port_t, const in6_addr_t *);
143 
144 /* Back end server handling routines. */
145 static void ilb_server_free(ilb_server_t *);
146 
147 /* Network stack handling routines. */
148 static void *ilb_stack_init(netstackid_t, netstack_t *);
149 static void ilb_stack_shutdown(netstackid_t, void *);
150 static void ilb_stack_fini(netstackid_t, void *);
151 
152 /* Sticky connection handling routines. */
153 static void ilb_rule_sticky_init(ilb_rule_t *);
154 static void ilb_rule_sticky_fini(ilb_rule_t *);
155 
/*
 * Handy macro to check for unspecified address.  addr is a pointer to an
 * in6_addr_t.  A V4 mapped address is tested with
 * IN6_IS_ADDR_V4MAPPED_ANY(), any other address with
 * IN6_IS_ADDR_UNSPECIFIED().  Note that addr is evaluated more than once.
 */
#define	IS_ADDR_UNSPEC(addr)						\
	(IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) :	\
	    IN6_IS_ADDR_UNSPECIFIED(addr))
160 
/*
 * Global kstat instance counter.  When a rule is created, ilb_rule_add()
 * atomically increments ilb_kstat_instance and uses the new value as the
 * rule's kstat instance number.  The kstats of the rule's servers are
 * created with the same instance number.
 */
static uint_t ilb_kstat_instance = 0;

/*
 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
 * A rule's kstat has ILB_RULE_KS_CNAME class name.  A server's kstat uses
 * the class name "<rule name>-sstat", see ilb_server_kstat_init().
 */
#define	ILB_G_KS_NAME		"global"
#define	ILB_G_KS_CNAME		"kstat"
#define	ILB_RULE_KS_CNAME	"rulestat"
175 
176 static kstat_t *
177 ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
178 {
179 	kstat_t *ksp;
180 	ilb_g_kstat_t template = {
181 		{ "num_rules",		KSTAT_DATA_UINT64, 0 },
182 		{ "ip_frag_in",		KSTAT_DATA_UINT64, 0 },
183 		{ "ip_frag_dropped",	KSTAT_DATA_UINT64, 0 }
184 	};
185 
186 	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
187 	    ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
188 	    KSTAT_FLAG_VIRTUAL, stackid);
189 	if (ksp == NULL)
190 		return (NULL);
191 	bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
192 	ksp->ks_data = ilbs->ilbs_kstat;
193 	ksp->ks_private = (void *)(uintptr_t)stackid;
194 
195 	kstat_install(ksp);
196 	return (ksp);
197 }
198 
199 static void
200 ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
201 {
202 	if (ilbs->ilbs_ksp != NULL) {
203 		ASSERT(stackid == (netstackid_t)(uintptr_t)
204 		    ilbs->ilbs_ksp->ks_private);
205 		kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
206 		ilbs->ilbs_ksp = NULL;
207 	}
208 }
209 
210 static kstat_t *
211 ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
212 {
213 	kstat_t *ksp;
214 	ilb_rule_kstat_t template = {
215 		{ "num_servers",		KSTAT_DATA_UINT64, 0 },
216 		{ "bytes_not_processed",	KSTAT_DATA_UINT64, 0 },
217 		{ "pkt_not_processed",		KSTAT_DATA_UINT64, 0 },
218 		{ "bytes_dropped",		KSTAT_DATA_UINT64, 0 },
219 		{ "pkt_dropped",		KSTAT_DATA_UINT64, 0 },
220 		{ "nomem_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
221 		{ "nomem_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
222 		{ "noport_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
223 		{ "noport_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
224 		{ "icmp_echo_processed",	KSTAT_DATA_UINT64, 0 },
225 		{ "icmp_dropped",		KSTAT_DATA_UINT64, 0 },
226 		{ "icmp_too_big_processed",	KSTAT_DATA_UINT64, 0 },
227 		{ "icmp_too_big_dropped",	KSTAT_DATA_UINT64, 0 }
228 	};
229 
230 	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
231 	    rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
232 	    NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
233 	if (ksp == NULL)
234 		return (NULL);
235 
236 	bcopy(&template, &rule->ir_kstat, sizeof (template));
237 	ksp->ks_data = &rule->ir_kstat;
238 	ksp->ks_private = (void *)(uintptr_t)stackid;
239 
240 	kstat_install(ksp);
241 	return (ksp);
242 }
243 
244 static kstat_t *
245 ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
246     ilb_server_t *server)
247 {
248 	kstat_t *ksp;
249 	ilb_server_kstat_t template = {
250 		{ "bytes_processed",	KSTAT_DATA_UINT64, 0 },
251 		{ "pkt_processed",	KSTAT_DATA_UINT64, 0 },
252 		{ "ip_address",		KSTAT_DATA_STRING, 0 }
253 	};
254 	char cname_buf[KSTAT_STRLEN];
255 
256 	/* 7 is "-sstat" */
257 	ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
258 	(void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
259 	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
260 	    server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
261 	    NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
262 	if (ksp == NULL)
263 		return (NULL);
264 
265 	bcopy(&template, &server->iser_kstat, sizeof (template));
266 	ksp->ks_data = &server->iser_kstat;
267 	ksp->ks_private = (void *)(uintptr_t)stackid;
268 
269 	kstat_named_setstr(&server->iser_kstat.ip_address,
270 	    server->iser_ip_addr);
271 	/* We never change the IP address */
272 	ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;
273 
274 	kstat_install(ksp);
275 	return (ksp);
276 }
277 
278 /* Initialize the rule hash table. */
279 static void
280 ilb_rule_hash_init(ilb_stack_t *ilbs)
281 {
282 	int i;
283 
284 	/*
285 	 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
286 	 * the next power of 2.
287 	 */
288 	if (!ISP2(ilbs->ilbs_rule_hash_size)) {
289 		for (i = 0; i < 31; i++) {
290 			if (ilbs->ilbs_rule_hash_size < (1 << i))
291 				break;
292 		}
293 		ilbs->ilbs_rule_hash_size = 1 << i;
294 	}
295 	ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
296 	    ilbs->ilbs_rule_hash_size, KM_SLEEP);
297 	for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
298 		mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
299 		    MUTEX_DEFAULT, NULL);
300 	}
301 }
302 
303 /* Clean up the rule hash table. */
304 static void
305 ilb_rule_hash_fini(ilb_stack_t *ilbs)
306 {
307 	if (ilbs->ilbs_g_hash == NULL)
308 		return;
309 	kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
310 	    ilbs->ilbs_rule_hash_size);
311 }
312 
313 /* Add a rule to the rule hash table. */
314 static void
315 ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
316 {
317 	int i;
318 
319 	i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
320 	    ilbs->ilbs_rule_hash_size);
321 	DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
322 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
323 	rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
324 	if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
325 		ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
326 	rule->ir_hash_prev = NULL;
327 	ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;
328 
329 	rule->ir_hash = &ilbs->ilbs_g_hash[i];
330 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
331 }
332 
333 /*
334  * Remove a rule from the rule hash table.  Note that the rule is not freed
335  * in this routine.
336  */
337 static void
338 ilb_rule_hash_del(ilb_rule_t *rule)
339 {
340 	mutex_enter(&rule->ir_hash->ilb_hash_lock);
341 	if (rule->ir_hash->ilb_hash_rule == rule) {
342 		rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
343 		if (rule->ir_hash_next != NULL)
344 			rule->ir_hash_next->ir_hash_prev = NULL;
345 	} else {
346 		if (rule->ir_hash_prev != NULL)
347 			rule->ir_hash_prev->ir_hash_next =
348 			    rule->ir_hash_next;
349 		if (rule->ir_hash_next != NULL) {
350 			rule->ir_hash_next->ir_hash_prev =
351 			    rule->ir_hash_prev;
352 		}
353 	}
354 	mutex_exit(&rule->ir_hash->ilb_hash_lock);
355 
356 	rule->ir_hash_next = NULL;
357 	rule->ir_hash_prev = NULL;
358 	rule->ir_hash = NULL;
359 }
360 
361 /*
362  * Given the info of a packet, look for a match in the rule hash table.
363  */
364 static ilb_rule_t *
365 ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
366     in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
367 {
368 	int i;
369 	ilb_rule_t *rule;
370 	ipaddr_t v4_addr;
371 
372 	*busy = B_FALSE;
373 	IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
374 	i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
375 	port = ntohs(port);
376 
377 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
378 	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
379 	    rule = rule->ir_hash_next) {
380 		if (!rule->ir_port_range) {
381 			if (rule->ir_min_port != port)
382 				continue;
383 		} else {
384 			if (port < rule->ir_min_port ||
385 			    port > rule->ir_max_port) {
386 				continue;
387 			}
388 		}
389 		if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
390 		    rule->ir_zoneid != zoneid) {
391 			continue;
392 		}
393 
394 		if (l3 == IPPROTO_IP) {
395 			if (rule->ir_target_v4 != INADDR_ANY &&
396 			    rule->ir_target_v4 != v4_addr) {
397 				continue;
398 			}
399 		} else {
400 			if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
401 			    !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
402 				continue;
403 			}
404 		}
405 
406 		/*
407 		 * Just update the stats if the rule is disabled.
408 		 */
409 		mutex_enter(&rule->ir_lock);
410 		if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
411 			ILB_R_KSTAT(rule, pkt_not_processed);
412 			ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
413 			mutex_exit(&rule->ir_lock);
414 			rule = NULL;
415 			break;
416 		} else if (rule->ir_flags & ILB_RULE_BUSY) {
417 			/*
418 			 * If we are busy...
419 			 *
420 			 * XXX we should have a queue to postpone the
421 			 * packet processing.  But this requires a
422 			 * mechanism in IP to re-start the packet
423 			 * processing.  So for now, just drop the packet.
424 			 */
425 			ILB_R_KSTAT(rule, pkt_dropped);
426 			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
427 			mutex_exit(&rule->ir_lock);
428 			*busy = B_TRUE;
429 			rule = NULL;
430 			break;
431 		} else {
432 			rule->ir_refcnt++;
433 			ASSERT(rule->ir_refcnt != 1);
434 			mutex_exit(&rule->ir_lock);
435 			break;
436 		}
437 	}
438 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
439 	return (rule);
440 }
441 
442 /*
443  * Add a rule to the global rule list.  This list is for finding all rules
444  * in an IP stack.  The caller is assumed to hold the ilbs_g_lock.
445  */
446 static void
447 ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
448 {
449 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
450 	rule->ir_next = ilbs->ilbs_rule_head;
451 	ilbs->ilbs_rule_head = rule;
452 	ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
453 }
454 
455 /* The call is assumed to hold the ilbs_g_lock. */
456 static void
457 ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
458 {
459 	ilb_rule_t *tmp_rule;
460 	ilb_rule_t *prev_rule;
461 
462 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
463 	prev_rule = NULL;
464 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
465 	    prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
466 		if (tmp_rule == rule)
467 			break;
468 	}
469 	if (tmp_rule == NULL) {
470 		mutex_exit(&ilbs->ilbs_g_lock);
471 		return;
472 	}
473 	if (prev_rule == NULL)
474 		ilbs->ilbs_rule_head = tmp_rule->ir_next;
475 	else
476 		prev_rule->ir_next = tmp_rule->ir_next;
477 	ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
478 }
479 
480 /*
481  * Helper routine to calculate how many source addresses are in a given
482  * range.
483  */
484 static int64_t
485 num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
486 {
487 	int64_t ret;
488 	uint32_t addr1, addr2;
489 
490 	/*
491 	 * Here we assume that the max number of NAT source cannot be
492 	 * large such that the most significant 2 s6_addr32 must be
493 	 * equal.
494 	 */
495 	addr1 = ntohl(a1->s6_addr32[3]);
496 	addr2 = ntohl(a2->s6_addr32[3]);
497 	if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
498 	    a1->s6_addr32[1] != a2->s6_addr32[1] ||
499 	    a1->s6_addr32[2] > a2->s6_addr32[2] ||
500 	    (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
501 		return (-1);
502 	}
503 	if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
504 		return (addr2 - addr1 + 1);
505 	} else {
506 		ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
507 		ret <<= 32;
508 		ret = ret + addr1 - addr2;
509 		return (ret + 1);
510 	}
511 }
512 
513 /*
514  * Add an ILB rule.
515  */
516 int
517 ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
518 {
519 	ilb_rule_t *rule;
520 	netstackid_t stackid;
521 	int ret;
522 	in_port_t min_port, max_port;
523 	int64_t num_src;
524 
525 	/* Sanity checks. */
526 	if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
527 		return (EINVAL);
528 
529 	/* Need to support SCTP... */
530 	if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
531 		return (EINVAL);
532 
533 	/* For full NAT, the NAT source must be supplied. */
534 	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
535 		if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
536 		    IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
537 			return (EINVAL);
538 		}
539 	}
540 
541 	/* Check invalid mask */
542 	if ((cmd->flags & ILB_RULE_STICKY) &&
543 	    IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
544 		return (EINVAL);
545 	}
546 
547 	/* Port is passed in network byte order. */
548 	min_port = ntohs(cmd->min_port);
549 	max_port = ntohs(cmd->max_port);
550 	if (min_port > max_port)
551 		return (EINVAL);
552 
553 	/* min_port == 0 means "all ports". Make it so */
554 	if (min_port == 0) {
555 		min_port = 1;
556 		max_port = 65535;
557 	}
558 
559 	/* Funny address checking. */
560 	if (cmd->ip_ver == IPPROTO_IP) {
561 		in_addr_t v4_addr1, v4_addr2;
562 
563 		v4_addr1 = cmd->vip.s6_addr32[3];
564 		if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
565 		    CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
566 		    v4_addr1 == INADDR_ANY ||
567 		    !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
568 			return (EINVAL);
569 		}
570 
571 		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
572 			v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
573 			v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
574 			if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
575 			    (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
576 			    v4_addr1 == INADDR_BROADCAST ||
577 			    v4_addr2 == INADDR_BROADCAST ||
578 			    v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
579 			    CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
580 			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
581 			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
582 				return (EINVAL);
583 			}
584 
585 			num_src = v4_addr2 - v4_addr1 + 1;
586 			if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
587 				return (EINVAL);
588 		}
589 	} else {
590 		if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
591 		    IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
592 		    IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
593 		    IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
594 			return (EINVAL);
595 		}
596 
597 		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
598 			if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
599 			    IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
600 			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
601 			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
602 			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
603 			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
604 			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
605 			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
606 				return (EINVAL);
607 			}
608 
609 			if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
610 			    &cmd->nat_src_end)) < 0 ||
611 			    num_src > ILB_MAX_NAT_SRC) {
612 				return (EINVAL);
613 			}
614 		}
615 	}
616 
617 	mutex_enter(&ilbs->ilbs_g_lock);
618 	if (ilbs->ilbs_g_hash == NULL)
619 		ilb_rule_hash_init(ilbs);
620 	if (ilbs->ilbs_c2s_conn_hash == NULL) {
621 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
622 		ilb_conn_hash_init(ilbs);
623 		ilb_nat_src_init(ilbs);
624 	}
625 
626 	/* Make sure that the new rule does not duplicate an existing one. */
627 	if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
628 	    min_port, max_port, &cmd->vip)) {
629 		mutex_exit(&ilbs->ilbs_g_lock);
630 		return (EEXIST);
631 	}
632 
633 	rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
634 	if (rule == NULL) {
635 		mutex_exit(&ilbs->ilbs_g_lock);
636 		return (ENOMEM);
637 	}
638 
639 	/* ir_name is all 0 to begin with */
640 	(void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);
641 
642 	rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance);
643 	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
644 	if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
645 		ret = ENOMEM;
646 		goto error;
647 	}
648 
649 	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
650 		rule->ir_nat_src_start = cmd->nat_src_start;
651 		rule->ir_nat_src_end = cmd->nat_src_end;
652 	}
653 
654 	rule->ir_ipver = cmd->ip_ver;
655 	rule->ir_proto = cmd->proto;
656 	rule->ir_topo = cmd->topo;
657 
658 	rule->ir_min_port = min_port;
659 	rule->ir_max_port = max_port;
660 	if (rule->ir_min_port != rule->ir_max_port)
661 		rule->ir_port_range = B_TRUE;
662 	else
663 		rule->ir_port_range = B_FALSE;
664 
665 	rule->ir_zoneid = zoneid;
666 
667 	rule->ir_target_v6 = cmd->vip;
668 	rule->ir_servers = NULL;
669 
670 	/*
671 	 * The default connection drain timeout is indefinite (value 0),
672 	 * meaning we will wait for all connections to finish.  So we
673 	 * can assign cmd->conn_drain_timeout to it directly.
674 	 */
675 	rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
676 	if (cmd->nat_expiry != 0) {
677 		rule->ir_nat_expiry = cmd->nat_expiry;
678 	} else {
679 		switch (rule->ir_proto) {
680 		case IPPROTO_TCP:
681 			rule->ir_nat_expiry = ilb_conn_tcp_expiry;
682 			break;
683 		case IPPROTO_UDP:
684 			rule->ir_nat_expiry = ilb_conn_udp_expiry;
685 			break;
686 		default:
687 			cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
688 			    (void *)rule);
689 			break;
690 		}
691 	}
692 	if (cmd->sticky_expiry != 0)
693 		rule->ir_sticky_expiry = cmd->sticky_expiry;
694 	else
695 		rule->ir_sticky_expiry = ilb_sticky_expiry;
696 
697 	if (cmd->flags & ILB_RULE_STICKY) {
698 		rule->ir_flags |= ILB_RULE_STICKY;
699 		rule->ir_sticky_mask = cmd->sticky_mask;
700 		if (ilbs->ilbs_sticky_hash == NULL)
701 			ilb_sticky_hash_init(ilbs);
702 	}
703 	if (cmd->flags & ILB_RULE_ENABLED)
704 		rule->ir_flags |= ILB_RULE_ENABLED;
705 
706 	mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
707 	cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);
708 
709 	rule->ir_refcnt = 1;
710 
711 	switch (cmd->algo) {
712 	case ILB_ALG_IMPL_ROUNDROBIN:
713 		if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
714 			ret = ENOMEM;
715 			goto error;
716 		}
717 		rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
718 		break;
719 	case ILB_ALG_IMPL_HASH_IP:
720 	case ILB_ALG_IMPL_HASH_IP_SPORT:
721 	case ILB_ALG_IMPL_HASH_IP_VIP:
722 		if ((rule->ir_alg = ilb_alg_hash_init(rule,
723 		    &cmd->algo)) == NULL) {
724 			ret = ENOMEM;
725 			goto error;
726 		}
727 		rule->ir_alg_type = cmd->algo;
728 		break;
729 	default:
730 		ret = EINVAL;
731 		goto error;
732 	}
733 
734 	/* Add it to the global list and hash array at the end. */
735 	ilb_rule_g_add(ilbs, rule);
736 	ilb_rule_hash_add(ilbs, rule, &cmd->vip);
737 
738 	mutex_exit(&ilbs->ilbs_g_lock);
739 
740 	return (0);
741 
742 error:
743 	mutex_exit(&ilbs->ilbs_g_lock);
744 	if (rule->ir_ksp != NULL) {
745 		/* stackid must be initialized if ir_ksp != NULL */
746 		kstat_delete_netstack(rule->ir_ksp, stackid);
747 	}
748 	kmem_free(rule, sizeof (ilb_rule_t));
749 	return (ret);
750 }
751 
752 /*
753  * The final part in deleting a rule.  Either called directly or by the
754  * taskq dispatched.
755  */
756 static void
757 ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
758 {
759 	netstackid_t stackid;
760 	ilb_server_t *server;
761 
762 	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
763 
764 	/*
765 	 * Let the algorithm know that the rule is going away.  The
766 	 * algorithm fini routine will free all its resources with this
767 	 * rule.
768 	 */
769 	tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);
770 
771 	while ((server = tmp_rule->ir_servers) != NULL) {
772 		mutex_enter(&server->iser_lock);
773 		ilb_destroy_nat_src(&server->iser_nat_src);
774 		if (tmp_rule->ir_conn_drain_timeout != 0) {
775 			/*
776 			 * The garbage collection thread checks this value
777 			 * without grabing a lock.  So we need to use
778 			 * atomic_swap_64() to make sure that the value seen
779 			 * by gc thread is intact.
780 			 */
781 			(void) atomic_swap_64(
782 			    (uint64_t *)&server->iser_die_time,
783 			    ddi_get_lbolt64() +
784 			    SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
785 		}
786 		while (server->iser_refcnt > 1)
787 			cv_wait(&server->iser_cv, &server->iser_lock);
788 		tmp_rule->ir_servers = server->iser_next;
789 		kstat_delete_netstack(server->iser_ksp, stackid);
790 		kmem_free(server, sizeof (ilb_server_t));
791 	}
792 
793 	ASSERT(tmp_rule->ir_ksp != NULL);
794 	kstat_delete_netstack(tmp_rule->ir_ksp, stackid);
795 
796 	kmem_free(tmp_rule, sizeof (ilb_rule_t));
797 }
798 
799 /* The routine executed by the delayed rule taskq. */
800 static void
801 ilb_rule_del_tq(void *arg)
802 {
803 	ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
804 	ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;
805 
806 	mutex_enter(&rule->ir_lock);
807 	while (rule->ir_refcnt > 1)
808 		cv_wait(&rule->ir_cv, &rule->ir_lock);
809 	ilb_rule_del_common(ilbs, rule);
810 	kmem_free(arg, sizeof (ilb_rule_tq_t));
811 }
812 
813 /* Routine to delete a rule. */
814 int
815 ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
816 {
817 	ilb_rule_t *tmp_rule;
818 	ilb_rule_tq_t *arg;
819 	int err;
820 
821 	mutex_enter(&ilbs->ilbs_g_lock);
822 	if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
823 	    &err)) == NULL) {
824 		mutex_exit(&ilbs->ilbs_g_lock);
825 		return (err);
826 	}
827 
828 	/*
829 	 * First remove the rule from the hash array and the global list so
830 	 * that no one can find this rule any more.
831 	 */
832 	ilb_rule_hash_del(tmp_rule);
833 	ilb_rule_g_del(ilbs, tmp_rule);
834 	mutex_exit(&ilbs->ilbs_g_lock);
835 	ILB_RULE_REFRELE(tmp_rule);
836 
837 	/*
838 	 * Now no one can find this rule, we can remove it once all
839 	 * references to it are dropped and all references to the list
840 	 * of servers are dropped.  So dispatch a task to finish the deletion.
841 	 * We do this instead of letting the last one referencing the
842 	 * rule do it.  The reason is that the last one may be the
843 	 * interrupt thread.  We want to minimize the work it needs to
844 	 * do.  Rule deletion is not a critical task so it can be delayed.
845 	 */
846 	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
847 	arg->ilbs = ilbs;
848 	arg->rule = tmp_rule;
849 	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
850 	    TQ_SLEEP);
851 
852 	return (0);
853 }
854 
855 /*
856  * Given an IP address, check to see if there is a rule using this
857  * as the VIP.  It can be used to check if we need to drop a fragment.
858  */
859 boolean_t
860 ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
861 {
862 	int i;
863 	ilb_rule_t *rule;
864 	boolean_t ret = B_FALSE;
865 
866 	i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
867 	    ilbs->ilbs_rule_hash_size);
868 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
869 	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
870 	    rule = rule->ir_hash_next) {
871 		if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
872 			mutex_enter(&rule->ir_lock);
873 			if (rule->ir_flags & ILB_RULE_BUSY) {
874 				mutex_exit(&rule->ir_lock);
875 				break;
876 			}
877 			if (ret_rule != NULL) {
878 				rule->ir_refcnt++;
879 				mutex_exit(&rule->ir_lock);
880 				*ret_rule = rule;
881 			} else {
882 				mutex_exit(&rule->ir_lock);
883 			}
884 			ret = B_TRUE;
885 			break;
886 		}
887 	}
888 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
889 	return (ret);
890 }
891 
892 boolean_t
893 ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
894 {
895 	int i;
896 	ilb_rule_t *rule;
897 	boolean_t ret = B_FALSE;
898 
899 	i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
900 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
901 	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
902 	    rule = rule->ir_hash_next) {
903 		if (rule->ir_target_v6.s6_addr32[3] == addr) {
904 			mutex_enter(&rule->ir_lock);
905 			if (rule->ir_flags & ILB_RULE_BUSY) {
906 				mutex_exit(&rule->ir_lock);
907 				break;
908 			}
909 			if (ret_rule != NULL) {
910 				rule->ir_refcnt++;
911 				mutex_exit(&rule->ir_lock);
912 				*ret_rule = rule;
913 			} else {
914 				mutex_exit(&rule->ir_lock);
915 			}
916 			ret = B_TRUE;
917 			break;
918 		}
919 	}
920 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
921 	return (ret);
922 }
923 
924 static ilb_rule_t *
925 ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
926     int *err)
927 {
928 	ilb_rule_t *tmp_rule;
929 
930 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
931 
932 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
933 	    tmp_rule = tmp_rule->ir_next) {
934 		if (tmp_rule->ir_zoneid != zoneid)
935 			continue;
936 		if (strcasecmp(tmp_rule->ir_name, name) == 0) {
937 			mutex_enter(&tmp_rule->ir_lock);
938 			if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
939 				mutex_exit(&tmp_rule->ir_lock);
940 				*err = EINPROGRESS;
941 				return (NULL);
942 			}
943 			tmp_rule->ir_refcnt++;
944 			mutex_exit(&tmp_rule->ir_lock);
945 			*err = 0;
946 			return (tmp_rule);
947 		}
948 	}
949 	*err = ENOENT;
950 	return (NULL);
951 }
952 
953 /* To find a rule with a given name and zone in the global rule list. */
954 ilb_rule_t *
955 ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
956     int *err)
957 {
958 	ilb_rule_t *tmp_rule;
959 
960 	mutex_enter(&ilbs->ilbs_g_lock);
961 	tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
962 	mutex_exit(&ilbs->ilbs_g_lock);
963 	return (tmp_rule);
964 }
965 
966 /* Try to match the given packet info and zone ID with a rule. */
967 static boolean_t
968 ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
969     int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
970 {
971 	ilb_rule_t *tmp_rule;
972 
973 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
974 
975 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
976 	    tmp_rule = tmp_rule->ir_next) {
977 		if (tmp_rule->ir_zoneid != zoneid)
978 			continue;
979 
980 		/*
981 		 * We don't allow the same name in different rules even if all
982 		 * the other rule components are different.
983 		 */
984 		if (strcasecmp(tmp_rule->ir_name, name) == 0)
985 			return (B_TRUE);
986 
987 		if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
988 			continue;
989 
990 		/*
991 		 * ir_min_port and ir_max_port are the same if ir_port_range
992 		 * is false.  In this case, if the ir_min|max_port (same) is
993 		 * outside of the given port range, it is OK.  In other cases,
994 		 * check if min and max port are outside a rule's range.
995 		 */
996 		if (tmp_rule->ir_max_port < min_port ||
997 		    tmp_rule->ir_min_port > max_port) {
998 			continue;
999 		}
1000 
1001 		/*
1002 		 * If l3 is IPv4, the addr passed in is assumed to be
1003 		 * mapped address.
1004 		 */
1005 		if (V6_OR_V4_INADDR_ANY(*addr) ||
1006 		    V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
1007 		    IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
1008 			return (B_TRUE);
1009 		}
1010 	}
1011 	return (B_FALSE);
1012 }
1013 
1014 int
1015 ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
1016     const char *rule_name, ilb_rule_t *in_rule)
1017 {
1018 	ilb_rule_t *rule;
1019 	int err;
1020 
1021 	ASSERT((in_rule == NULL && rule_name != NULL) ||
1022 	    (in_rule != NULL && rule_name == NULL));
1023 	if ((rule = in_rule) == NULL) {
1024 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1025 		    &err)) == NULL) {
1026 			return (err);
1027 		}
1028 	}
1029 	mutex_enter(&rule->ir_lock);
1030 	rule->ir_flags |= ILB_RULE_ENABLED;
1031 	mutex_exit(&rule->ir_lock);
1032 
1033 	/* Only refrele if the rule is passed in. */
1034 	if (in_rule == NULL)
1035 		ILB_RULE_REFRELE(rule);
1036 	return (0);
1037 }
1038 
1039 int
1040 ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
1041     const char *rule_name, ilb_rule_t *in_rule)
1042 {
1043 	ilb_rule_t *rule;
1044 	int err;
1045 
1046 	ASSERT((in_rule == NULL && rule_name != NULL) ||
1047 	    (in_rule != NULL && rule_name == NULL));
1048 	if ((rule = in_rule) == NULL) {
1049 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1050 		    &err)) == NULL) {
1051 			return (err);
1052 		}
1053 	}
1054 	mutex_enter(&rule->ir_lock);
1055 	rule->ir_flags &= ~ILB_RULE_ENABLED;
1056 	mutex_exit(&rule->ir_lock);
1057 
1058 	/* Only refrele if the rule is passed in. */
1059 	if (in_rule == NULL)
1060 		ILB_RULE_REFRELE(rule);
1061 	return (0);
1062 }
1063 
1064 /*
1065  * XXX We should probably have a walker function to walk all rules.  For
1066  * now, just add a simple loop for enable/disable/del.
1067  */
1068 void
1069 ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1070 {
1071 	ilb_rule_t *rule;
1072 
1073 	mutex_enter(&ilbs->ilbs_g_lock);
1074 	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
1075 		if (rule->ir_zoneid != zoneid)
1076 			continue;
1077 		/*
1078 		 * No need to hold the rule as we are holding the global
1079 		 * lock so it won't go away.  Ignore the return value here
1080 		 * as the rule is provided so the call cannot fail.
1081 		 */
1082 		(void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
1083 	}
1084 	mutex_exit(&ilbs->ilbs_g_lock);
1085 }
1086 
1087 void
1088 ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1089 {
1090 	ilb_rule_t *rule;
1091 
1092 	mutex_enter(&ilbs->ilbs_g_lock);
1093 	for (rule = ilbs->ilbs_rule_head; rule != NULL;
1094 	    rule = rule->ir_next) {
1095 		if (rule->ir_zoneid != zoneid)
1096 			continue;
1097 		(void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
1098 	}
1099 	mutex_exit(&ilbs->ilbs_g_lock);
1100 }
1101 
1102 void
1103 ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1104 {
1105 	ilb_rule_t *rule;
1106 	ilb_rule_tq_t *arg;
1107 
1108 	mutex_enter(&ilbs->ilbs_g_lock);
1109 	while ((rule = ilbs->ilbs_rule_head) != NULL) {
1110 		if (rule->ir_zoneid != zoneid)
1111 			continue;
1112 		ilb_rule_hash_del(rule);
1113 		ilb_rule_g_del(ilbs, rule);
1114 		mutex_exit(&ilbs->ilbs_g_lock);
1115 
1116 		arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
1117 		arg->ilbs = ilbs;
1118 		arg->rule = rule;
1119 		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
1120 		    arg, TQ_SLEEP);
1121 
1122 		mutex_enter(&ilbs->ilbs_g_lock);
1123 	}
1124 	mutex_exit(&ilbs->ilbs_g_lock);
1125 }
1126 
1127 /*
1128  * This is just an optimization, so don't grab the global lock.  The
1129  * worst case is that we missed a couple packets.
1130  */
1131 boolean_t
1132 ilb_has_rules(ilb_stack_t *ilbs)
1133 {
1134 	return (ilbs->ilbs_rule_head != NULL);
1135 }
1136 
1137 
1138 static int
1139 ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1140     ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
1141 {
1142 	ilb_server_t *tmp_server;
1143 	int ret;
1144 
1145 	ASSERT((rule == NULL && rule_name != NULL) ||
1146 	    (rule != NULL && rule_name == NULL));
1147 
1148 	if (rule == NULL) {
1149 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1150 		    &ret)) == NULL) {
1151 			return (ret);
1152 		}
1153 	}
1154 
1155 	/* Once we get a hold on the rule, no server can be added/deleted. */
1156 	for (tmp_server = rule->ir_servers; tmp_server != NULL;
1157 	    tmp_server = tmp_server->iser_next) {
1158 		if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
1159 			break;
1160 	}
1161 	if (tmp_server == NULL) {
1162 		ret = ENOENT;
1163 		goto done;
1164 	}
1165 
1166 	if (enable) {
1167 		ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
1168 		    rule->ir_alg->ilb_alg_data);
1169 		if (ret == 0) {
1170 			tmp_server->iser_enabled = B_TRUE;
1171 			tmp_server->iser_die_time = 0;
1172 		}
1173 	} else {
1174 		ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
1175 		    rule->ir_alg->ilb_alg_data);
1176 		if (ret == 0) {
1177 			tmp_server->iser_enabled = B_FALSE;
1178 			if (rule->ir_conn_drain_timeout != 0) {
1179 				(void) atomic_swap_64(
1180 				    (uint64_t *)&tmp_server->iser_die_time,
1181 				    ddi_get_lbolt64() + SEC_TO_TICK(
1182 				    rule->ir_conn_drain_timeout));
1183 			}
1184 		}
1185 	}
1186 
1187 done:
1188 	if (rule_name != NULL)
1189 		ILB_RULE_REFRELE(rule);
1190 	return (ret);
1191 }
1192 int
1193 ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1194     ilb_rule_t *rule, in6_addr_t *addr)
1195 {
1196 	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
1197 }
1198 
1199 int
1200 ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1201     ilb_rule_t *rule, in6_addr_t *addr)
1202 {
1203 	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
1204 }
1205 
1206 /*
1207  * Add a back end server to a rule.  If the address is IPv4, it is assumed
1208  * to be passed in as a mapped address.
1209  */
1210 int
1211 ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
1212 {
1213 	ilb_server_t	*server;
1214 	netstackid_t	stackid;
1215 	int		ret = 0;
1216 	in_port_t	min_port, max_port;
1217 	in_port_t	range;
1218 
1219 	/* Port is passed in network byte order. */
1220 	min_port = ntohs(info->min_port);
1221 	max_port = ntohs(info->max_port);
1222 	if (min_port > max_port)
1223 		return (EINVAL);
1224 
1225 	/* min_port == 0 means "all ports". Make it so */
1226 	if (min_port == 0) {
1227 		min_port = 1;
1228 		max_port = 65535;
1229 	}
1230 	range = max_port - min_port;
1231 
1232 	mutex_enter(&rule->ir_lock);
1233 	/* If someone is already doing server add/del, sleeps and wait. */
1234 	while (rule->ir_flags & ILB_RULE_BUSY) {
1235 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1236 			mutex_exit(&rule->ir_lock);
1237 			return (EINTR);
1238 		}
1239 	}
1240 
1241 	/*
1242 	 * Set the rule to be busy to make sure that no new packet can
1243 	 * use this rule.
1244 	 */
1245 	rule->ir_flags |= ILB_RULE_BUSY;
1246 
1247 	/* Now wait for all other guys to finish their work. */
1248 	while (rule->ir_refcnt > 2) {
1249 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1250 			mutex_exit(&rule->ir_lock);
1251 			ret = EINTR;
1252 			goto end;
1253 		}
1254 	}
1255 	mutex_exit(&rule->ir_lock);
1256 
1257 	/* Sanity checks... */
1258 	if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1259 	    rule->ir_ipver != IPPROTO_IP) ||
1260 	    (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1261 	    rule->ir_ipver != IPPROTO_IPV6)) {
1262 		ret = EINVAL;
1263 		goto end;
1264 	}
1265 
1266 	/*
1267 	 * Check for valid port range.
1268 	 *
1269 	 * For DSR, there can be no port shifting.  Hence the server
1270 	 * specification must be the same as the rule's.
1271 	 *
1272 	 * For half-NAT/NAT, the range must either be 0 (port collapsing) or
1273 	 * it must be equal to the same value as the rule port range.
1274 	 *
1275 	 */
1276 	if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
1277 		if (rule->ir_max_port != max_port ||
1278 		    rule->ir_min_port != min_port) {
1279 			ret = EINVAL;
1280 			goto end;
1281 		}
1282 	} else {
1283 		if ((range != rule->ir_max_port - rule->ir_min_port) &&
1284 		    range != 0) {
1285 			ret = EINVAL;
1286 			goto end;
1287 		}
1288 	}
1289 
1290 	/* Check for duplicate. */
1291 	for (server = rule->ir_servers; server != NULL;
1292 	    server = server->iser_next) {
1293 		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
1294 		    strcasecmp(server->iser_name, info->name) == 0) {
1295 			break;
1296 		}
1297 	}
1298 	if (server != NULL) {
1299 		ret = EEXIST;
1300 		goto end;
1301 	}
1302 
1303 	if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
1304 		ret = ENOMEM;
1305 		goto end;
1306 	}
1307 
1308 	(void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
1309 	(void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
1310 	    sizeof (server->iser_ip_addr));
1311 	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
1312 	server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
1313 	if (server->iser_ksp == NULL) {
1314 		kmem_free(server, sizeof (ilb_server_t));
1315 		ret = EINVAL;
1316 		goto end;
1317 	}
1318 
1319 	server->iser_stackid = stackid;
1320 	server->iser_addr_v6 = info->addr;
1321 	server->iser_min_port = min_port;
1322 	server->iser_max_port = max_port;
1323 	if (min_port != max_port)
1324 		server->iser_port_range = B_TRUE;
1325 	else
1326 		server->iser_port_range = B_FALSE;
1327 
1328 	/*
1329 	 * If the rule uses NAT, find/create the NAT source entry to use
1330 	 * for this server.
1331 	 */
1332 	if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
1333 		in_port_t port;
1334 
1335 		/*
1336 		 * If the server uses a port range, our port allocation
1337 		 * scheme needs to treat it as a wildcard.  Refer to the
1338 		 * comments in ilb_nat.c about the scheme.
1339 		 */
1340 		if (server->iser_port_range)
1341 			port = 0;
1342 		else
1343 			port = server->iser_min_port;
1344 
1345 		if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
1346 		    &server->iser_addr_v6, port, &rule->ir_nat_src_start,
1347 		    num_nat_src_v6(&rule->ir_nat_src_start,
1348 		    &rule->ir_nat_src_end))) != 0) {
1349 			kstat_delete_netstack(server->iser_ksp, stackid);
1350 			kmem_free(server, sizeof (ilb_server_t));
1351 			goto end;
1352 		}
1353 	}
1354 
1355 	/*
1356 	 * The iser_lock is only used to protect iser_refcnt.  All the other
1357 	 * fields in ilb_server_t should not change, except for iser_enabled.
1358 	 * The worst thing that can happen if iser_enabled is messed up is
1359 	 * that one or two packets may not be load balanced to a server
1360 	 * correctly.
1361 	 */
1362 	server->iser_refcnt = 1;
1363 	server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
1364 	    B_FALSE;
1365 	mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
1366 	cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);
1367 
1368 	/* Let the load balancing algorithm know about the addition. */
1369 	ASSERT(rule->ir_alg != NULL);
1370 	if ((ret = rule->ir_alg->ilb_alg_server_add(server,
1371 	    rule->ir_alg->ilb_alg_data)) != 0) {
1372 		kstat_delete_netstack(server->iser_ksp, stackid);
1373 		kmem_free(server, sizeof (ilb_server_t));
1374 		goto end;
1375 	}
1376 
1377 	/*
1378 	 * No need to hold ir_lock since no other thread should manipulate
1379 	 * the following fields until ILB_RULE_BUSY is cleared.
1380 	 */
1381 	if (rule->ir_servers == NULL) {
1382 		server->iser_next = NULL;
1383 	} else {
1384 		server->iser_next = rule->ir_servers;
1385 	}
1386 	rule->ir_servers = server;
1387 	ILB_R_KSTAT(rule, num_servers);
1388 
1389 end:
1390 	mutex_enter(&rule->ir_lock);
1391 	rule->ir_flags &= ~ILB_RULE_BUSY;
1392 	cv_signal(&rule->ir_cv);
1393 	mutex_exit(&rule->ir_lock);
1394 	return (ret);
1395 }
1396 
1397 /* The routine executed by the delayed rule processing taskq. */
1398 static void
1399 ilb_server_del_tq(void *arg)
1400 {
1401 	ilb_server_t *server = (ilb_server_t *)arg;
1402 
1403 	mutex_enter(&server->iser_lock);
1404 	while (server->iser_refcnt > 1)
1405 		cv_wait(&server->iser_cv, &server->iser_lock);
1406 	kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1407 	kmem_free(server, sizeof (ilb_server_t));
1408 }
1409 
1410 /*
1411  * Delete a back end server from a rule.  If the address is IPv4, it is assumed
1412  * to be passed in as a mapped address.
1413  */
1414 int
1415 ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1416     ilb_rule_t *rule, in6_addr_t *addr)
1417 {
1418 	ilb_server_t	*server;
1419 	ilb_server_t	*prev_server;
1420 	int		ret = 0;
1421 
1422 	ASSERT((rule == NULL && rule_name != NULL) ||
1423 	    (rule != NULL && rule_name == NULL));
1424 	if (rule == NULL) {
1425 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1426 		    &ret)) == NULL) {
1427 			return (ret);
1428 		}
1429 	}
1430 
1431 	mutex_enter(&rule->ir_lock);
1432 	/* If someone is already doing server add/del, sleeps and wait. */
1433 	while (rule->ir_flags & ILB_RULE_BUSY) {
1434 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1435 			if (rule_name != NULL) {
1436 				if (--rule->ir_refcnt <= 2)
1437 					cv_signal(&rule->ir_cv);
1438 			}
1439 			mutex_exit(&rule->ir_lock);
1440 			return (EINTR);
1441 		}
1442 	}
1443 	/*
1444 	 * Set the rule to be busy to make sure that no new packet can
1445 	 * use this rule.
1446 	 */
1447 	rule->ir_flags |= ILB_RULE_BUSY;
1448 
1449 	/* Now wait for all other guys to finish their work. */
1450 	while (rule->ir_refcnt > 2) {
1451 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1452 			mutex_exit(&rule->ir_lock);
1453 			ret = EINTR;
1454 			goto end;
1455 		}
1456 	}
1457 	mutex_exit(&rule->ir_lock);
1458 
1459 	prev_server = NULL;
1460 	for (server = rule->ir_servers; server != NULL;
1461 	    prev_server = server, server = server->iser_next) {
1462 		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
1463 			break;
1464 	}
1465 	if (server == NULL) {
1466 		ret = ENOENT;
1467 		goto end;
1468 	}
1469 
1470 	/*
1471 	 * Let the load balancing algorithm know about the removal.
1472 	 * The algorithm may disallow the removal...
1473 	 */
1474 	if ((ret = rule->ir_alg->ilb_alg_server_del(server,
1475 	    rule->ir_alg->ilb_alg_data)) != 0) {
1476 		goto end;
1477 	}
1478 
1479 	if (prev_server == NULL)
1480 		rule->ir_servers = server->iser_next;
1481 	else
1482 		prev_server->iser_next = server->iser_next;
1483 
1484 	ILB_R_KSTAT_UPDATE(rule, num_servers, -1);
1485 
1486 	/*
1487 	 * Mark the server as disabled so that if there is any sticky cache
1488 	 * using this server around, it won't be used.
1489 	 */
1490 	server->iser_enabled = B_FALSE;
1491 
1492 	mutex_enter(&server->iser_lock);
1493 
1494 	/*
1495 	 * De-allocate the NAT source array.  The indiviual ilb_nat_src_entry_t
1496 	 * may not go away if there is still a conn using it.  The NAT source
1497 	 * timer will do the garbage collection.
1498 	 */
1499 	ilb_destroy_nat_src(&server->iser_nat_src);
1500 
1501 	/* If there is a hard limit on when a server should die, set it. */
1502 	if (rule->ir_conn_drain_timeout != 0) {
1503 		(void) atomic_swap_64((uint64_t *)&server->iser_die_time,
1504 		    ddi_get_lbolt64() +
1505 		    SEC_TO_TICK(rule->ir_conn_drain_timeout));
1506 	}
1507 
1508 	if (server->iser_refcnt > 1) {
1509 		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
1510 		    server, TQ_SLEEP);
1511 		mutex_exit(&server->iser_lock);
1512 	} else {
1513 		kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1514 		kmem_free(server, sizeof (ilb_server_t));
1515 	}
1516 
1517 end:
1518 	mutex_enter(&rule->ir_lock);
1519 	rule->ir_flags &= ~ILB_RULE_BUSY;
1520 	if (rule_name != NULL)
1521 		rule->ir_refcnt--;
1522 	cv_signal(&rule->ir_cv);
1523 	mutex_exit(&rule->ir_lock);
1524 	return (ret);
1525 }
1526 
1527 /*
1528  * First check if the destination of the ICMP message matches a VIP of
1529  * a rule.  If it does not, just return ILB_PASSED.
1530  *
1531  * If the destination matches a VIP:
1532  *
1533  * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
1534  * server.
1535  *
1536  * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload
1537  * and see which back end server we should send this message to.  And we
1538  * need to do NAT on both the payload message and the outside IP packet.
1539  *
1540  * For other ICMP messages, drop them.
1541  */
1542 /* ARGSUSED */
1543 static int
1544 ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1545     icmph_t *icmph, ipaddr_t *lb_dst)
1546 {
1547 	ipaddr_t vip;
1548 	ilb_rule_t *rule;
1549 	in6_addr_t addr6;
1550 
1551 	if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
1552 		return (ILB_PASSED);
1553 
1554 
1555 	if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
1556 		ILB_R_KSTAT(rule, icmp_dropped);
1557 		ILB_RULE_REFRELE(rule);
1558 		return (ILB_DROPPED);
1559 	}
1560 
1561 	switch (icmph->icmph_type) {
1562 	case ICMP_ECHO_REQUEST:
1563 		ILB_R_KSTAT(rule, icmp_echo_processed);
1564 		ILB_RULE_REFRELE(rule);
1565 
1566 		icmph->icmph_type = ICMP_ECHO_REPLY;
1567 		icmph->icmph_checksum = 0;
1568 		icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1569 		ipha->ipha_ttl =
1570 		    ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
1571 		*lb_dst = ipha->ipha_src;
1572 		vip = ipha->ipha_dst;
1573 		ipha->ipha_dst = ipha->ipha_src;
1574 		ipha->ipha_src = vip;
1575 		return (ILB_BALANCED);
1576 	case ICMP_DEST_UNREACHABLE: {
1577 		int ret;
1578 
1579 		if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
1580 			ILB_R_KSTAT(rule, icmp_dropped);
1581 			ILB_RULE_REFRELE(rule);
1582 			return (ILB_DROPPED);
1583 		}
1584 		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
1585 		    &addr6)) {
1586 			ILB_R_KSTAT(rule, icmp_2big_processed);
1587 			ret = ILB_BALANCED;
1588 		} else {
1589 			ILB_R_KSTAT(rule, icmp_2big_dropped);
1590 			ret = ILB_DROPPED;
1591 		}
1592 		ILB_RULE_REFRELE(rule);
1593 		IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
1594 		return (ret);
1595 	}
1596 	default:
1597 		ILB_R_KSTAT(rule, icmp_dropped);
1598 		ILB_RULE_REFRELE(rule);
1599 		return (ILB_DROPPED);
1600 	}
1601 }
1602 
1603 /* ARGSUSED */
1604 static int
1605 ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
1606     icmp6_t *icmp6, in6_addr_t *lb_dst)
1607 {
1608 	ilb_rule_t *rule;
1609 
1610 	if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
1611 		return (ILB_PASSED);
1612 
1613 	if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
1614 		ILB_R_KSTAT(rule, icmp_dropped);
1615 		ILB_RULE_REFRELE(rule);
1616 		return (ILB_DROPPED);
1617 	}
1618 
1619 	switch (icmp6->icmp6_type) {
1620 	case ICMP6_ECHO_REQUEST: {
1621 		int hdr_len;
1622 
1623 		ILB_R_KSTAT(rule, icmp_echo_processed);
1624 		ILB_RULE_REFRELE(rule);
1625 
1626 		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
1627 		icmp6->icmp6_cksum = ip6h->ip6_plen;
1628 		hdr_len = (char *)icmp6 - (char *)ip6h;
1629 		icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
1630 		    ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
1631 		ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
1632 		ip6h->ip6_hops =
1633 		    ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
1634 		*lb_dst = ip6h->ip6_src;
1635 		ip6h->ip6_src = ip6h->ip6_dst;
1636 		ip6h->ip6_dst = *lb_dst;
1637 		return (ILB_BALANCED);
1638 	}
1639 	case ICMP6_PACKET_TOO_BIG: {
1640 		int ret;
1641 
1642 		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
1643 		    lb_dst)) {
1644 			ILB_R_KSTAT(rule, icmp_2big_processed);
1645 			ret = ILB_BALANCED;
1646 		} else {
1647 			ILB_R_KSTAT(rule, icmp_2big_dropped);
1648 			ret = ILB_DROPPED;
1649 		}
1650 		ILB_RULE_REFRELE(rule);
1651 		return (ret);
1652 	}
1653 	default:
1654 		ILB_R_KSTAT(rule, icmp_dropped);
1655 		ILB_RULE_REFRELE(rule);
1656 		return (ILB_DROPPED);
1657 	}
1658 }
1659 
/*
 * Common routine to check an incoming packet and decide what to do with it.
 * Called by ilb_check_v4|v6().
 *
 * src and dst are the packet's addresses (the IPv4 caller passes them as
 * mapped addresses), l3 and l4 identify the IP version and transport
 * protocol, iph and tph point to the IP and transport headers, and pkt_len
 * is the packet length used for the byte counters.
 *
 * Returns:
 *	ILB_PASSED	the transport protocol is neither TCP nor UDP, or
 *			no rule matches the packet
 *	ILB_BALANCED	the packet belongs to an existing conn, or a rule
 *			matched and a server was chosen; *lb_dst is set
 *	ILB_DROPPED	the transport header extends past mp->b_wptr, the
 *			matching rule is busy, no server is available, or
 *			a needed resource could not be allocated
 */
static int
ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
    in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
    in6_addr_t *lb_dst)
{
	in_port_t		sport, dport;
	tcpha_t			*tcph;
	udpha_t			*udph;
	ilb_rule_t		*rule;
	ilb_server_t		*server;
	boolean_t		balanced;
	struct ilb_sticky_s	*s = NULL;
	int			ret;
	uint32_t		ip_sum, tp_sum;
	ilb_nat_info_t		info;
	uint16_t		nat_src_idx;
	boolean_t		busy;

	/*
	 * We don't really need to switch here since both protocols's
	 * ports are at the same offset.  Just prepare for future protocol
	 * specific processing.
	 *
	 * The ports are taken from the header as is, without any byte
	 * order conversion.
	 */
	switch (l4) {
	case IPPROTO_TCP:
		if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
			return (ILB_DROPPED);
		tcph = (tcpha_t *)tph;
		sport = tcph->tha_lport;
		dport = tcph->tha_fport;
		break;
	case IPPROTO_UDP:
		if (tph + sizeof (udpha_t) > mp->b_wptr)
			return (ILB_DROPPED);
		udph = (udpha_t *)tph;
		sport = udph->uha_src_port;
		dport = udph->uha_dst_port;
		break;
	default:
		return (ILB_PASSED);
	}

	/* Fast path, there is an existing conn. */
	if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
	    pkt_len, lb_dst)) {
		return (ILB_BALANCED);
	}

	/*
	 * If there is no existing connection for the incoming packet, check
	 * to see if the packet matches a rule.  If not, just let IP decide
	 * what to do with it.
	 *
	 * Note: a reply from back end server should not match a rule.  A
	 * reply should match one existing conn.
	 */
	rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
	    pkt_len, &busy);
	if (rule == NULL) {
		/* If the rule is busy, just drop the packet. */
		if (busy)
			return (ILB_DROPPED);
		else
			return (ILB_PASSED);
	}

	/*
	 * The packet matches a rule, use the rule load balance algorithm
	 * to find a server.  From here on, every exit path releases the
	 * rule with ILB_RULE_REFRELE().
	 */
	balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
	    rule->ir_alg->ilb_alg_data, &server);
	/*
	 * This can only happen if there is no server in a rule or all
	 * the servers are currently disabled.
	 */
	if (!balanced)
		goto no_server;

	/*
	 * If the rule is sticky enabled, we need to check the sticky table.
	 * If there is a sticky entry for the client, use the previous server
	 * instead of the one found above (note that both can be the same).
	 * If there is no entry for that client, add an entry to the sticky
	 * table.  Both the find and add are done in ilb_sticky_find_add()
	 * to avoid checking for duplicate when adding an entry.
	 */
	if (rule->ir_flags & ILB_RULE_STICKY) {
		in6_addr_t addr;

		/* The sticky key is the client address under the mask. */
		V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
		if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
		    &s, &nat_src_idx)) == NULL) {
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			goto no_server;
		}
	}

	/*
	 * We are holding a reference on the rule, so the server
	 * cannot go away.
	 */
	*lb_dst = server->iser_addr_v6;
	ILB_S_KSTAT(server, pkt_processed);
	ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);

	switch (rule->ir_topo) {
	case ILB_TOPO_IMPL_NAT: {
		ilb_nat_src_entry_t	*src_ent;
		uint16_t		*src_idx;

		/*
		 * We create a cache even if it is not a SYN segment.
		 * The server should return a RST.  When we see the
		 * RST, we will destroy this cache.  But by having
		 * a cache, we know how to NAT the returned RST.
		 */
		info.vip = *dst;
		info.dport = dport;
		info.src = *src;
		info.sport = sport;

		/* If stickiness is enabled, use the same source address */
		if (s != NULL)
			src_idx = &nat_src_idx;
		else
			src_idx = NULL;

		if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
		    &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
			/* No NAT source available: give the sticky ref back. */
			if (s != NULL)
				ilb_sticky_refrele(s);
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, noport_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		info.src_ent = src_ent;
		info.nat_dst = server->iser_addr_v6;
		/*
		 * With port ranges on both sides, keep the packet's offset
		 * within the rule's range; otherwise use the server's
		 * (first) port.
		 */
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		/*
		 * If ilb_conn_add() fails, it will release the reference on
		 * sticky info and de-allocate the NAT source port allocated
		 * above.
		 */
		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
		ret = ILB_BALANCED;
		break;
	}
	case ILB_TOPO_IMPL_HALF_NAT:
		info.vip = *dst;
		info.nat_dst = server->iser_addr_v6;
		info.dport = dport;
		/* Same destination port mapping as in the NAT case above. */
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);

		ret = ILB_BALANCED;
		break;
	case ILB_TOPO_IMPL_DSR:
		/*
		 * By decrementing the sticky refcnt, the period of
		 * stickiness (life time of ilb_sticky_t) will be
		 * from now to (now + default expiry time).
		 */
		if (s != NULL)
			ilb_sticky_refrele(s);
		ret = ILB_BALANCED;
		break;
	default:
		/*
		 * ret is not assigned on this path; a CE_PANIC cmn_err()
		 * is not expected to return.
		 */
		cmn_err(CE_PANIC, "data corruption unknown topology: %p",
		    (void *) rule);
		break;
	}
	ILB_RULE_REFRELE(rule);
	return (ret);

no_server:
	/* This can only happen if there is no server available. */
	ILB_R_KSTAT(rule, pkt_dropped);
	ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
	ILB_RULE_REFRELE(rule);
	return (ILB_DROPPED);
}
1880 
1881 int
1882 ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
1883     uint8_t *tph, ipaddr_t *lb_dst)
1884 {
1885 	in6_addr_t v6_src, v6_dst, v6_lb_dst;
1886 	int ret;
1887 
1888 	ASSERT(DB_REF(mp) == 1);
1889 
1890 	if (l4 == IPPROTO_ICMP) {
1891 		return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
1892 		    lb_dst));
1893 	}
1894 
1895 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
1896 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
1897 	ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
1898 	    tph, ntohs(ipha->ipha_length), &v6_lb_dst);
1899 	if (ret == ILB_BALANCED)
1900 		IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
1901 	return (ret);
1902 }
1903 
1904 int
1905 ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
1906     uint8_t *tph, in6_addr_t *lb_dst)
1907 {
1908 	uint32_t pkt_len;
1909 
1910 	ASSERT(DB_REF(mp) == 1);
1911 
1912 	if (l4 == IPPROTO_ICMPV6) {
1913 		return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
1914 		    lb_dst));
1915 	}
1916 
1917 	pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
1918 	return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
1919 	    IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
1920 }
1921 
1922 void
1923 ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
1924 {
1925 	ilb_rule_t *tmp_rule;
1926 
1927 	mutex_enter(&ilbs->ilbs_g_lock);
1928 	*num_rules = 0;
1929 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1930 	    tmp_rule = tmp_rule->ir_next) {
1931 		if (tmp_rule->ir_zoneid == zoneid)
1932 			*num_rules += 1;
1933 	}
1934 	mutex_exit(&ilbs->ilbs_g_lock);
1935 }
1936 
1937 int
1938 ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1939     uint32_t *num_servers)
1940 {
1941 	ilb_rule_t *rule;
1942 	int err;
1943 
1944 	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1945 		return (err);
1946 	*num_servers = rule->ir_kstat.num_servers.value.ui64;
1947 	ILB_RULE_REFRELE(rule);
1948 	return (0);
1949 }
1950 
1951 int
1952 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1953     ilb_server_info_t *servers, uint32_t *num_servers)
1954 {
1955 	ilb_rule_t *rule;
1956 	ilb_server_t *server;
1957 	size_t cnt;
1958 	int err;
1959 
1960 	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1961 		return (err);
1962 	for (server = rule->ir_servers, cnt = *num_servers;
1963 	    server != NULL && cnt > 0;
1964 	    server = server->iser_next, cnt--, servers++) {
1965 		(void) memcpy(servers->name, server->iser_name,
1966 		    ILB_SERVER_NAMESZ);
1967 		servers->addr = server->iser_addr_v6;
1968 		servers->min_port = htons(server->iser_min_port);
1969 		servers->max_port = htons(server->iser_max_port);
1970 		servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
1971 		servers->err = 0;
1972 	}
1973 	ILB_RULE_REFRELE(rule);
1974 	*num_servers -= cnt;
1975 
1976 	return (0);
1977 }
1978 
1979 void
1980 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
1981     char *buf)
1982 {
1983 	ilb_rule_t *tmp_rule;
1984 	int cnt;
1985 
1986 	if (*num_names == 0)
1987 		return;
1988 
1989 	mutex_enter(&ilbs->ilbs_g_lock);
1990 	for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1991 	    tmp_rule = tmp_rule->ir_next) {
1992 		if (tmp_rule->ir_zoneid != zoneid)
1993 			continue;
1994 
1995 		(void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
1996 		buf += ILB_RULE_NAMESZ;
1997 		if (++cnt == *num_names)
1998 			break;
1999 	}
2000 	mutex_exit(&ilbs->ilbs_g_lock);
2001 	*num_names = cnt;
2002 }
2003 
2004 int
2005 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
2006 {
2007 	ilb_rule_t *rule;
2008 	int err;
2009 
2010 	if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
2011 		return (err);
2012 	}
2013 
2014 	/*
2015 	 * Except the enabled flags, none of the following will change
2016 	 * in the life time of a rule.  So we don't hold the mutex when
2017 	 * reading them.  The worst is to report a wrong enabled flags.
2018 	 */
2019 	cmd->ip_ver = rule->ir_ipver;
2020 	cmd->proto = rule->ir_proto;
2021 	cmd->min_port = htons(rule->ir_min_port);
2022 	cmd->max_port = htons(rule->ir_max_port);
2023 
2024 	cmd->vip = rule->ir_target_v6;
2025 	cmd->algo = rule->ir_alg_type;
2026 	cmd->topo = rule->ir_topo;
2027 
2028 	cmd->nat_src_start = rule->ir_nat_src_start;
2029 	cmd->nat_src_end = rule->ir_nat_src_end;
2030 
2031 	cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
2032 	cmd->nat_expiry = rule->ir_nat_expiry;
2033 	cmd->sticky_expiry = rule->ir_sticky_expiry;
2034 
2035 	cmd->flags = 0;
2036 	if (rule->ir_flags & ILB_RULE_ENABLED)
2037 		cmd->flags |= ILB_RULE_ENABLED;
2038 	if (rule->ir_flags & ILB_RULE_STICKY) {
2039 		cmd->flags |= ILB_RULE_STICKY;
2040 		cmd->sticky_mask = rule->ir_sticky_mask;
2041 	}
2042 
2043 	ILB_RULE_REFRELE(rule);
2044 	return (0);
2045 }
2046 
2047 static void *
2048 ilb_stack_init(netstackid_t stackid, netstack_t *ns)
2049 {
2050 	ilb_stack_t *ilbs;
2051 	char tq_name[TASKQ_NAMELEN];
2052 
2053 	ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
2054 	ilbs->ilbs_netstack = ns;
2055 
2056 	ilbs->ilbs_rule_head = NULL;
2057 	ilbs->ilbs_g_hash = NULL;
2058 	mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);
2059 
2060 	ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
2061 	if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
2062 		kmem_free(ilbs, sizeof (ilb_stack_t));
2063 		return (NULL);
2064 	}
2065 
2066 	/*
2067 	 * ilbs_conn/sticky_hash related info is initialized in
2068 	 * ilb_conn/sticky_hash_init().
2069 	 */
2070 	ilbs->ilbs_conn_taskq = NULL;
2071 	ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
2072 	ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
2073 	ilbs->ilbs_c2s_conn_hash = NULL;
2074 	ilbs->ilbs_s2c_conn_hash = NULL;
2075 	ilbs->ilbs_conn_timer_list = NULL;
2076 
2077 	ilbs->ilbs_sticky_hash = NULL;
2078 	ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
2079 	ilbs->ilbs_sticky_timer_list = NULL;
2080 	ilbs->ilbs_sticky_taskq = NULL;
2081 
2082 	/* The allocation is done later when there is a rule using NAT mode. */
2083 	ilbs->ilbs_nat_src = NULL;
2084 	ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
2085 	mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
2086 	ilbs->ilbs_nat_src_tid = 0;
2087 
2088 	/* For listing the conn hash table */
2089 	mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
2090 	cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
2091 	ilbs->ilbs_conn_list_busy = B_FALSE;
2092 	ilbs->ilbs_conn_list_cur = 0;
2093 	ilbs->ilbs_conn_list_connp = NULL;
2094 
2095 	/* For listing the sticky hash table */
2096 	mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
2097 	cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
2098 	ilbs->ilbs_sticky_list_busy = B_FALSE;
2099 	ilbs->ilbs_sticky_list_cur = 0;
2100 	ilbs->ilbs_sticky_list_curp = NULL;
2101 
2102 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
2103 	    (void *)ns);
2104 	ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
2105 	    minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
2106 
2107 	return (ilbs);
2108 }
2109 
2110 /* ARGSUSED */
2111 static void
2112 ilb_stack_shutdown(netstackid_t stackid, void *arg)
2113 {
2114 	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2115 	ilb_rule_t *tmp_rule;
2116 
2117 	ilb_sticky_hash_fini(ilbs);
2118 	ilb_conn_hash_fini(ilbs);
2119 	mutex_enter(&ilbs->ilbs_g_lock);
2120 	while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
2121 		ilb_rule_hash_del(tmp_rule);
2122 		ilb_rule_g_del(ilbs, tmp_rule);
2123 		mutex_exit(&ilbs->ilbs_g_lock);
2124 		ilb_rule_del_common(ilbs, tmp_rule);
2125 		mutex_enter(&ilbs->ilbs_g_lock);
2126 	}
2127 	mutex_exit(&ilbs->ilbs_g_lock);
2128 	if (ilbs->ilbs_nat_src != NULL)
2129 		ilb_nat_src_fini(ilbs);
2130 }
2131 
2132 static void
2133 ilb_stack_fini(netstackid_t stackid, void * arg)
2134 {
2135 	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2136 
2137 	ilb_rule_hash_fini(ilbs);
2138 	taskq_destroy(ilbs->ilbs_rule_taskq);
2139 	ilb_kstat_g_fini(stackid, ilbs);
2140 	kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
2141 	kmem_free(ilbs, sizeof (ilb_stack_t));
2142 }
2143 
2144 void
2145 ilb_ddi_g_init(void)
2146 {
2147 	netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
2148 	    ilb_stack_fini);
2149 }
2150 
2151 void
2152 ilb_ddi_g_destroy(void)
2153 {
2154 	netstack_unregister(NS_ILB);
2155 	ilb_conn_cache_fini();
2156 	ilb_sticky_cache_fini();
2157 }
2158