1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <sys/sysmacros.h>
28 #include <sys/kmem.h>
29 #include <sys/ksynch.h>
30 #include <sys/systm.h>
31 #include <sys/socket.h>
32 #include <sys/disp.h>
33 #include <sys/taskq.h>
34 #include <sys/cmn_err.h>
35 #include <sys/strsun.h>
36 #include <sys/sdt.h>
37 #include <sys/atomic.h>
38 #include <netinet/in.h>
39 #include <inet/ip.h>
40 #include <inet/ip6.h>
41 #include <inet/tcp.h>
42 #include <inet/udp_impl.h>
43 #include <inet/kstatcom.h>
44
45 #include <inet/ilb_ip.h>
46 #include "ilb_alg.h"
47 #include "ilb_nat.h"
48 #include "ilb_conn.h"
49
50 /* ILB kmem cache flag */
51 int ilb_kmem_flags = 0;
52
53 /*
54  * The default sizes for the different hash tables. These are global for all
55  * stacks; each stack has its own tables, but they all use the same sizes.
56 */
57 static size_t ilb_rule_hash_size = 2048;
58
59 static size_t ilb_conn_hash_size = 262144;
60
61 static size_t ilb_sticky_hash_size = 262144;
62
63 /* This should be a prime number. */
64 static size_t ilb_nat_src_hash_size = 97;
65
66 /* Default NAT cache entry expiry time. */
67 static uint32_t ilb_conn_tcp_expiry = 120;
68 static uint32_t ilb_conn_udp_expiry = 60;
69
70 /* Default sticky entry expiry time. */
71 static uint32_t ilb_sticky_expiry = 60;
72
73 /* addr is assumed to be a uint8_t * pointing to an ipaddr_t. */
74 #define ILB_RULE_HASH(addr, hash_size) \
75 ((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
76 *(addr)) & ((hash_size) - 1))
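/*
 * The multipliers above are powers of 31 (31^3 = 29791, 31^2 = 961), so for
 * an IPv4 address in network byte order this is the usual 31-based polynomial
 * hash of its four bytes, masked down to the table size.  For example,
 * hashing 10.0.0.1 into a table of size 2048 gives
 *	1 * 29791 + 0 * 961 + 0 * 31 + 10 = 29801,	29801 & 2047 = 1129.
 */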
77
78 /*
79 * Note on ILB delayed processing
80 *
81  * To avoid in-line removal of some of the data structures, such as rules,
82 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
83 * There are three types of ILB taskq:
84 *
85  * 1. rule handling: created at stack initialization time, ilb_stack_init()
86 * 2. conn hash handling: created at conn hash initialization time,
87 * ilb_conn_hash_init()
88 * 3. sticky hash handling: created at sticky hash initialization time,
89 * ilb_sticky_hash_init()
90 *
91  * The rule taskq is for processing rule and server removal. When a user
92  * land rule/server removal request comes in, a task is dispatched to the
93  * taskq after the rule/server has been removed from all related hashes.
94  * This task waits until all references to the rule/server are gone before
95  * freeing it, so the user land thread requesting the removal does not
96  * need to wait for the removal to complete.
97 *
98 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
99 * ilb_sticky_hash table entry removal. There are ilb_conn_timer_size timers
100 * and ilb_sticky_timer_size timers running for ilb_conn_hash and
101 * ilb_sticky_hash cleanup respectively. Each timer is responsible for one
102  * portion (of equal size) of the hash table. When a timer fires, it
103  * dispatches a task to the corresponding taskq to clean up its portion of
104  * the table. This avoids in-line processing of the removal.
105 *
106  * There is one other piece of delayed processing, the cleanup of the NAT
107  * source address table. That table is handled directly from the timer
108  * instead of through a taskq; the table is small, so doing the work in
109  * the timer is acceptable.
110 */
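/*
 * As a rough sketch, the delayed rule removal described above flows as
 * follows (see ilb_rule_del() and ilb_rule_del_tq() below):
 *
 *	ilb_rule_del()
 *	    -> ilb_rule_hash_del() + ilb_rule_g_del()  (rule is now unfindable)
 *	    -> taskq_dispatch(ilbs_rule_taskq, ilb_rule_del_tq, ...)
 *	ilb_rule_del_tq()                              (taskq context)
 *	    -> wait until ir_refcnt drops to 1
 *	    -> ilb_rule_del_common()                   (free servers, kstats, rule)
 */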
111
112 /* ILB rule taskq constants. */
113 #define ILB_RULE_TASKQ_NUM_THR 20
114
115 /* Argument passed to ILB rule taskq routines. */
116 typedef struct {
117 ilb_stack_t *ilbs;
118 ilb_rule_t *rule;
119 } ilb_rule_tq_t;
120
121 /* kstat handling routines. */
122 static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
123 static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
124 static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
125 static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
126 ilb_server_t *);
127
128 /* Rule hash handling routines. */
129 static void ilb_rule_hash_init(ilb_stack_t *);
130 static void ilb_rule_hash_fini(ilb_stack_t *);
131 static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
132 static void ilb_rule_hash_del(ilb_rule_t *);
133 static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
134 in_port_t, zoneid_t, uint32_t, boolean_t *);
135
136 static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
137 static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
138 static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *);
139 static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
140 int *);
141 static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
142 int, in_port_t, in_port_t, const in6_addr_t *);
143
144 /* Back end server handling routines. */
145 static void ilb_server_free(ilb_server_t *);
146
147 /* Network stack handling routines. */
148 static void *ilb_stack_init(netstackid_t, netstack_t *);
149 static void ilb_stack_shutdown(netstackid_t, void *);
150 static void ilb_stack_fini(netstackid_t, void *);
151
152 /* Sticky connection handling routines. */
153 static void ilb_rule_sticky_init(ilb_rule_t *);
154 static void ilb_rule_sticky_fini(ilb_rule_t *);
155
156 /* Handy macro to check for an unspecified address. */
157 #define IS_ADDR_UNSPEC(addr) \
158 (IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) : \
159 IN6_IS_ADDR_UNSPECIFIED(addr))
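/*
 * Note that IS_ADDR_UNSPEC() is true both for the IPv6 unspecified address
 * (::) and for the IPv4-mapped form of INADDR_ANY (::ffff:0.0.0.0), since
 * IPv4 addresses are passed around in this file as mapped addresses.
 */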
160
161 /*
162 * Global kstat instance counter. When a rule is created, its kstat instance
163 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is
164 * incremented.
165 */
166 static uint_t ilb_kstat_instance = 0;
167
168 /*
169 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
170 * A rule's kstat has ILB_RULE_KS_CNAME class name.
171 */
172 #define ILB_G_KS_NAME "global"
173 #define ILB_G_KS_CNAME "kstat"
174 #define ILB_RULE_KS_CNAME "rulestat"
175
176 static kstat_t *
177 ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
178 {
179 kstat_t *ksp;
180 ilb_g_kstat_t template = {
181 { "num_rules", KSTAT_DATA_UINT64, 0 },
182 { "ip_frag_in", KSTAT_DATA_UINT64, 0 },
183 { "ip_frag_dropped", KSTAT_DATA_UINT64, 0 }
184 };
185
186 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
187 ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
188 KSTAT_FLAG_VIRTUAL, stackid);
189 if (ksp == NULL)
190 return (NULL);
191 bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
192 ksp->ks_data = ilbs->ilbs_kstat;
193 ksp->ks_private = (void *)(uintptr_t)stackid;
194
195 kstat_install(ksp);
196 return (ksp);
197 }
198
199 static void
200 ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
201 {
202 if (ilbs->ilbs_ksp != NULL) {
203 ASSERT(stackid == (netstackid_t)(uintptr_t)
204 ilbs->ilbs_ksp->ks_private);
205 kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
206 ilbs->ilbs_ksp = NULL;
207 }
208 }
209
210 static kstat_t *
211 ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
212 {
213 kstat_t *ksp;
214 ilb_rule_kstat_t template = {
215 { "num_servers", KSTAT_DATA_UINT64, 0 },
216 { "bytes_not_processed", KSTAT_DATA_UINT64, 0 },
217 { "pkt_not_processed", KSTAT_DATA_UINT64, 0 },
218 { "bytes_dropped", KSTAT_DATA_UINT64, 0 },
219 { "pkt_dropped", KSTAT_DATA_UINT64, 0 },
220 { "nomem_bytes_dropped", KSTAT_DATA_UINT64, 0 },
221 { "nomem_pkt_dropped", KSTAT_DATA_UINT64, 0 },
222 { "noport_bytes_dropped", KSTAT_DATA_UINT64, 0 },
223 { "noport_pkt_dropped", KSTAT_DATA_UINT64, 0 },
224 { "icmp_echo_processed", KSTAT_DATA_UINT64, 0 },
225 { "icmp_dropped", KSTAT_DATA_UINT64, 0 },
226 { "icmp_too_big_processed", KSTAT_DATA_UINT64, 0 },
227 { "icmp_too_big_dropped", KSTAT_DATA_UINT64, 0 }
228 };
229
230 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
231 rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
232 NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
233 if (ksp == NULL)
234 return (NULL);
235
236 bcopy(&template, &rule->ir_kstat, sizeof (template));
237 ksp->ks_data = &rule->ir_kstat;
238 ksp->ks_private = (void *)(uintptr_t)stackid;
239
240 kstat_install(ksp);
241 return (ksp);
242 }
243
244 static kstat_t *
245 ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
246 ilb_server_t *server)
247 {
248 kstat_t *ksp;
249 ilb_server_kstat_t template = {
250 { "bytes_processed", KSTAT_DATA_UINT64, 0 },
251 { "pkt_processed", KSTAT_DATA_UINT64, 0 },
252 { "ip_address", KSTAT_DATA_STRING, 0 }
253 };
254 char cname_buf[KSTAT_STRLEN];
255
256 	/* 7 is the length of "-sstat" plus the terminating NUL */
257 ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
258 (void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
259 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
260 server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
261 NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
262 if (ksp == NULL)
263 return (NULL);
264
265 bcopy(&template, &server->iser_kstat, sizeof (template));
266 ksp->ks_data = &server->iser_kstat;
267 ksp->ks_private = (void *)(uintptr_t)stackid;
268
269 kstat_named_setstr(&server->iser_kstat.ip_address,
270 server->iser_ip_addr);
271 /* We never change the IP address */
272 ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;
273
274 kstat_install(ksp);
275 return (ksp);
276 }
277
278 /* Initialize the rule hash table. */
279 static void
280 ilb_rule_hash_init(ilb_stack_t *ilbs)
281 {
282 int i;
283
284 /*
285 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
286 * the next power of 2.
287 */
288 if (!ISP2(ilbs->ilbs_rule_hash_size)) {
289 for (i = 0; i < 31; i++) {
290 if (ilbs->ilbs_rule_hash_size < (1 << i))
291 break;
292 }
293 ilbs->ilbs_rule_hash_size = 1 << i;
294 }
295 ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
296 ilbs->ilbs_rule_hash_size, KM_SLEEP);
297 for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
298 mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
299 MUTEX_DEFAULT, NULL);
300 }
301 }
302
303 /* Clean up the rule hash table. */
304 static void
305 ilb_rule_hash_fini(ilb_stack_t *ilbs)
306 {
307 if (ilbs->ilbs_g_hash == NULL)
308 return;
309 kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
310 ilbs->ilbs_rule_hash_size);
311 }
312
313 /* Add a rule to the rule hash table. */
314 static void
315 ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
316 {
317 int i;
318
319 i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
320 ilbs->ilbs_rule_hash_size);
321 DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
322 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
323 rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
324 if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
325 ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
326 rule->ir_hash_prev = NULL;
327 ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;
328
329 rule->ir_hash = &ilbs->ilbs_g_hash[i];
330 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
331 }
332
333 /*
334 * Remove a rule from the rule hash table. Note that the rule is not freed
335 * in this routine.
336 */
337 static void
338 ilb_rule_hash_del(ilb_rule_t *rule)
339 {
340 mutex_enter(&rule->ir_hash->ilb_hash_lock);
341 if (rule->ir_hash->ilb_hash_rule == rule) {
342 rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
343 if (rule->ir_hash_next != NULL)
344 rule->ir_hash_next->ir_hash_prev = NULL;
345 } else {
346 if (rule->ir_hash_prev != NULL)
347 rule->ir_hash_prev->ir_hash_next =
348 rule->ir_hash_next;
349 if (rule->ir_hash_next != NULL) {
350 rule->ir_hash_next->ir_hash_prev =
351 rule->ir_hash_prev;
352 }
353 }
354 mutex_exit(&rule->ir_hash->ilb_hash_lock);
355
356 rule->ir_hash_next = NULL;
357 rule->ir_hash_prev = NULL;
358 rule->ir_hash = NULL;
359 }
360
361 /*
362 * Given the info of a packet, look for a match in the rule hash table.
363 */
364 static ilb_rule_t *
365 ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
366 in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
367 {
368 int i;
369 ilb_rule_t *rule;
370 ipaddr_t v4_addr;
371
372 *busy = B_FALSE;
373 IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
374 i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
375 port = ntohs(port);
376
377 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
378 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
379 rule = rule->ir_hash_next) {
380 if (!rule->ir_port_range) {
381 if (rule->ir_min_port != port)
382 continue;
383 } else {
384 if (port < rule->ir_min_port ||
385 port > rule->ir_max_port) {
386 continue;
387 }
388 }
389 if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
390 rule->ir_zoneid != zoneid) {
391 continue;
392 }
393
394 if (l3 == IPPROTO_IP) {
395 if (rule->ir_target_v4 != INADDR_ANY &&
396 rule->ir_target_v4 != v4_addr) {
397 continue;
398 }
399 } else {
400 if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
401 !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
402 continue;
403 }
404 }
405
406 /*
407 * Just update the stats if the rule is disabled.
408 */
409 mutex_enter(&rule->ir_lock);
410 if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
411 ILB_R_KSTAT(rule, pkt_not_processed);
412 ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
413 mutex_exit(&rule->ir_lock);
414 rule = NULL;
415 break;
416 } else if (rule->ir_flags & ILB_RULE_BUSY) {
417 /*
418 * If we are busy...
419 *
420 * XXX we should have a queue to postpone the
421 * packet processing. But this requires a
422 * mechanism in IP to re-start the packet
423 * processing. So for now, just drop the packet.
424 */
425 ILB_R_KSTAT(rule, pkt_dropped);
426 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
427 mutex_exit(&rule->ir_lock);
428 *busy = B_TRUE;
429 rule = NULL;
430 break;
431 } else {
432 rule->ir_refcnt++;
433 ASSERT(rule->ir_refcnt != 1);
434 mutex_exit(&rule->ir_lock);
435 break;
436 }
437 }
438 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
439 return (rule);
440 }
441
442 /*
443 * Add a rule to the global rule list. This list is for finding all rules
444 * in an IP stack. The caller is assumed to hold the ilbs_g_lock.
445 */
446 static void
447 ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
448 {
449 ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
450 rule->ir_next = ilbs->ilbs_rule_head;
451 ilbs->ilbs_rule_head = rule;
452 ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
453 }
454
455 /* The caller is assumed to hold the ilbs_g_lock. */
456 static void
457 ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
458 {
459 ilb_rule_t *tmp_rule;
460 ilb_rule_t *prev_rule;
461
462 ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
463 prev_rule = NULL;
464 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
465 prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
466 if (tmp_rule == rule)
467 break;
468 }
469 if (tmp_rule == NULL) {
470 mutex_exit(&ilbs->ilbs_g_lock);
471 return;
472 }
473 if (prev_rule == NULL)
474 ilbs->ilbs_rule_head = tmp_rule->ir_next;
475 else
476 prev_rule->ir_next = tmp_rule->ir_next;
477 ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
478 }
479
480 /*
481 * Helper routine to calculate how many source addresses are in a given
482 * range.
483 */
484 static int64_t
485 num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
486 {
487 int64_t ret;
488 uint32_t addr1, addr2;
489
490 /*
491 	 * Here we assume that the number of NAT source addresses cannot be
492 	 * so large that the two most significant s6_addr32 words of the
493 	 * range could differ; they must be equal.
494 */
495 addr1 = ntohl(a1->s6_addr32[3]);
496 addr2 = ntohl(a2->s6_addr32[3]);
497 if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
498 a1->s6_addr32[1] != a2->s6_addr32[1] ||
499 a1->s6_addr32[2] > a2->s6_addr32[2] ||
500 (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
501 return (-1);
502 }
503 if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
504 return (addr2 - addr1 + 1);
505 } else {
506 ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
507 ret <<= 32;
508 		ret = ret + addr2 - addr1;
509 return (ret + 1);
510 }
511 }
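/*
 * For example, if a1 and a2 differ only in their last s6_addr32 word, say
 * 2001:db8::10 and 2001:db8::14, num_nat_src_v6() returns 5.  If a1 is
 * "greater" than a2, it returns -1, which the caller treats as an invalid
 * range.
 */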
512
513 /*
514 * Add an ILB rule.
515 */
516 int
517 ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
518 {
519 ilb_rule_t *rule;
520 netstackid_t stackid;
521 int ret;
522 in_port_t min_port, max_port;
523 int64_t num_src;
524
525 /* Sanity checks. */
526 if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
527 return (EINVAL);
528
529 /* Need to support SCTP... */
530 if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
531 return (EINVAL);
532
533 /* For full NAT, the NAT source must be supplied. */
534 if (cmd->topo == ILB_TOPO_IMPL_NAT) {
535 if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
536 IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
537 return (EINVAL);
538 }
539 }
540
541 /* Check invalid mask */
542 if ((cmd->flags & ILB_RULE_STICKY) &&
543 IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
544 return (EINVAL);
545 }
546
547 /* Port is passed in network byte order. */
548 min_port = ntohs(cmd->min_port);
549 max_port = ntohs(cmd->max_port);
550 if (min_port > max_port)
551 return (EINVAL);
552
553 /* min_port == 0 means "all ports". Make it so */
554 if (min_port == 0) {
555 min_port = 1;
556 max_port = 65535;
557 }
558
559 	/* Reject funny addresses: loopback, multicast, broadcast, unspecified. */
560 if (cmd->ip_ver == IPPROTO_IP) {
561 in_addr_t v4_addr1, v4_addr2;
562
563 v4_addr1 = cmd->vip.s6_addr32[3];
564 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
565 CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
566 v4_addr1 == INADDR_ANY ||
567 !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
568 return (EINVAL);
569 }
570
571 if (cmd->topo == ILB_TOPO_IMPL_NAT) {
572 v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
573 v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
574 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
575 (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
576 v4_addr1 == INADDR_BROADCAST ||
577 v4_addr2 == INADDR_BROADCAST ||
578 v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
579 CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
580 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
581 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
582 return (EINVAL);
583 }
584
585 num_src = v4_addr2 - v4_addr1 + 1;
586 if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
587 return (EINVAL);
588 }
589 } else {
590 if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
591 IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
592 IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
593 IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
594 return (EINVAL);
595 }
596
597 if (cmd->topo == ILB_TOPO_IMPL_NAT) {
598 if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
599 IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
600 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
601 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
602 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
603 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
604 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
605 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
606 return (EINVAL);
607 }
608
609 if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
610 &cmd->nat_src_end)) < 0 ||
611 num_src > ILB_MAX_NAT_SRC) {
612 return (EINVAL);
613 }
614 }
615 }
616
617 mutex_enter(&ilbs->ilbs_g_lock);
618 if (ilbs->ilbs_g_hash == NULL)
619 ilb_rule_hash_init(ilbs);
620 if (ilbs->ilbs_c2s_conn_hash == NULL) {
621 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
622 ilb_conn_hash_init(ilbs);
623 ilb_nat_src_init(ilbs);
624 }
625
626 /* Make sure that the new rule does not duplicate an existing one. */
627 if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
628 min_port, max_port, &cmd->vip)) {
629 mutex_exit(&ilbs->ilbs_g_lock);
630 return (EEXIST);
631 }
632
633 rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
634 if (rule == NULL) {
635 mutex_exit(&ilbs->ilbs_g_lock);
636 return (ENOMEM);
637 }
638
639 /* ir_name is all 0 to begin with */
640 (void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);
641
642 rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance);
643 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
644 if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
645 ret = ENOMEM;
646 goto error;
647 }
648
649 if (cmd->topo == ILB_TOPO_IMPL_NAT) {
650 rule->ir_nat_src_start = cmd->nat_src_start;
651 rule->ir_nat_src_end = cmd->nat_src_end;
652 }
653
654 rule->ir_ipver = cmd->ip_ver;
655 rule->ir_proto = cmd->proto;
656 rule->ir_topo = cmd->topo;
657
658 rule->ir_min_port = min_port;
659 rule->ir_max_port = max_port;
660 if (rule->ir_min_port != rule->ir_max_port)
661 rule->ir_port_range = B_TRUE;
662 else
663 rule->ir_port_range = B_FALSE;
664
665 rule->ir_zoneid = zoneid;
666
667 rule->ir_target_v6 = cmd->vip;
668 rule->ir_servers = NULL;
669
670 /*
671 * The default connection drain timeout is indefinite (value 0),
672 * meaning we will wait for all connections to finish. So we
673 * can assign cmd->conn_drain_timeout to it directly.
674 */
675 rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
676 if (cmd->nat_expiry != 0) {
677 rule->ir_nat_expiry = cmd->nat_expiry;
678 } else {
679 switch (rule->ir_proto) {
680 case IPPROTO_TCP:
681 rule->ir_nat_expiry = ilb_conn_tcp_expiry;
682 break;
683 case IPPROTO_UDP:
684 rule->ir_nat_expiry = ilb_conn_udp_expiry;
685 break;
686 default:
687 cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
688 (void *)rule);
689 break;
690 }
691 }
692 if (cmd->sticky_expiry != 0)
693 rule->ir_sticky_expiry = cmd->sticky_expiry;
694 else
695 rule->ir_sticky_expiry = ilb_sticky_expiry;
696
697 if (cmd->flags & ILB_RULE_STICKY) {
698 rule->ir_flags |= ILB_RULE_STICKY;
699 rule->ir_sticky_mask = cmd->sticky_mask;
700 if (ilbs->ilbs_sticky_hash == NULL)
701 ilb_sticky_hash_init(ilbs);
702 }
703 if (cmd->flags & ILB_RULE_ENABLED)
704 rule->ir_flags |= ILB_RULE_ENABLED;
705
706 mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
707 cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);
708
709 rule->ir_refcnt = 1;
710
711 switch (cmd->algo) {
712 case ILB_ALG_IMPL_ROUNDROBIN:
713 if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
714 ret = ENOMEM;
715 goto error;
716 }
717 rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
718 break;
719 case ILB_ALG_IMPL_HASH_IP:
720 case ILB_ALG_IMPL_HASH_IP_SPORT:
721 case ILB_ALG_IMPL_HASH_IP_VIP:
722 if ((rule->ir_alg = ilb_alg_hash_init(rule,
723 &cmd->algo)) == NULL) {
724 ret = ENOMEM;
725 goto error;
726 }
727 rule->ir_alg_type = cmd->algo;
728 break;
729 default:
730 ret = EINVAL;
731 goto error;
732 }
733
734 /* Add it to the global list and hash array at the end. */
735 ilb_rule_g_add(ilbs, rule);
736 ilb_rule_hash_add(ilbs, rule, &cmd->vip);
737
738 mutex_exit(&ilbs->ilbs_g_lock);
739
740 return (0);
741
742 error:
743 mutex_exit(&ilbs->ilbs_g_lock);
744 if (rule->ir_ksp != NULL) {
745 /* stackid must be initialized if ir_ksp != NULL */
746 kstat_delete_netstack(rule->ir_ksp, stackid);
747 }
748 kmem_free(rule, sizeof (ilb_rule_t));
749 return (ret);
750 }
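/*
 * A minimal sketch of how a caller might fill in an ilb_rule_cmd_t for
 * ilb_rule_add().  Only the fields referenced above are shown; the command
 * normally arrives from user land, and fields left zero take the defaults
 * described above (indefinite conn drain, default NAT/sticky expiry).
 *
 *	ilb_rule_cmd_t cmd;
 *
 *	bzero(&cmd, sizeof (cmd));
 *	(void) strlcpy(cmd.name, "web_rule", ILB_RULE_NAMESZ);
 *	cmd.ip_ver = IPPROTO_IP;				(an IPv4 rule)
 *	cmd.proto = IPPROTO_TCP;
 *	cmd.topo = ILB_TOPO_IMPL_DSR;
 *	IN6_IPADDR_TO_V4MAPPED(htonl(0x0a000001), &cmd.vip);	(VIP 10.0.0.1)
 *	cmd.min_port = htons(80);				(network byte order)
 *	cmd.max_port = htons(80);
 *	cmd.algo = ILB_ALG_IMPL_ROUNDROBIN;
 *	cmd.flags = ILB_RULE_ENABLED;
 *
 *	(void) ilb_rule_add(ilbs, zoneid, &cmd);
 */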
751
752 /*
753  * The final part in deleting a rule. Either called directly or from a
754  * task dispatched to the rule taskq.
755 */
756 static void
757 ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
758 {
759 netstackid_t stackid;
760 ilb_server_t *server;
761
762 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
763
764 /*
765 * Let the algorithm know that the rule is going away. The
766 	 * algorithm fini routine will free all the resources it has
767 	 * associated with this rule.
768 */
769 tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);
770
771 while ((server = tmp_rule->ir_servers) != NULL) {
772 mutex_enter(&server->iser_lock);
773 ilb_destroy_nat_src(&server->iser_nat_src);
774 if (tmp_rule->ir_conn_drain_timeout != 0) {
775 /*
776 * The garbage collection thread checks this value
777 		 * without grabbing a lock. So we need to use
778 * atomic_swap_64() to make sure that the value seen
779 * by gc thread is intact.
780 */
781 (void) atomic_swap_64(
782 (uint64_t *)&server->iser_die_time,
783 ddi_get_lbolt64() +
784 SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
785 }
786 while (server->iser_refcnt > 1)
787 cv_wait(&server->iser_cv, &server->iser_lock);
788 tmp_rule->ir_servers = server->iser_next;
789 kstat_delete_netstack(server->iser_ksp, stackid);
790 kmem_free(server, sizeof (ilb_server_t));
791 }
792
793 ASSERT(tmp_rule->ir_ksp != NULL);
794 kstat_delete_netstack(tmp_rule->ir_ksp, stackid);
795
796 kmem_free(tmp_rule, sizeof (ilb_rule_t));
797 }
798
799 /* The routine executed by the delayed rule taskq. */
800 static void
801 ilb_rule_del_tq(void *arg)
802 {
803 ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
804 ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;
805
806 mutex_enter(&rule->ir_lock);
807 while (rule->ir_refcnt > 1)
808 cv_wait(&rule->ir_cv, &rule->ir_lock);
809 ilb_rule_del_common(ilbs, rule);
810 kmem_free(arg, sizeof (ilb_rule_tq_t));
811 }
812
813 /* Routine to delete a rule. */
814 int
815 ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
816 {
817 ilb_rule_t *tmp_rule;
818 ilb_rule_tq_t *arg;
819 int err;
820
821 mutex_enter(&ilbs->ilbs_g_lock);
822 if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
823 &err)) == NULL) {
824 mutex_exit(&ilbs->ilbs_g_lock);
825 return (err);
826 }
827
828 /*
829 * First remove the rule from the hash array and the global list so
830 * that no one can find this rule any more.
831 */
832 ilb_rule_hash_del(tmp_rule);
833 ilb_rule_g_del(ilbs, tmp_rule);
834 mutex_exit(&ilbs->ilbs_g_lock);
835 ILB_RULE_REFRELE(tmp_rule);
836
837 /*
838 	 * Now that no one can find this rule, we can remove it once all
839 * references to it are dropped and all references to the list
840 * of servers are dropped. So dispatch a task to finish the deletion.
841 * We do this instead of letting the last one referencing the
842 * rule do it. The reason is that the last one may be the
843 * interrupt thread. We want to minimize the work it needs to
844 * do. Rule deletion is not a critical task so it can be delayed.
845 */
846 arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
847 arg->ilbs = ilbs;
848 arg->rule = tmp_rule;
849 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
850 TQ_SLEEP);
851
852 return (0);
853 }
854
855 /*
856 * Given an IP address, check to see if there is a rule using this
857 * as the VIP. It can be used to check if we need to drop a fragment.
858 */
859 boolean_t
860 ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
861 {
862 int i;
863 ilb_rule_t *rule;
864 boolean_t ret = B_FALSE;
865
866 i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
867 ilbs->ilbs_rule_hash_size);
868 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
869 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
870 rule = rule->ir_hash_next) {
871 if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
872 mutex_enter(&rule->ir_lock);
873 if (rule->ir_flags & ILB_RULE_BUSY) {
874 mutex_exit(&rule->ir_lock);
875 break;
876 }
877 if (ret_rule != NULL) {
878 rule->ir_refcnt++;
879 mutex_exit(&rule->ir_lock);
880 *ret_rule = rule;
881 } else {
882 mutex_exit(&rule->ir_lock);
883 }
884 ret = B_TRUE;
885 break;
886 }
887 }
888 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
889 return (ret);
890 }
891
892 boolean_t
893 ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
894 {
895 int i;
896 ilb_rule_t *rule;
897 boolean_t ret = B_FALSE;
898
899 i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
900 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
901 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
902 rule = rule->ir_hash_next) {
903 if (rule->ir_target_v6.s6_addr32[3] == addr) {
904 mutex_enter(&rule->ir_lock);
905 if (rule->ir_flags & ILB_RULE_BUSY) {
906 mutex_exit(&rule->ir_lock);
907 break;
908 }
909 if (ret_rule != NULL) {
910 rule->ir_refcnt++;
911 mutex_exit(&rule->ir_lock);
912 *ret_rule = rule;
913 } else {
914 mutex_exit(&rule->ir_lock);
915 }
916 ret = B_TRUE;
917 break;
918 }
919 }
920 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
921 return (ret);
922 }
923
924 static ilb_rule_t *
925 ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
926 int *err)
927 {
928 ilb_rule_t *tmp_rule;
929
930 ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
931
932 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
933 tmp_rule = tmp_rule->ir_next) {
934 if (tmp_rule->ir_zoneid != zoneid)
935 continue;
936 if (strcasecmp(tmp_rule->ir_name, name) == 0) {
937 mutex_enter(&tmp_rule->ir_lock);
938 if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
939 mutex_exit(&tmp_rule->ir_lock);
940 *err = EINPROGRESS;
941 return (NULL);
942 }
943 tmp_rule->ir_refcnt++;
944 mutex_exit(&tmp_rule->ir_lock);
945 *err = 0;
946 return (tmp_rule);
947 }
948 }
949 *err = ENOENT;
950 return (NULL);
951 }
952
953 /* To find a rule with a given name and zone in the global rule list. */
954 ilb_rule_t *
955 ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
956 int *err)
957 {
958 ilb_rule_t *tmp_rule;
959
960 mutex_enter(&ilbs->ilbs_g_lock);
961 tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
962 mutex_exit(&ilbs->ilbs_g_lock);
963 return (tmp_rule);
964 }
965
966 /* Check whether the given rule parameters and zone ID collide with an existing rule. */
967 static boolean_t
968 ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
969 int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
970 {
971 ilb_rule_t *tmp_rule;
972
973 ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
974
975 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
976 tmp_rule = tmp_rule->ir_next) {
977 if (tmp_rule->ir_zoneid != zoneid)
978 continue;
979
980 /*
981 * We don't allow the same name in different rules even if all
982 * the other rule components are different.
983 */
984 if (strcasecmp(tmp_rule->ir_name, name) == 0)
985 return (B_TRUE);
986
987 if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
988 continue;
989
990 /*
991 * ir_min_port and ir_max_port are the same if ir_port_range
992 * is false. In this case, if the ir_min|max_port (same) is
993 * outside of the given port range, it is OK. In other cases,
994 * check if min and max port are outside a rule's range.
995 */
996 if (tmp_rule->ir_max_port < min_port ||
997 tmp_rule->ir_min_port > max_port) {
998 continue;
999 }
1000
1001 /*
1002 		 * If l3 is IPv4, the addr passed in is assumed to be an
1003 		 * IPv4-mapped address.
1004 */
1005 if (V6_OR_V4_INADDR_ANY(*addr) ||
1006 V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
1007 IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
1008 return (B_TRUE);
1009 }
1010 }
1011 return (B_FALSE);
1012 }
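/*
 * For example, with an existing TCP rule on VIP 10.0.0.1, ports 80-89,
 * ilb_match_rule() returns B_TRUE (and ilb_rule_add() fails with EEXIST) for
 * a new TCP rule on the same VIP covering ports 85-100, since the port
 * ranges overlap; a new rule on ports 90-100 (with a different name) would
 * not match.
 */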
1013
1014 int
1015 ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
1016 const char *rule_name, ilb_rule_t *in_rule)
1017 {
1018 ilb_rule_t *rule;
1019 int err;
1020
1021 ASSERT((in_rule == NULL && rule_name != NULL) ||
1022 (in_rule != NULL && rule_name == NULL));
1023 if ((rule = in_rule) == NULL) {
1024 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1025 &err)) == NULL) {
1026 return (err);
1027 }
1028 }
1029 mutex_enter(&rule->ir_lock);
1030 rule->ir_flags |= ILB_RULE_ENABLED;
1031 mutex_exit(&rule->ir_lock);
1032
1033 /* Only refrele if the rule is passed in. */
1034 if (in_rule == NULL)
1035 ILB_RULE_REFRELE(rule);
1036 return (0);
1037 }
1038
1039 int
1040 ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
1041 const char *rule_name, ilb_rule_t *in_rule)
1042 {
1043 ilb_rule_t *rule;
1044 int err;
1045
1046 ASSERT((in_rule == NULL && rule_name != NULL) ||
1047 (in_rule != NULL && rule_name == NULL));
1048 if ((rule = in_rule) == NULL) {
1049 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1050 &err)) == NULL) {
1051 return (err);
1052 }
1053 }
1054 mutex_enter(&rule->ir_lock);
1055 rule->ir_flags &= ~ILB_RULE_ENABLED;
1056 mutex_exit(&rule->ir_lock);
1057
1058 /* Only refrele if the rule is passed in. */
1059 if (in_rule == NULL)
1060 ILB_RULE_REFRELE(rule);
1061 return (0);
1062 }
1063
1064 /*
1065 * XXX We should probably have a walker function to walk all rules. For
1066 * now, just add a simple loop for enable/disable/del.
1067 */
1068 void
1069 ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1070 {
1071 ilb_rule_t *rule;
1072
1073 mutex_enter(&ilbs->ilbs_g_lock);
1074 for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
1075 if (rule->ir_zoneid != zoneid)
1076 continue;
1077 /*
1078 * No need to hold the rule as we are holding the global
1079 * lock so it won't go away. Ignore the return value here
1080 * as the rule is provided so the call cannot fail.
1081 */
1082 (void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
1083 }
1084 mutex_exit(&ilbs->ilbs_g_lock);
1085 }
1086
1087 void
1088 ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1089 {
1090 ilb_rule_t *rule;
1091
1092 mutex_enter(&ilbs->ilbs_g_lock);
1093 for (rule = ilbs->ilbs_rule_head; rule != NULL;
1094 rule = rule->ir_next) {
1095 if (rule->ir_zoneid != zoneid)
1096 continue;
1097 (void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
1098 }
1099 mutex_exit(&ilbs->ilbs_g_lock);
1100 }
1101
1102 void
1103 ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1104 {
1105 ilb_rule_t *rule;
1106 ilb_rule_tq_t *arg;
1107
1108 mutex_enter(&ilbs->ilbs_g_lock);
1109 while ((rule = ilbs->ilbs_rule_head) != NULL) {
1110 if (rule->ir_zoneid != zoneid)
1111 continue;
1112 ilb_rule_hash_del(rule);
1113 ilb_rule_g_del(ilbs, rule);
1114 mutex_exit(&ilbs->ilbs_g_lock);
1115
1116 arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
1117 arg->ilbs = ilbs;
1118 arg->rule = rule;
1119 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
1120 arg, TQ_SLEEP);
1121
1122 mutex_enter(&ilbs->ilbs_g_lock);
1123 }
1124 mutex_exit(&ilbs->ilbs_g_lock);
1125 }
1126
1127 /*
1128 * This is just an optimization, so don't grab the global lock. The
1129  * worst case is that we miss a couple of packets.
1130 */
1131 boolean_t
1132 ilb_has_rules(ilb_stack_t *ilbs)
1133 {
1134 return (ilbs->ilbs_rule_head != NULL);
1135 }
1136
1138 static int
1139 ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1140 ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
1141 {
1142 ilb_server_t *tmp_server;
1143 int ret;
1144
1145 ASSERT((rule == NULL && rule_name != NULL) ||
1146 (rule != NULL && rule_name == NULL));
1147
1148 if (rule == NULL) {
1149 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1150 &ret)) == NULL) {
1151 return (ret);
1152 }
1153 }
1154
1155 /* Once we get a hold on the rule, no server can be added/deleted. */
1156 for (tmp_server = rule->ir_servers; tmp_server != NULL;
1157 tmp_server = tmp_server->iser_next) {
1158 if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
1159 break;
1160 }
1161 if (tmp_server == NULL) {
1162 ret = ENOENT;
1163 goto done;
1164 }
1165
1166 if (enable) {
1167 ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
1168 rule->ir_alg->ilb_alg_data);
1169 if (ret == 0) {
1170 tmp_server->iser_enabled = B_TRUE;
1171 tmp_server->iser_die_time = 0;
1172 }
1173 } else {
1174 ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
1175 rule->ir_alg->ilb_alg_data);
1176 if (ret == 0) {
1177 tmp_server->iser_enabled = B_FALSE;
1178 if (rule->ir_conn_drain_timeout != 0) {
1179 (void) atomic_swap_64(
1180 (uint64_t *)&tmp_server->iser_die_time,
1181 ddi_get_lbolt64() + SEC_TO_TICK(
1182 rule->ir_conn_drain_timeout));
1183 }
1184 }
1185 }
1186
1187 done:
1188 if (rule_name != NULL)
1189 ILB_RULE_REFRELE(rule);
1190 return (ret);
1191 }
1192 int
1193 ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1194 ilb_rule_t *rule, in6_addr_t *addr)
1195 {
1196 return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
1197 }
1198
1199 int
1200 ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1201 ilb_rule_t *rule, in6_addr_t *addr)
1202 {
1203 return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
1204 }
1205
1206 /*
1207 * Add a back end server to a rule. If the address is IPv4, it is assumed
1208 * to be passed in as a mapped address.
1209 */
1210 int
1211 ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
1212 {
1213 ilb_server_t *server;
1214 netstackid_t stackid;
1215 int ret = 0;
1216 in_port_t min_port, max_port;
1217 in_port_t range;
1218
1219 /* Port is passed in network byte order. */
1220 min_port = ntohs(info->min_port);
1221 max_port = ntohs(info->max_port);
1222 if (min_port > max_port)
1223 return (EINVAL);
1224
1225 /* min_port == 0 means "all ports". Make it so */
1226 if (min_port == 0) {
1227 min_port = 1;
1228 max_port = 65535;
1229 }
1230 range = max_port - min_port;
1231
1232 mutex_enter(&rule->ir_lock);
1233 	/* If someone is already doing server add/del, sleep and wait. */
1234 while (rule->ir_flags & ILB_RULE_BUSY) {
1235 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1236 mutex_exit(&rule->ir_lock);
1237 return (EINTR);
1238 }
1239 }
1240
1241 /*
1242 * Set the rule to be busy to make sure that no new packet can
1243 * use this rule.
1244 */
1245 rule->ir_flags |= ILB_RULE_BUSY;
1246
1247 /* Now wait for all other guys to finish their work. */
1248 while (rule->ir_refcnt > 2) {
1249 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1250 mutex_exit(&rule->ir_lock);
1251 ret = EINTR;
1252 goto end;
1253 }
1254 }
1255 mutex_exit(&rule->ir_lock);
1256
1257 /* Sanity checks... */
1258 if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1259 rule->ir_ipver != IPPROTO_IP) ||
1260 (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1261 rule->ir_ipver != IPPROTO_IPV6)) {
1262 ret = EINVAL;
1263 goto end;
1264 }
1265
1266 /*
1267 * Check for valid port range.
1268 *
1269 * For DSR, there can be no port shifting. Hence the server
1270 * specification must be the same as the rule's.
1271 *
1272 * For half-NAT/NAT, the range must either be 0 (port collapsing) or
1273 	 * it must span the same number of ports as the rule's range.
1274 	 */
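	/*
	 * For example, for a NAT rule covering ports 80-89, a server may be
	 * added with ports 8080-8089 (a range of the same size, so ports are
	 * shifted) or with the single port 8080 (a range of 0, all rule ports
	 * collapse onto it), but not with ports 8080-8085.
	 */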
1276 if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
1277 if (rule->ir_max_port != max_port ||
1278 rule->ir_min_port != min_port) {
1279 ret = EINVAL;
1280 goto end;
1281 }
1282 } else {
1283 if ((range != rule->ir_max_port - rule->ir_min_port) &&
1284 range != 0) {
1285 ret = EINVAL;
1286 goto end;
1287 }
1288 }
1289
1290 /* Check for duplicate. */
1291 for (server = rule->ir_servers; server != NULL;
1292 server = server->iser_next) {
1293 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
1294 strcasecmp(server->iser_name, info->name) == 0) {
1295 break;
1296 }
1297 }
1298 if (server != NULL) {
1299 ret = EEXIST;
1300 goto end;
1301 }
1302
1303 if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
1304 ret = ENOMEM;
1305 goto end;
1306 }
1307
1308 (void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
1309 (void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
1310 sizeof (server->iser_ip_addr));
1311 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
1312 server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
1313 if (server->iser_ksp == NULL) {
1314 kmem_free(server, sizeof (ilb_server_t));
1315 ret = EINVAL;
1316 goto end;
1317 }
1318
1319 server->iser_stackid = stackid;
1320 server->iser_addr_v6 = info->addr;
1321 server->iser_min_port = min_port;
1322 server->iser_max_port = max_port;
1323 if (min_port != max_port)
1324 server->iser_port_range = B_TRUE;
1325 else
1326 server->iser_port_range = B_FALSE;
1327
1328 /*
1329 * If the rule uses NAT, find/create the NAT source entry to use
1330 * for this server.
1331 */
1332 if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
1333 in_port_t port;
1334
1335 /*
1336 * If the server uses a port range, our port allocation
1337 * scheme needs to treat it as a wildcard. Refer to the
1338 * comments in ilb_nat.c about the scheme.
1339 */
1340 if (server->iser_port_range)
1341 port = 0;
1342 else
1343 port = server->iser_min_port;
1344
1345 if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
1346 &server->iser_addr_v6, port, &rule->ir_nat_src_start,
1347 num_nat_src_v6(&rule->ir_nat_src_start,
1348 &rule->ir_nat_src_end))) != 0) {
1349 kstat_delete_netstack(server->iser_ksp, stackid);
1350 kmem_free(server, sizeof (ilb_server_t));
1351 goto end;
1352 }
1353 }
1354
1355 /*
1356 * The iser_lock is only used to protect iser_refcnt. All the other
1357 * fields in ilb_server_t should not change, except for iser_enabled.
1358 * The worst thing that can happen if iser_enabled is messed up is
1359 * that one or two packets may not be load balanced to a server
1360 * correctly.
1361 */
1362 server->iser_refcnt = 1;
1363 server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
1364 B_FALSE;
1365 mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
1366 cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);
1367
1368 /* Let the load balancing algorithm know about the addition. */
1369 ASSERT(rule->ir_alg != NULL);
1370 if ((ret = rule->ir_alg->ilb_alg_server_add(server,
1371 rule->ir_alg->ilb_alg_data)) != 0) {
1372 kstat_delete_netstack(server->iser_ksp, stackid);
1373 kmem_free(server, sizeof (ilb_server_t));
1374 goto end;
1375 }
1376
1377 /*
1378 * No need to hold ir_lock since no other thread should manipulate
1379 * the following fields until ILB_RULE_BUSY is cleared.
1380 */
1381 if (rule->ir_servers == NULL) {
1382 server->iser_next = NULL;
1383 } else {
1384 server->iser_next = rule->ir_servers;
1385 }
1386 rule->ir_servers = server;
1387 ILB_R_KSTAT(rule, num_servers);
1388
1389 end:
1390 mutex_enter(&rule->ir_lock);
1391 rule->ir_flags &= ~ILB_RULE_BUSY;
1392 cv_signal(&rule->ir_cv);
1393 mutex_exit(&rule->ir_lock);
1394 return (ret);
1395 }
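/*
 * A minimal sketch of adding a back end server to an IPv4 rule.  Only the
 * ilb_server_info_t fields referenced in ilb_server_add() are shown, and the
 * address is given as a v4-mapped address as required:
 *
 *	ilb_server_info_t info;
 *
 *	bzero(&info, sizeof (info));
 *	(void) strlcpy(info.name, "web_srv1", ILB_SERVER_NAMESZ);
 *	IN6_IPADDR_TO_V4MAPPED(htonl(0x0a000065), &info.addr);	(10.0.0.101)
 *	info.min_port = htons(80);
 *	info.max_port = htons(80);
 *	info.flags = ILB_SERVER_ENABLED;
 *
 *	(void) ilb_server_add(ilbs, rule, &info);
 */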
1396
1397 /* The routine executed by the delayed rule processing taskq. */
1398 static void
1399 ilb_server_del_tq(void *arg)
1400 {
1401 ilb_server_t *server = (ilb_server_t *)arg;
1402
1403 mutex_enter(&server->iser_lock);
1404 while (server->iser_refcnt > 1)
1405 cv_wait(&server->iser_cv, &server->iser_lock);
1406 kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1407 kmem_free(server, sizeof (ilb_server_t));
1408 }
1409
1410 /*
1411 * Delete a back end server from a rule. If the address is IPv4, it is assumed
1412 * to be passed in as a mapped address.
1413 */
1414 int
1415 ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1416 ilb_rule_t *rule, in6_addr_t *addr)
1417 {
1418 ilb_server_t *server;
1419 ilb_server_t *prev_server;
1420 int ret = 0;
1421
1422 ASSERT((rule == NULL && rule_name != NULL) ||
1423 (rule != NULL && rule_name == NULL));
1424 if (rule == NULL) {
1425 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1426 &ret)) == NULL) {
1427 return (ret);
1428 }
1429 }
1430
1431 mutex_enter(&rule->ir_lock);
1432 	/* If someone is already doing server add/del, sleep and wait. */
1433 while (rule->ir_flags & ILB_RULE_BUSY) {
1434 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1435 if (rule_name != NULL) {
1436 if (--rule->ir_refcnt <= 2)
1437 cv_signal(&rule->ir_cv);
1438 }
1439 mutex_exit(&rule->ir_lock);
1440 return (EINTR);
1441 }
1442 }
1443 /*
1444 * Set the rule to be busy to make sure that no new packet can
1445 * use this rule.
1446 */
1447 rule->ir_flags |= ILB_RULE_BUSY;
1448
1449 /* Now wait for all other guys to finish their work. */
1450 while (rule->ir_refcnt > 2) {
1451 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1452 mutex_exit(&rule->ir_lock);
1453 ret = EINTR;
1454 goto end;
1455 }
1456 }
1457 mutex_exit(&rule->ir_lock);
1458
1459 prev_server = NULL;
1460 for (server = rule->ir_servers; server != NULL;
1461 prev_server = server, server = server->iser_next) {
1462 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
1463 break;
1464 }
1465 if (server == NULL) {
1466 ret = ENOENT;
1467 goto end;
1468 }
1469
1470 /*
1471 * Let the load balancing algorithm know about the removal.
1472 * The algorithm may disallow the removal...
1473 */
1474 if ((ret = rule->ir_alg->ilb_alg_server_del(server,
1475 rule->ir_alg->ilb_alg_data)) != 0) {
1476 goto end;
1477 }
1478
1479 if (prev_server == NULL)
1480 rule->ir_servers = server->iser_next;
1481 else
1482 prev_server->iser_next = server->iser_next;
1483
1484 ILB_R_KSTAT_UPDATE(rule, num_servers, -1);
1485
1486 /*
1487 * Mark the server as disabled so that if there is any sticky cache
1488 * using this server around, it won't be used.
1489 */
1490 server->iser_enabled = B_FALSE;
1491
1492 mutex_enter(&server->iser_lock);
1493
1494 /*
1495 	 * De-allocate the NAT source array. The individual ilb_nat_src_entry_t
1496 * may not go away if there is still a conn using it. The NAT source
1497 * timer will do the garbage collection.
1498 */
1499 ilb_destroy_nat_src(&server->iser_nat_src);
1500
1501 /* If there is a hard limit on when a server should die, set it. */
1502 if (rule->ir_conn_drain_timeout != 0) {
1503 (void) atomic_swap_64((uint64_t *)&server->iser_die_time,
1504 ddi_get_lbolt64() +
1505 SEC_TO_TICK(rule->ir_conn_drain_timeout));
1506 }
1507
1508 if (server->iser_refcnt > 1) {
1509 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
1510 server, TQ_SLEEP);
1511 mutex_exit(&server->iser_lock);
1512 } else {
1513 kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1514 kmem_free(server, sizeof (ilb_server_t));
1515 }
1516
1517 end:
1518 mutex_enter(&rule->ir_lock);
1519 rule->ir_flags &= ~ILB_RULE_BUSY;
1520 if (rule_name != NULL)
1521 rule->ir_refcnt--;
1522 cv_signal(&rule->ir_cv);
1523 mutex_exit(&rule->ir_lock);
1524 return (ret);
1525 }
1526
1527 /*
1528 * First check if the destination of the ICMP message matches a VIP of
1529 * a rule. If it does not, just return ILB_PASSED.
1530 *
1531 * If the destination matches a VIP:
1532 *
1533 * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
1534 * server.
1535 *
1536  * For ICMP_DEST_UNREACHABLE (fragmentation needed), look inside the payload
1537  * to find which back end server this message should be sent to, and NAT
1538  * both the embedded payload message and the outer IP packet.
1539 *
1540 * For other ICMP messages, drop them.
1541 */
1542 /* ARGSUSED */
1543 static int
1544 ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1545 icmph_t *icmph, ipaddr_t *lb_dst)
1546 {
1547 ipaddr_t vip;
1548 ilb_rule_t *rule;
1549 in6_addr_t addr6;
1550
1551 if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
1552 return (ILB_PASSED);
1553
1555 if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
1556 ILB_R_KSTAT(rule, icmp_dropped);
1557 ILB_RULE_REFRELE(rule);
1558 return (ILB_DROPPED);
1559 }
1560
1561 switch (icmph->icmph_type) {
1562 case ICMP_ECHO_REQUEST:
1563 ILB_R_KSTAT(rule, icmp_echo_processed);
1564 ILB_RULE_REFRELE(rule);
1565
1566 icmph->icmph_type = ICMP_ECHO_REPLY;
1567 icmph->icmph_checksum = 0;
1568 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1569 ipha->ipha_ttl =
1570 ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
1571 *lb_dst = ipha->ipha_src;
1572 vip = ipha->ipha_dst;
1573 ipha->ipha_dst = ipha->ipha_src;
1574 ipha->ipha_src = vip;
1575 return (ILB_BALANCED);
1576 case ICMP_DEST_UNREACHABLE: {
1577 int ret;
1578
1579 if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
1580 ILB_R_KSTAT(rule, icmp_dropped);
1581 ILB_RULE_REFRELE(rule);
1582 return (ILB_DROPPED);
1583 }
1584 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
1585 &addr6)) {
1586 ILB_R_KSTAT(rule, icmp_2big_processed);
1587 ret = ILB_BALANCED;
1588 } else {
1589 ILB_R_KSTAT(rule, icmp_2big_dropped);
1590 ret = ILB_DROPPED;
1591 }
1592 ILB_RULE_REFRELE(rule);
1593 IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
1594 return (ret);
1595 }
1596 default:
1597 ILB_R_KSTAT(rule, icmp_dropped);
1598 ILB_RULE_REFRELE(rule);
1599 return (ILB_DROPPED);
1600 }
1601 }
1602
1603 /* ARGSUSED */
1604 static int
1605 ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
1606 icmp6_t *icmp6, in6_addr_t *lb_dst)
1607 {
1608 ilb_rule_t *rule;
1609
1610 if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
1611 return (ILB_PASSED);
1612
1613 if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
1614 ILB_R_KSTAT(rule, icmp_dropped);
1615 ILB_RULE_REFRELE(rule);
1616 return (ILB_DROPPED);
1617 }
1618
1619 switch (icmp6->icmp6_type) {
1620 case ICMP6_ECHO_REQUEST: {
1621 int hdr_len;
1622
1623 ILB_R_KSTAT(rule, icmp_echo_processed);
1624 ILB_RULE_REFRELE(rule);
1625
1626 icmp6->icmp6_type = ICMP6_ECHO_REPLY;
1627 icmp6->icmp6_cksum = ip6h->ip6_plen;
1628 hdr_len = (char *)icmp6 - (char *)ip6h;
1629 icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
1630 ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
1631 ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
1632 ip6h->ip6_hops =
1633 ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
1634 *lb_dst = ip6h->ip6_src;
1635 ip6h->ip6_src = ip6h->ip6_dst;
1636 ip6h->ip6_dst = *lb_dst;
1637 return (ILB_BALANCED);
1638 }
1639 case ICMP6_PACKET_TOO_BIG: {
1640 int ret;
1641
1642 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
1643 lb_dst)) {
1644 ILB_R_KSTAT(rule, icmp_2big_processed);
1645 ret = ILB_BALANCED;
1646 } else {
1647 ILB_R_KSTAT(rule, icmp_2big_dropped);
1648 ret = ILB_DROPPED;
1649 }
1650 ILB_RULE_REFRELE(rule);
1651 return (ret);
1652 }
1653 default:
1654 ILB_R_KSTAT(rule, icmp_dropped);
1655 ILB_RULE_REFRELE(rule);
1656 return (ILB_DROPPED);
1657 }
1658 }
1659
1660 /*
1661 * Common routine to check an incoming packet and decide what to do with it.
1662  * Called by ilb_check_v4|v6().
1663 */
1664 static int
1665 ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
1666 in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
1667 in6_addr_t *lb_dst)
1668 {
1669 in_port_t sport, dport;
1670 tcpha_t *tcph;
1671 udpha_t *udph;
1672 ilb_rule_t *rule;
1673 ilb_server_t *server;
1674 boolean_t balanced;
1675 struct ilb_sticky_s *s = NULL;
1676 int ret;
1677 uint32_t ip_sum, tp_sum;
1678 ilb_nat_info_t info;
1679 uint16_t nat_src_idx;
1680 boolean_t busy;
1681
1682 ret = 0;
1683
1684 /*
1685 	 * We don't really need to switch here since both protocols'
1686 * ports are at the same offset. Just prepare for future protocol
1687 * specific processing.
1688 */
1689 switch (l4) {
1690 case IPPROTO_TCP:
1691 if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
1692 return (ILB_DROPPED);
1693 tcph = (tcpha_t *)tph;
1694 sport = tcph->tha_lport;
1695 dport = tcph->tha_fport;
1696 break;
1697 case IPPROTO_UDP:
1698 if (tph + sizeof (udpha_t) > mp->b_wptr)
1699 return (ILB_DROPPED);
1700 udph = (udpha_t *)tph;
1701 sport = udph->uha_src_port;
1702 dport = udph->uha_dst_port;
1703 break;
1704 default:
1705 return (ILB_PASSED);
1706 }
1707
1708 /* Fast path, there is an existing conn. */
1709 if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
1710 pkt_len, lb_dst)) {
1711 return (ILB_BALANCED);
1712 }
1713
1714 /*
1715 * If there is no existing connection for the incoming packet, check
1716 * to see if the packet matches a rule. If not, just let IP decide
1717 * what to do with it.
1718 *
1719 	 * Note: a reply from a back end server should not match a rule. A
1720 * reply should match one existing conn.
1721 */
1722 rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
1723 pkt_len, &busy);
1724 if (rule == NULL) {
1725 /* If the rule is busy, just drop the packet. */
1726 if (busy)
1727 return (ILB_DROPPED);
1728 else
1729 return (ILB_PASSED);
1730 }
1731
1732 /*
1733 	 * The packet matches a rule, so use the rule's load balancing
1734 	 * algorithm to find a server.
1735 */
1736 balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
1737 rule->ir_alg->ilb_alg_data, &server);
1738 /*
1739 * This can only happen if there is no server in a rule or all
1740 * the servers are currently disabled.
1741 */
1742 if (!balanced)
1743 goto no_server;
1744
1745 /*
1746 * If the rule is sticky enabled, we need to check the sticky table.
1747 * If there is a sticky entry for the client, use the previous server
1748 * instead of the one found above (note that both can be the same).
1749 * If there is no entry for that client, add an entry to the sticky
1750 * table. Both the find and add are done in ilb_sticky_find_add()
1751 	 * to avoid checking for duplicates when adding an entry.
1752 */
1753 if (rule->ir_flags & ILB_RULE_STICKY) {
1754 in6_addr_t addr;
1755
1756 V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
1757 if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
1758 &s, &nat_src_idx)) == NULL) {
1759 ILB_R_KSTAT(rule, nomem_pkt_dropped);
1760 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1761 goto no_server;
1762 }
1763 }
1764
1765 /*
1766 * We are holding a reference on the rule, so the server
1767 * cannot go away.
1768 */
1769 *lb_dst = server->iser_addr_v6;
1770 ILB_S_KSTAT(server, pkt_processed);
1771 ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);
1772
1773 switch (rule->ir_topo) {
1774 case ILB_TOPO_IMPL_NAT: {
1775 ilb_nat_src_entry_t *src_ent;
1776 uint16_t *src_idx;
1777
1778 /*
1779 * We create a cache even if it is not a SYN segment.
1780 * The server should return a RST. When we see the
1781 * RST, we will destroy this cache. But by having
1782 * a cache, we know how to NAT the returned RST.
1783 */
1784 info.vip = *dst;
1785 info.dport = dport;
1786 info.src = *src;
1787 info.sport = sport;
1788
1789 /* If stickiness is enabled, use the same source address */
1790 if (s != NULL)
1791 src_idx = &nat_src_idx;
1792 else
1793 src_idx = NULL;
1794
1795 if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
1796 &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
1797 if (s != NULL)
1798 ilb_sticky_refrele(s);
1799 ILB_R_KSTAT(rule, pkt_dropped);
1800 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1801 ILB_R_KSTAT(rule, noport_pkt_dropped);
1802 ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
1803 ret = ILB_DROPPED;
1804 break;
1805 }
1806 info.src_ent = src_ent;
1807 info.nat_dst = server->iser_addr_v6;
1808 if (rule->ir_port_range && server->iser_port_range) {
1809 info.nat_dport = htons(ntohs(dport) -
1810 rule->ir_min_port + server->iser_min_port);
1811 } else {
1812 info.nat_dport = htons(server->iser_min_port);
1813 }
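/*
 * Worked example (port numbers are assumptions): if the rule covers ports
 * 5000-5009 and the server advertises 6000-6009, a packet to dport 5003 is
 * mapped to
 *
 *	nat_dport = htons(5003 - 5000 + 6000) = htons(6003)
 *
 * whereas if either side does not define a port range, every destination
 * port collapses to the server's single iser_min_port.
 */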
1814
1815 /*
1816 * If ilb_conn_add() fails, it will release the reference on
1817 * sticky info and de-allocate the NAT source port allocated
1818 * above.
1819 */
1820 if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1821 dport, &info, &ip_sum, &tp_sum, s) != 0) {
1822 ILB_R_KSTAT(rule, pkt_dropped);
1823 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1824 ILB_R_KSTAT(rule, nomem_pkt_dropped);
1825 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1826 ret = ILB_DROPPED;
1827 break;
1828 }
1829 ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1830 ret = ILB_BALANCED;
1831 break;
1832 }
1833 case ILB_TOPO_IMPL_HALF_NAT:
1834 info.vip = *dst;
1835 info.nat_dst = server->iser_addr_v6;
1836 info.dport = dport;
1837 if (rule->ir_port_range && server->iser_port_range) {
1838 info.nat_dport = htons(ntohs(dport) -
1839 rule->ir_min_port + server->iser_min_port);
1840 } else {
1841 info.nat_dport = htons(server->iser_min_port);
1842 }
1843
1844 if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1845 dport, &info, &ip_sum, &tp_sum, s) != 0) {
1846 ILB_R_KSTAT(rule, pkt_dropped);
1847 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1848 ILB_R_KSTAT(rule, nomem_pkt_dropped);
1849 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1850 ret = ILB_DROPPED;
1851 break;
1852 }
1853 ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1854
1855 ret = ILB_BALANCED;
1856 break;
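/*
 * Sketch of the difference between the two NAT topologies as set up above
 * (the actual header rewrite is done by ilb_full_nat() and ilb_half_nat(),
 * which are not shown in this excerpt):
 *
 *	full NAT:  info.nat_src/nat_sport and info.nat_dst/nat_dport are
 *	           filled in, so both the source and the destination of the
 *	           forwarded packet are rewritten; the new source comes from
 *	           the server's NAT source pool via ilb_alloc_nat_addr().
 *	half NAT:  only info.nat_dst/nat_dport are filled in, so just the
 *	           destination is rewritten and the client's own address is
 *	           what the back end server sees as the source.
 */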
1857 case ILB_TOPO_IMPL_DSR:
1858 /*
1859 		 * By decrementing the sticky refcnt, the period of
1860 		 * stickiness (the lifetime of the ilb_sticky_t) will run
1861 		 * from now to (now + default expiry time).
1862 */
1863 if (s != NULL)
1864 ilb_sticky_refrele(s);
1865 ret = ILB_BALANCED;
1866 break;
1867 default:
1868 cmn_err(CE_PANIC, "data corruption unknown topology: %p",
1869 (void *) rule);
1870 break;
1871 }
1872 ILB_RULE_REFRELE(rule);
1873 return (ret);
1874
1875 no_server:
1876 /* This can only happen if there is no server available. */
1877 ILB_R_KSTAT(rule, pkt_dropped);
1878 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1879 ILB_RULE_REFRELE(rule);
1880 return (ILB_DROPPED);
1881 }
1882
1883 int
1884 ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
1885 uint8_t *tph, ipaddr_t *lb_dst)
1886 {
1887 in6_addr_t v6_src, v6_dst, v6_lb_dst;
1888 int ret;
1889
1890 ASSERT(DB_REF(mp) == 1);
1891
1892 if (l4 == IPPROTO_ICMP) {
1893 return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
1894 lb_dst));
1895 }
1896
1897 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
1898 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
1899 ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
1900 tph, ntohs(ipha->ipha_length), &v6_lb_dst);
1901 if (ret == ILB_BALANCED)
1902 IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
1903 return (ret);
1904 }
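/*
 * Hypothetical caller sketch (the real call sites are outside this file
 * excerpt, presumably in the IP input path): a caller is expected to act on
 * the three return values used throughout this file, with lb_dst holding
 * the chosen server address on ILB_BALANCED.
 *
 *	ipaddr_t lb_dst;
 *
 *	switch (ilb_check_v4(ilbs, ill, mp, ipha, l4, tph, &lb_dst)) {
 *	case ILB_BALANCED:
 *		-- forward mp towards lb_dst (already NATed if applicable)
 *		break;
 *	case ILB_DROPPED:
 *		freemsg(mp);
 *		break;
 *	case ILB_PASSED:
 *		-- continue normal IP input processing
 *		break;
 *	}
 */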
1905
1906 int
1907 ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
1908 uint8_t *tph, in6_addr_t *lb_dst)
1909 {
1910 uint32_t pkt_len;
1911
1912 ASSERT(DB_REF(mp) == 1);
1913
1914 if (l4 == IPPROTO_ICMPV6) {
1915 return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
1916 lb_dst));
1917 }
1918
1919 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
1920 return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
1921 IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
1922 }
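/*
 * Note on the length calculation (standard IPv6 behavior, stated here for
 * clarity): ip6_plen counts only the payload after the fixed 40-byte IPv6
 * header, so IPV6_HDR_LEN is added back to get the total packet length,
 * e.g. ip6_plen == 960 gives pkt_len == 1000.  The IPv4 wrapper above can
 * pass ntohs(ipha_length) directly because it already includes the header.
 */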
1923
1924 void
1925 ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
1926 {
1927 ilb_rule_t *tmp_rule;
1928
1929 mutex_enter(&ilbs->ilbs_g_lock);
1930 *num_rules = 0;
1931 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1932 tmp_rule = tmp_rule->ir_next) {
1933 if (tmp_rule->ir_zoneid == zoneid)
1934 *num_rules += 1;
1935 }
1936 mutex_exit(&ilbs->ilbs_g_lock);
1937 }
1938
1939 int
1940 ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1941 uint32_t *num_servers)
1942 {
1943 ilb_rule_t *rule;
1944 int err;
1945
1946 if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1947 return (err);
1948 *num_servers = rule->ir_kstat.num_servers.value.ui64;
1949 ILB_RULE_REFRELE(rule);
1950 return (0);
1951 }
1952
1953 int
1954 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1955 ilb_server_info_t *servers, uint32_t *num_servers)
1956 {
1957 ilb_rule_t *rule;
1958 ilb_server_t *server;
1959 size_t cnt;
1960 int err;
1961
1962 if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1963 return (err);
1964 for (server = rule->ir_servers, cnt = *num_servers;
1965 server != NULL && cnt > 0;
1966 server = server->iser_next, cnt--, servers++) {
1967 (void) memcpy(servers->name, server->iser_name,
1968 ILB_SERVER_NAMESZ);
1969 servers->addr = server->iser_addr_v6;
1970 servers->min_port = htons(server->iser_min_port);
1971 servers->max_port = htons(server->iser_max_port);
1972 servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
1973 servers->err = 0;
1974 }
1975 ILB_RULE_REFRELE(rule);
1976 *num_servers -= cnt;
1977
1978 return (0);
1979 }
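/*
 * Hypothetical usage sketch (variable names are assumptions; the real
 * consumer is the kernel side of the ILB administrative interface, not
 * shown here): *num_servers is an in/out parameter -- in as the capacity
 * of the servers array, out as the number of entries actually filled.
 *
 *	uint32_t cnt;
 *	ilb_server_info_t *srvs;
 *
 *	if ((err = ilb_get_num_servers(ilbs, zoneid, name, &cnt)) != 0)
 *		return (err);
 *	srvs = kmem_zalloc(cnt * sizeof (ilb_server_info_t), KM_SLEEP);
 *	err = ilb_get_servers(ilbs, zoneid, name, srvs, &cnt);
 *	-- on success, cnt is now the number of valid entries in srvs
 */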
1980
1981 void
1982 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
1983 char *buf)
1984 {
1985 ilb_rule_t *tmp_rule;
1986 int cnt;
1987
1988 if (*num_names == 0)
1989 return;
1990
1991 mutex_enter(&ilbs->ilbs_g_lock);
1992 for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1993 tmp_rule = tmp_rule->ir_next) {
1994 if (tmp_rule->ir_zoneid != zoneid)
1995 continue;
1996
1997 (void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
1998 buf += ILB_RULE_NAMESZ;
1999 if (++cnt == *num_names)
2000 break;
2001 }
2002 mutex_exit(&ilbs->ilbs_g_lock);
2003 *num_names = cnt;
2004 }
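/*
 * Buffer contract sketch (an assumption drawn from the loop above, not
 * from a caller in this excerpt): buf must provide at least *num_names
 * slots of ILB_RULE_NAMESZ bytes each; names are copied back to back and
 * *num_names is updated to the number actually copied.
 *
 *	char *buf = kmem_alloc(n * ILB_RULE_NAMESZ, KM_SLEEP);
 *	uint32_t n_names = n;
 *	ilb_get_rulenames(ilbs, zoneid, &n_names, buf);
 *	-- name i starts at buf + i * ILB_RULE_NAMESZ, for i < n_names
 */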
2005
2006 int
2007 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
2008 {
2009 ilb_rule_t *rule;
2010 int err;
2011
2012 if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
2013 return (err);
2014 }
2015
2016 	/*
2017 	 * Except for the enabled flag, none of the following will change
2018 	 * during the lifetime of a rule, so we don't hold the mutex when
2019 	 * reading them.  The worst case is reporting a stale enabled flag.
2020 	 */
2021 cmd->ip_ver = rule->ir_ipver;
2022 cmd->proto = rule->ir_proto;
2023 cmd->min_port = htons(rule->ir_min_port);
2024 cmd->max_port = htons(rule->ir_max_port);
2025
2026 cmd->vip = rule->ir_target_v6;
2027 cmd->algo = rule->ir_alg_type;
2028 cmd->topo = rule->ir_topo;
2029
2030 cmd->nat_src_start = rule->ir_nat_src_start;
2031 cmd->nat_src_end = rule->ir_nat_src_end;
2032
2033 cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
2034 cmd->nat_expiry = rule->ir_nat_expiry;
2035 cmd->sticky_expiry = rule->ir_sticky_expiry;
2036
2037 cmd->flags = 0;
2038 if (rule->ir_flags & ILB_RULE_ENABLED)
2039 cmd->flags |= ILB_RULE_ENABLED;
2040 if (rule->ir_flags & ILB_RULE_STICKY) {
2041 cmd->flags |= ILB_RULE_STICKY;
2042 cmd->sticky_mask = rule->ir_sticky_mask;
2043 }
2044
2045 ILB_RULE_REFRELE(rule);
2046 return (0);
2047 }
2048
2049 static void *
2050 ilb_stack_init(netstackid_t stackid, netstack_t *ns)
2051 {
2052 ilb_stack_t *ilbs;
2053 char tq_name[TASKQ_NAMELEN];
2054
2055 ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
2056 ilbs->ilbs_netstack = ns;
2057
2058 ilbs->ilbs_rule_head = NULL;
2059 ilbs->ilbs_g_hash = NULL;
2060 mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);
2061
2062 ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
2063 	if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
		kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
2064 		kmem_free(ilbs, sizeof (ilb_stack_t));
2065 return (NULL);
2066 }
2067
2068 /*
2069 * ilbs_conn/sticky_hash related info is initialized in
2070 * ilb_conn/sticky_hash_init().
2071 */
2072 ilbs->ilbs_conn_taskq = NULL;
2073 ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
2074 ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
2075 ilbs->ilbs_c2s_conn_hash = NULL;
2076 ilbs->ilbs_s2c_conn_hash = NULL;
2077 ilbs->ilbs_conn_timer_list = NULL;
2078
2079 ilbs->ilbs_sticky_hash = NULL;
2080 ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
2081 ilbs->ilbs_sticky_timer_list = NULL;
2082 ilbs->ilbs_sticky_taskq = NULL;
2083
2084 /* The allocation is done later when there is a rule using NAT mode. */
2085 ilbs->ilbs_nat_src = NULL;
2086 ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
2087 mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
2088 ilbs->ilbs_nat_src_tid = 0;
2089
2090 /* For listing the conn hash table */
2091 mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
2092 cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
2093 ilbs->ilbs_conn_list_busy = B_FALSE;
2094 ilbs->ilbs_conn_list_cur = 0;
2095 ilbs->ilbs_conn_list_connp = NULL;
2096
2097 /* For listing the sticky hash table */
2098 mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
2099 cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
2100 ilbs->ilbs_sticky_list_busy = B_FALSE;
2101 ilbs->ilbs_sticky_list_cur = 0;
2102 ilbs->ilbs_sticky_list_curp = NULL;
2103
2104 (void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
2105 (void *)ns);
2106 ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
2107 minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
2108
2109 return (ilbs);
2110 }
2111
2112 /* ARGSUSED */
2113 static void
2114 ilb_stack_shutdown(netstackid_t stackid, void *arg)
2115 {
2116 ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2117 ilb_rule_t *tmp_rule;
2118
2119 ilb_sticky_hash_fini(ilbs);
2120 ilb_conn_hash_fini(ilbs);
2121 mutex_enter(&ilbs->ilbs_g_lock);
2122 while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
2123 ilb_rule_hash_del(tmp_rule);
2124 ilb_rule_g_del(ilbs, tmp_rule);
2125 mutex_exit(&ilbs->ilbs_g_lock);
2126 ilb_rule_del_common(ilbs, tmp_rule);
2127 mutex_enter(&ilbs->ilbs_g_lock);
2128 }
2129 mutex_exit(&ilbs->ilbs_g_lock);
2130 if (ilbs->ilbs_nat_src != NULL)
2131 ilb_nat_src_fini(ilbs);
2132 }
2133
2134 static void
2135 ilb_stack_fini(netstackid_t stackid, void *arg)
2136 {
2137 ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2138
2139 ilb_rule_hash_fini(ilbs);
2140 taskq_destroy(ilbs->ilbs_rule_taskq);
2141 ilb_kstat_g_fini(stackid, ilbs);
2142 kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
2143 kmem_free(ilbs, sizeof (ilb_stack_t));
2144 }
2145
2146 void
2147 ilb_ddi_g_init(void)
2148 {
2149 netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
2150 ilb_stack_fini);
2151 }
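/*
 * Lifecycle note (standard netstack framework behavior, summarized here as
 * an aid): after the netstack_register() call above, ilb_stack_init() runs
 * for every IP instance (netstack) as it is created, and
 * ilb_stack_shutdown() followed by ilb_stack_fini() run as each instance is
 * torn down.  ilb_ddi_g_init() and ilb_ddi_g_destroy() themselves are
 * invoked once, at module load and unload respectively.
 */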
2152
2153 void
2154 ilb_ddi_g_destroy(void)
2155 {
2156 netstack_unregister(NS_ILB);
2157 ilb_conn_cache_fini();
2158 ilb_sticky_cache_fini();
2159 }
2160