xref: /dragonfly/sys/net/ipfw3_nat/ip_fw3_nat.c (revision 556932ec)
1 /*
2  * Copyright (c) 2014 - 2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Bill Yuan <bycn82@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "opt_ipfw.h"
36 #include "opt_inet.h"
37 #ifndef INET
38 #error IPFIREWALL3 requires INET.
39 #endif /* INET */
40 
41 #include <sys/param.h>
42 #include <sys/kernel.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/systimer.h>
48 #include <sys/in_cksum.h>
49 #include <sys/systm.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/syslog.h>
53 #include <sys/ucred.h>
54 #include <sys/lock.h>
55 
56 #include <net/ethernet.h>
57 #include <net/netmsg2.h>
58 #include <net/netisr2.h>
59 #include <net/route.h>
60 #include <net/if.h>
61 
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/ip_icmp.h>
65 #include <netinet/tcp.h>
66 #include <netinet/tcp_timer.h>
67 #include <netinet/tcp_var.h>
68 #include <netinet/tcpip.h>
69 #include <netinet/udp.h>
70 #include <netinet/udp_var.h>
71 #include <netinet/in_systm.h>
72 #include <netinet/in_var.h>
73 #include <netinet/in_pcb.h>
74 #include <netinet/ip_var.h>
75 #include <netinet/ip_divert.h>
76 #include <net/ipfw3/ip_fw.h>
77 
78 #include "ip_fw3_nat.h"
79 
80 MALLOC_DEFINE(M_IPFW3_NAT, "IP_FW3_NAT", "ipfw3_nat module");
81 
82 /*
83  * Highspeed Lockless Kernel NAT
84  *
85  * Kernel NAT
86  * The network address translation (NAT) will replace the `src` of the packet
87  * with an `alias` (alias_addr & alias_port). Accordingt to the configuration,
88  * The alias will be randomly picked from the configured range.
89  *
90  * Highspeed
91  * The first outgoing packet should trigger the creation of the `net_state`,
92  * and the `net_state` will keep in a RB-Tree for the subsequent outgoing
93  * packets.
94  * The first returning packet will trigger the creation of the `net_state2`,
95  * which will be stored in a multidimensional array of points ( of net_state2 ).
96  *
97  * Lockless
98  * The `net_state` for outgoing packet will be stored in the nat_context of
99  * current CPU. But due to the nature of the NAT, the returning packet may be
100  * handled by another CPU. Hence, The `net_state2` for the returning packet
101  * will be prepared and stored into the nat_context of the right CPU.
102  */
103 
104 struct ip_fw3_nat_context	*ip_fw3_nat_ctx[MAXCPU];
105 static struct callout 		ip_fw3_nat_cleanup_callout;
106 extern struct ipfw3_context 	*fw3_ctx[MAXCPU];
107 extern ip_fw_ctl_t 		*ip_fw3_ctl_nat_ptr;
108 
109 static int 			sysctl_var_cleanup_interval = 1;
110 static int 			sysctl_var_icmp_timeout = 10;
111 static int 			sysctl_var_tcp_timeout = 60;
112 static int 			sysctl_var_udp_timeout = 30;
113 
114 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw3_nat, CTLFLAG_RW, 0, "ipfw3 NAT");
115 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, cleanup_interval, CTLFLAG_RW,
116 		&sysctl_var_cleanup_interval, 0, "default life time");
117 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, icmp_timeout, CTLFLAG_RW,
118 		&sysctl_var_icmp_timeout, 0, "default icmp state life time");
119 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, tcp_timeout, CTLFLAG_RW,
120 		&sysctl_var_tcp_timeout, 0, "default tcp state life time");
121 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, udp_timeout, CTLFLAG_RW,
122 		&sysctl_var_udp_timeout, 0, "default udp state life time");
123 
124 RB_PROTOTYPE(state_tree, nat_state, entries, ip_fw3_nat_state_cmp);
125 RB_GENERATE(state_tree, nat_state, entries, ip_fw3_nat_state_cmp);
126 
/*
 * Incrementally update a 16-bit checksum after one 16-bit word covered by
 * it changed from `old_info` to `new_info`.
 *
 * cksum    - the current checksum field
 * old_info - the 16-bit value being replaced
 * new_info - the 16-bit value replacing it
 * is_udp   - non-zero to apply the UDP rules: a checksum of 0 is returned
 *            untouched, and a result of 0 is reported as 0xFFFF
 *
 * Returns the adjusted checksum.
 */
static __inline uint16_t
fix_cksum(uint16_t cksum, uint16_t old_info, uint16_t new_info, uint8_t is_udp)
{
	uint32_t acc;

	if (is_udp && cksum == 0)
		return (0x0000);

	/*
	 * The subtraction may wrap around in 32 bits; folding the upper half
	 * into the lower half once and dropping the carry compensates for it.
	 */
	acc = (uint32_t)cksum + old_info - new_info;
	acc = ((acc >> 16) + (acc & 0xFFFF)) & 0xFFFF;

	if (is_udp && acc == 0)
		return (0xFFFF);
	return (acc);
}
141 
142 void
143 check_nat(int *cmd_ctl, int *cmd_val, struct ip_fw_args **args,
144 		struct ip_fw **f, ipfw_insn *cmd, uint16_t ip_len)
145 {
146 	if ((*args)->eh != NULL) {
147 		*cmd_ctl = IP_FW_CTL_NO;
148 		*cmd_val = IP_FW_NOT_MATCH;
149 		return;
150 	}
151 
152 	struct ip_fw3_nat_context *nat_ctx;
153 	struct cfg_nat *nat;
154 	int nat_id;
155 
156 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
157 	(*args)->rule = *f;
158 	nat = ((ipfw_insn_nat *)cmd)->nat;
159 	if (nat == NULL) {
160 		nat_id = cmd->arg1;
161 		nat = nat_ctx->nats[nat_id - 1];
162 		if (nat == NULL) {
163 			*cmd_val = IP_FW_DENY;
164 			*cmd_ctl = IP_FW_CTL_DONE;
165 			return;
166 		}
167 		((ipfw_insn_nat *)cmd)->nat = nat;
168 	}
169 	*cmd_val = ip_fw3_nat(*args, nat, (*args)->m);
170 	*cmd_ctl = IP_FW_CTL_NAT;
171 }
172 
173 int
174 ip_fw3_nat(struct ip_fw_args *args, struct cfg_nat *nat, struct mbuf *m)
175 {
176 	struct state_tree *tree_out = NULL;
177 	struct nat_state *s = NULL, *dup, *k, key;
178 	struct nat_state2 *s2 = NULL;
179 	struct ip *ip = mtod(m, struct ip *);
180 	struct in_addr *old_addr = NULL, new_addr;
181 	uint16_t *old_port = NULL, new_port;
182 	uint16_t *csum = NULL, dlen = 0;
183 	uint8_t udp = 0;
184 	boolean_t pseudo = FALSE, need_return_state = FALSE;
185 	struct cfg_alias *alias;
186 	int i = 0, rand_n = 0;
187 
188 	k = &key;
189 	memset(k, 0, LEN_NAT_STATE);
190 	if (args->oif == NULL) {
191 		old_addr = &ip->ip_dst;
192 		k->dst_addr = ntohl(args->f_id.dst_ip);
193 		LIST_FOREACH(alias, &nat->alias, next) {
194 			if (alias->ip.s_addr == ntohl(args->f_id.dst_ip)) {
195 				break;
196 			}
197 		}
198 		if (alias == NULL) {
199 			goto oops;
200 		}
201 		switch (ip->ip_p) {
202 		case IPPROTO_TCP:
203 			old_port = &L3HDR(struct tcphdr, ip)->th_dport;
204 			s2 = alias->tcp_in[*old_port - ALIAS_BEGIN];
205 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
206 			break;
207 		case IPPROTO_UDP:
208 			old_port = &L3HDR(struct udphdr, ip)->uh_dport;
209 			s2 = alias->udp_in[*old_port - ALIAS_BEGIN];
210 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
211 			udp = 1;
212 			break;
213 		case IPPROTO_ICMP:
214 			old_port = &L3HDR(struct icmp, ip)->icmp_id;
215 			s2 = alias->icmp_in[*old_port];
216 			csum = &L3HDR(struct icmp, ip)->icmp_cksum;
217 			break;
218 		default:
219 			panic("ipfw3: unsupported proto %u", ip->ip_p);
220 		}
221 		if (s2 == NULL) {
222 			goto oops;
223 		}
224 	} else {
225 		old_addr = &ip->ip_src;
226 		k->src_addr = args->f_id.src_ip;
227 		k->dst_addr = args->f_id.dst_ip;
228 		switch (ip->ip_p) {
229 		case IPPROTO_TCP:
230 			k->src_port = args->f_id.src_port;
231 			k->dst_port = args->f_id.dst_port;
232 			m->m_pkthdr.csum_flags = CSUM_TCP;
233 			tree_out = &nat->rb_tcp_out;
234 			old_port = &L3HDR(struct tcphdr, ip)->th_sport;
235 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
236 			break;
237 		case IPPROTO_UDP:
238 			k->src_port = args->f_id.src_port;
239 			k->dst_port = args->f_id.dst_port;
240 			m->m_pkthdr.csum_flags = CSUM_UDP;
241 			tree_out = &nat->rb_udp_out;
242 			old_port = &L3HDR(struct udphdr, ip)->uh_sport;
243 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
244 			udp = 1;
245 			break;
246 		case IPPROTO_ICMP:
247 			k->src_port = L3HDR(struct icmp, ip)->icmp_id;
248 			k->dst_port = k->src_port;
249 			tree_out = &nat->rb_icmp_out;
250 			old_port = &L3HDR(struct icmp, ip)->icmp_id;
251 			csum = &L3HDR(struct icmp, ip)->icmp_cksum;
252 			break;
253 		default:
254 			panic("ipfw3: unsupported proto %u", ip->ip_p);
255 		}
256 		s = RB_FIND(state_tree, tree_out, k);
257 		if (s == NULL) {
258 			/* pick an alias ip randomly when there are multiple */
259 			if (nat->count > 1) {
260 				rand_n = krandom() % nat->count;
261 			}
262 			LIST_FOREACH(alias, &nat->alias, next) {
263 				if (i++ == rand_n) {
264 					break;
265 				}
266 			}
267 			switch  (ip->ip_p) {
268 			case IPPROTO_TCP:
269 				m->m_pkthdr.csum_flags = CSUM_TCP;
270 				s = kmalloc(LEN_NAT_STATE, M_IPFW3_NAT,
271 						M_INTWAIT | M_NULLOK | M_ZERO);
272 
273 				s->src_addr = args->f_id.src_ip;
274 				s->src_port = args->f_id.src_port;
275 
276 				s->dst_addr = args->f_id.dst_ip;
277 				s->dst_port = args->f_id.dst_port;
278 
279 				s->alias_addr = alias->ip.s_addr;
280 				pick_alias_port(s, tree_out);
281 				dup = RB_INSERT(state_tree, tree_out, s);
282 				need_return_state = TRUE;
283 				break;
284 			case IPPROTO_UDP:
285 				m->m_pkthdr.csum_flags = CSUM_UDP;
286 				s = kmalloc(LEN_NAT_STATE, M_IPFW3_NAT,
287 						M_INTWAIT | M_NULLOK | M_ZERO);
288 
289 				s->src_addr = args->f_id.src_ip;
290 				s->src_port = args->f_id.src_port;
291 
292 				s->dst_addr = args->f_id.dst_ip;
293 				s->dst_port = args->f_id.dst_port;
294 
295 				s->alias_addr = alias->ip.s_addr;
296 				pick_alias_port(s, tree_out);
297 				dup = RB_INSERT(state_tree, tree_out, s);
298 				need_return_state = TRUE;
299 				break;
300 			case IPPROTO_ICMP:
301 				s = kmalloc(LEN_NAT_STATE, M_IPFW3_NAT,
302 						M_INTWAIT | M_NULLOK | M_ZERO);
303 				s->src_addr = args->f_id.src_ip;
304 				s->dst_addr = args->f_id.dst_ip;
305 
306 				s->src_port = *old_port;
307 				s->dst_port = *old_port;
308 
309 				s->alias_addr = alias->ip.s_addr;
310 				s->alias_port = htons(s->src_addr *
311 						s->dst_addr % ALIAS_RANGE);
312 				dup = RB_INSERT(state_tree, tree_out, s);
313 
314 				s2 = kmalloc(LEN_NAT_STATE2, M_IPFW3_NAT,
315 						M_INTWAIT | M_NULLOK | M_ZERO);
316 
317 				s2->src_addr = args->f_id.dst_ip;
318 				s2->dst_addr = alias->ip.s_addr;
319 
320 				s2->src_port = s->alias_port;
321 				s2->dst_port = s->alias_port;
322 
323 				s2->alias_addr = htonl(args->f_id.src_ip);
324 				s2->alias_port = *old_port;
325 
326 				alias->icmp_in[s->alias_port] = s2;
327 				break;
328 			default :
329 				goto oops;
330 			}
331 		}
332 	}
333 	if (args->oif == NULL) {
334 		if (ip->ip_p == IPPROTO_ICMP) {
335 			new_addr.s_addr = s2->alias_addr;
336 			new_port = s2->alias_port;
337 		} else {
338 			new_addr.s_addr = s2->src_addr;
339 			new_port = s2->src_port;
340 		}
341 		s2->timestamp = time_uptime;
342 	} else {
343 		new_addr.s_addr = s->alias_addr;
344 		new_port = s->alias_port;
345 		s->timestamp = time_uptime;
346 	}
347 
348 	/* replace src/dst and fix the checksum */
349 	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
350 		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
351 			dlen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
352 		}
353 		pseudo = TRUE;
354 	}
355 	if (!pseudo) {
356 		const uint16_t *oaddr, *naddr;
357 		oaddr = (const uint16_t *)&old_addr->s_addr;
358 		naddr = (const uint16_t *)&new_addr.s_addr;
359 		ip->ip_sum = fix_cksum(ip->ip_sum, oaddr[0], naddr[0], 0);
360 		ip->ip_sum = fix_cksum(ip->ip_sum, oaddr[1], naddr[1], 0);
361 		if (ip->ip_p != IPPROTO_ICMP) {
362 			*csum = fix_cksum(*csum, oaddr[0], naddr[0], udp);
363 			*csum = fix_cksum(*csum, oaddr[1], naddr[1], udp);
364 		}
365 	}
366 	old_addr->s_addr = new_addr.s_addr;
367 	if (!pseudo) {
368 		*csum = fix_cksum(*csum, *old_port, new_port, udp);
369 	}
370 	*old_port = new_port;
371 
372 	if (pseudo) {
373 		*csum = in_pseudo(ip->ip_src.s_addr,
374 				ip->ip_dst.s_addr, htons(dlen + ip->ip_p));
375 	}
376 
377 	/* prepare the state for return traffic */
378 	if (need_return_state) {
379 		m->m_flags &= ~M_HASH;
380 		ip_hashfn(&m, 0);
381 
382 		int nextcpu = netisr_hashcpu(m->m_pkthdr.hash);
383 		if (nextcpu != mycpuid) {
384 			struct netmsg_nat_state_add *msg;
385 			msg = kmalloc(LEN_NMSG_NAT_STATE_ADD,
386 					M_LWKTMSG, M_NOWAIT | M_ZERO);
387 			netmsg_init(&msg->base, NULL, &curthread->td_msgport,
388 					0, nat_state_add_dispatch);
389 			s2 = kmalloc(LEN_NAT_STATE2, M_IPFW3_NAT,
390 					M_INTWAIT | M_NULLOK | M_ZERO);
391 
392 			s2->src_addr = args->f_id.dst_ip;
393 			s2->src_port = args->f_id.dst_port;
394 
395 			s2->dst_addr = alias->ip.s_addr;
396 			s2->dst_port = s->alias_port;
397 
398 			s2->src_addr = htonl(args->f_id.src_ip);
399 			s2->src_port = htons(args->f_id.src_port);
400 
401 			s2->timestamp = s->timestamp;
402 			msg->alias_addr.s_addr = alias->ip.s_addr;
403 			msg->alias_port = s->alias_port;
404 			msg->state = s2;
405 			msg->nat_id = nat->id;
406 			msg->proto = ip->ip_p;
407 			netisr_sendmsg(&msg->base, nextcpu);
408 		} else {
409 			s2 = kmalloc(LEN_NAT_STATE2, M_IPFW3_NAT,
410 					M_INTWAIT | M_NULLOK | M_ZERO);
411 
412 			s2->src_addr = args->f_id.dst_ip;
413 			s2->dst_addr = alias->ip.s_addr;
414 
415 			s2->src_port = s->alias_port;
416 			s2->dst_port = s->alias_port;
417 
418 			s2->src_addr = htonl(args->f_id.src_ip);
419 			s2->src_port = htons(args->f_id.src_port);
420 
421 			s2->timestamp = s->timestamp;
422 			if (ip->ip_p == IPPROTO_TCP) {
423 				alias->tcp_in[s->alias_port - ALIAS_BEGIN] = s2;
424 			} else {
425 				alias->udp_in[s->alias_port - ALIAS_BEGIN] = s2;
426 			}
427 		}
428 	}
429 	return IP_FW_NAT;
430 oops:
431 	IPFW3_DEBUG1("oops\n");
432 	return IP_FW_DENY;
433 }
434 
435 void
436 pick_alias_port(struct nat_state *s, struct state_tree *tree)
437 {
438 	do {
439 		s->alias_port = htons(krandom() % ALIAS_RANGE + ALIAS_BEGIN);
440 	} while (RB_FIND(state_tree, tree, s) != NULL);
441 }
442 
443 int
444 ip_fw3_nat_state_cmp(struct nat_state *s1, struct nat_state *s2)
445 {
446 	if (s1->src_addr > s2->src_addr)
447 		return 1;
448 	if (s1->src_addr < s2->src_addr)
449 		return -1;
450 
451 	if (s1->dst_addr > s2->dst_addr)
452 		return 1;
453 	if (s1->dst_addr < s2->dst_addr)
454 		return -1;
455 
456 	if (s1->src_port > s2->src_port)
457 		return 1;
458 	if (s1->src_port < s2->src_port)
459 		return -1;
460 
461 	if (s1->dst_port > s2->dst_port)
462 		return 1;
463 	if (s1->dst_port < s2->dst_port)
464 		return -1;
465 
466 	return 0;
467 }
468 
469 int
470 ip_fw3_ctl_nat_get_cfg(struct sockopt *sopt)
471 {
472 	struct ip_fw3_nat_context *nat_ctx;
473 	struct ioc_nat *ioc;
474 	struct cfg_nat *nat;
475 	struct cfg_alias *alias;
476 	struct in_addr *ip;
477 	size_t valsize;
478 	int i, len;
479 
480 	len = 0;
481 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
482 	valsize = sopt->sopt_valsize;
483 	ioc = (struct ioc_nat *)sopt->sopt_val;
484 
485 	for (i = 0; i < NAT_ID_MAX; i++) {
486 		nat = nat_ctx->nats[i];
487 		if (nat != NULL) {
488 			len += LEN_IOC_NAT;
489 			if (len >= valsize) {
490 				goto nospace;
491 			}
492 			ioc->id = nat->id;
493 			ioc->count = nat->count;
494 			ip = &ioc->ip;
495 			LIST_FOREACH(alias, &nat->alias, next) {
496 				len += LEN_IN_ADDR;
497 				if (len > valsize) {
498 					goto nospace;
499 				}
500 				bcopy(&alias->ip, ip, LEN_IN_ADDR);
501 				ip++;
502 			}
503 		}
504 	}
505 	sopt->sopt_valsize = len;
506 	return 0;
507 nospace:
508 	bzero(sopt->sopt_val, sopt->sopt_valsize);
509 	sopt->sopt_valsize = 0;
510 	return 0;
511 }
512 
513 int
514 ip_fw3_ctl_nat_get_record(struct sockopt *sopt)
515 {
516 	struct ip_fw3_nat_context *nat_ctx;
517 	struct cfg_nat *the;
518 	size_t sopt_size, total_len = 0;
519 	struct ioc_nat_state *ioc;
520 	int ioc_nat_id, i, n, cpu;
521 	struct nat_state 	*s;
522 	struct nat_state2 	*s2;
523 	struct cfg_alias	*a1;
524 
525 	ioc_nat_id = *((int *)(sopt->sopt_val));
526 	sopt_size = sopt->sopt_valsize;
527 	ioc = (struct ioc_nat_state *)sopt->sopt_val;
528 	/* icmp states only in CPU 0 */
529 	cpu = 0;
530 	nat_ctx = ip_fw3_nat_ctx[cpu];
531 	for (n = 0; n < NAT_ID_MAX; n++) {
532 		if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
533 			if (nat_ctx->nats[n] == NULL)
534 				break;
535 			the = nat_ctx->nats[n];
536 			RB_FOREACH(s, state_tree, &the->rb_icmp_out) {
537 				total_len += LEN_IOC_NAT_STATE;
538 				if (total_len > sopt_size)
539 					goto nospace;
540 				ioc->src_addr.s_addr = ntohl(s->src_addr);
541 				ioc->dst_addr.s_addr = s->dst_addr;
542 				ioc->alias_addr.s_addr = s->alias_addr;
543 				ioc->src_port = s->src_port;
544 				ioc->dst_port = s->dst_port;
545 				ioc->alias_port = s->alias_port;
546 				ioc->nat_id = n + 1;
547 				ioc->cpu_id = cpu;
548 				ioc->proto = IPPROTO_ICMP;
549 				ioc->direction = 1;
550 				ioc->life = s->timestamp +
551 					sysctl_var_icmp_timeout - time_uptime;
552 				ioc++;
553 			}
554 
555 			LIST_FOREACH(a1, &the->alias, next) {
556 			for (i = 0; i < ALIAS_RANGE; i++) {
557 				s2 = a1->icmp_in[i];
558 				if (s2 == NULL) {
559 					continue;
560 				}
561 
562 				total_len += LEN_IOC_NAT_STATE;
563 				if (total_len > sopt_size)
564 					goto nospace;
565 
566 				ioc->src_addr.s_addr = ntohl(s2->src_addr);
567 				ioc->dst_addr.s_addr = s2->dst_addr;
568 				ioc->alias_addr.s_addr = s2->alias_addr;
569 				ioc->src_port = s2->src_port;
570 				ioc->dst_port = s2->dst_port;
571 				ioc->alias_port = s2->alias_port;
572 				ioc->nat_id = n + 1;
573 				ioc->cpu_id = cpu;
574 				ioc->proto = IPPROTO_ICMP;
575 				ioc->direction = 0;
576 				ioc->life = s2->timestamp +
577 					sysctl_var_icmp_timeout - time_uptime;
578 				ioc++;
579 			}
580 			}
581 		}
582 	}
583 
584 	/* tcp states */
585 	for (cpu = 0; cpu < ncpus; cpu++) {
586 		nat_ctx = ip_fw3_nat_ctx[cpu];
587 		for (n = 0; n < NAT_ID_MAX; n++) {
588 			if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
589 				if (nat_ctx->nats[n] == NULL)
590 					break;
591 				the = nat_ctx->nats[n];
592 				RB_FOREACH(s, state_tree, &the->rb_tcp_out) {
593 					total_len += LEN_IOC_NAT_STATE;
594 					if (total_len > sopt_size)
595 						goto nospace;
596 					ioc->src_addr.s_addr = ntohl(s->src_addr);
597 					ioc->dst_addr.s_addr = ntohl(s->dst_addr);
598 					ioc->alias_addr.s_addr = s->alias_addr;
599 					ioc->src_port = ntohs(s->src_port);
600 					ioc->dst_port = ntohs(s->dst_port);
601 					ioc->alias_port = s->alias_port;
602 					ioc->nat_id = n + 1;
603 					ioc->cpu_id = cpu;
604 					ioc->proto = IPPROTO_TCP;
605 					ioc->direction = 1;
606 					ioc->life = s->timestamp +
607 						sysctl_var_tcp_timeout - time_uptime;
608 					ioc++;
609 				}
610 				LIST_FOREACH(a1, &the->alias, next) {
611 					for (i = 0; i < ALIAS_RANGE; i++) {
612 						s2 = a1->tcp_in[i];
613 						if (s2 == NULL) {
614 							continue;
615 						}
616 
617 						total_len += LEN_IOC_NAT_STATE;
618 						if (total_len > sopt_size)
619 							goto nospace;
620 
621 						ioc->src_addr.s_addr = ntohl(s2->src_addr);
622 						ioc->dst_addr.s_addr = s2->dst_addr;
623 						ioc->alias_addr.s_addr = s2->alias_addr;
624 						ioc->src_port = s2->src_port;
625 						ioc->dst_port = s2->dst_port;
626 						ioc->alias_port = s2->alias_port;
627 						ioc->nat_id = n + 1;
628 						ioc->cpu_id = cpu;
629 						ioc->proto = IPPROTO_TCP;
630 						ioc->direction = 0;
631 						ioc->life = s2->timestamp +
632 							sysctl_var_icmp_timeout - time_uptime;
633 						ioc++;
634 					}
635 				}
636 			}
637 		}
638 	}
639 
640 	/* udp states */
641 	for (cpu = 0; cpu < ncpus; cpu++) {
642 		nat_ctx = ip_fw3_nat_ctx[cpu];
643 		for (n = 0; n < NAT_ID_MAX; n++) {
644 			if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
645 				if (nat_ctx->nats[n] == NULL)
646 					break;
647 				the = nat_ctx->nats[n];
648 				RB_FOREACH(s, state_tree, &the->rb_udp_out) {
649 					total_len += LEN_IOC_NAT_STATE;
650 					if (total_len > sopt_size)
651 						goto nospace;
652 					ioc->src_addr.s_addr = ntohl(s->src_addr);
653 					ioc->dst_addr.s_addr = s->dst_addr;
654 					ioc->alias_addr.s_addr = s->alias_addr;
655 					ioc->src_port = s->src_port;
656 					ioc->dst_port = s->dst_port;
657 					ioc->alias_port = s->alias_port;
658 					ioc->nat_id = n + 1;
659 					ioc->cpu_id = cpu;
660 					ioc->proto = IPPROTO_UDP;
661 					ioc->direction = 1;
662 					ioc->life = s->timestamp +
663 						sysctl_var_udp_timeout - time_uptime;
664 					ioc++;
665 				}
666 				LIST_FOREACH(a1, &the->alias, next) {
667 					for (i = 0; i < ALIAS_RANGE; i++) {
668 						s2 = a1->udp_in[i];
669 						if (s2 == NULL) {
670 							continue;
671 						}
672 
673 						total_len += LEN_IOC_NAT_STATE;
674 						if (total_len > sopt_size)
675 							goto nospace;
676 
677 						ioc->src_addr.s_addr = ntohl(s2->src_addr);
678 						ioc->dst_addr.s_addr = s2->dst_addr;
679 						ioc->alias_addr.s_addr = s2->alias_addr;
680 						ioc->src_port = s2->src_port;
681 						ioc->dst_port = s2->dst_port;
682 						ioc->alias_port = s2->alias_port;
683 						ioc->nat_id = n + 1;
684 						ioc->cpu_id = cpu;
685 						ioc->proto = IPPROTO_UDP;
686 						ioc->direction = 0;
687 						ioc->life = s2->timestamp +
688 							sysctl_var_icmp_timeout - time_uptime;
689 						ioc++;
690 					}
691 				}
692 			}
693 		}
694 	}
695 	sopt->sopt_valsize = total_len;
696 	return 0;
697 nospace:
698 	return 0;
699 }
700 
701 void
702 nat_state_add_dispatch(netmsg_t add_msg)
703 {
704 	struct ip_fw3_nat_context *nat_ctx;
705 	struct netmsg_nat_state_add *msg;
706 	struct cfg_nat *nat;
707 	struct nat_state2 *s2;
708 	struct cfg_alias *alias;
709 
710 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
711 	msg = (struct netmsg_nat_state_add *)add_msg;
712 	nat = nat_ctx->nats[msg->nat_id - 1];
713 
714 	LIST_FOREACH(alias, &nat->alias, next) {
715 		if (alias->ip.s_addr == msg->alias_addr.s_addr) {
716 			break;
717 		}
718 	}
719 	s2 = msg->state;
720 	if (msg->proto == IPPROTO_TCP) {
721 		alias->tcp_in[msg->alias_port - ALIAS_BEGIN] = s2;
722 	} else {
723 		alias->udp_in[msg->alias_port - ALIAS_BEGIN] = s2;
724 	}
725 }
726 
727 /*
728  * Init the RB trees only when the NAT is configured.
729  */
730 void
731 nat_add_dispatch(netmsg_t nat_add_msg)
732 {
733 	struct ip_fw3_nat_context *nat_ctx;
734 	struct netmsg_nat_add *msg;
735 	struct ioc_nat *ioc;
736 	struct cfg_nat *nat;
737 	struct cfg_alias *alias;
738 	struct in_addr *ip;
739 	int n;
740 
741 	msg = (struct netmsg_nat_add *)nat_add_msg;
742 	ioc = &msg->ioc_nat;
743 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
744 
745 	if (nat_ctx->nats[ioc->id - 1] == NULL) {
746 		/* op = set, and nat not exists */
747 		nat = kmalloc(LEN_CFG_NAT, M_IPFW3_NAT, M_WAITOK | M_ZERO);
748 		LIST_INIT(&nat->alias);
749 		RB_INIT(&nat->rb_tcp_out);
750 		RB_INIT(&nat->rb_udp_out);
751 		if (mycpuid == 0) {
752 			RB_INIT(&nat->rb_icmp_out);
753 		}
754 		nat->id = ioc->id;
755 		nat->count = ioc->count;
756 		ip = &ioc->ip;
757 		for (n = 0; n < ioc->count; n++) {
758 			alias = kmalloc(LEN_CFG_ALIAS,
759 					M_IPFW3_NAT, M_WAITOK | M_ZERO);
760 			memcpy(&alias->ip, ip, LEN_IN_ADDR);
761 			LIST_INSERT_HEAD((&nat->alias), alias, next);
762 			ip++;
763 		}
764 		nat_ctx->nats[ioc->id - 1] = nat;
765 	}
766 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
767 }
768 
769 int
770 ip_fw3_ctl_nat_add(struct sockopt *sopt)
771 {
772 	struct netmsg_nat_add nat_add_msg, *msg;
773 	struct ioc_nat *ioc;
774 	msg = &nat_add_msg;
775 
776 	ioc = (struct ioc_nat *)(sopt->sopt_val);
777 	sooptcopyin(sopt, &msg->ioc_nat, sopt->sopt_valsize,
778 			sizeof(struct ioc_nat));
779 	netmsg_init(&msg->base, NULL, &curthread->td_msgport, 0,
780 			nat_add_dispatch);
781 	netisr_domsg(&msg->base, 0);
782 	return 0;
783 }
784 
785 void
786 nat_del_dispatch(netmsg_t nat_del_msg)
787 {
788 	struct ip_fw3_nat_context *nat_ctx;
789 	struct netmsg_nat_del *msg;
790 	struct cfg_nat *nat;
791 	struct nat_state *s, *tmp;
792 	struct cfg_alias *alias, *tmp3;
793 
794 	msg = (struct netmsg_nat_del *)nat_del_msg;
795 
796 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
797 	nat = nat_ctx->nats[msg->id - 1];
798 	if (nat != NULL) {
799 		/* the icmp states will only stored in cpu 0 */
800 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_out, tmp) {
801 			RB_REMOVE(state_tree, &nat->rb_icmp_out, s);
802 			if (s != NULL) {
803 				kfree(s, M_IPFW3_NAT);
804 			}
805 		}
806 		/*
807 		LIST_FOREACH_MUTABLE(s2, &nat->alias->icmp_in, next, tmp2) {
808 			LIST_REMOVE(s2, next);
809 			if (s != NULL) {
810 				kfree(s, M_IPFW3_NAT);
811 			}
812 		}
813 		*/
814 
815 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_out, tmp) {
816 			RB_REMOVE(state_tree, &nat->rb_tcp_out, s);
817 			if (s != NULL) {
818 				kfree(s, M_IPFW3_NAT);
819 			}
820 		}
821 		/*
822 		LIST_FOREACH_MUTABLE(s2, &nat->alias->tcp_in, next, tmp2) {
823 			LIST_REMOVE(s2, next);
824 			if (s != NULL) {
825 				kfree(s, M_IPFW3_NAT);
826 			}
827 		}
828 		*/
829 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_out, tmp) {
830 			RB_REMOVE(state_tree, &nat->rb_udp_out, s);
831 			if (s != NULL) {
832 				kfree(s, M_IPFW3_NAT);
833 			}
834 		}
835 		/*
836 		LIST_FOREACH_MUTABLE(s2, &nat->alias->udp_in, next, tmp2) {
837 			LIST_REMOVE(s2, next);
838 			if (s != NULL) {
839 				kfree(s, M_IPFW3_NAT);
840 			}
841 		}
842 		*/
843 		LIST_FOREACH_MUTABLE(alias, &nat->alias, next, tmp3) {
844 			kfree(alias, M_IPFW3_NAT);
845 		}
846 		kfree(nat, M_IPFW3_NAT);
847 		nat_ctx->nats[msg->id - 1] = NULL;
848 	}
849 	netisr_forwardmsg_all(&nat_del_msg->base, mycpuid + 1);
850 }
851 int
852 ip_fw3_ctl_nat_del(struct sockopt *sopt)
853 {
854 	struct netmsg_nat_del nat_del_msg, *msg;
855 
856 	msg = &nat_del_msg;
857 	msg->id = *((int *)sopt->sopt_val);
858 	netmsg_init(&msg->base, NULL, &curthread->td_msgport,
859 			0, nat_del_dispatch);
860 
861 	netisr_domsg(&msg->base, 0);
862 	return 0;
863 }
864 int
865 ip_fw3_ctl_nat_flush(struct sockopt *sopt)
866 {
867 	struct netmsg_nat_del nat_del_msg, *msg;
868 	int i;
869 	msg = &nat_del_msg;
870 	for (i = 0; i < NAT_ID_MAX; i++) {
871 		msg->id = i + 1;
872 		netmsg_init(&msg->base, NULL, &curthread->td_msgport,
873 				0, nat_del_dispatch);
874 
875 		netisr_domsg(&msg->base, 0);
876 	}
877 	return 0;
878 }
879 
880 int
881 ip_fw3_ctl_nat_sockopt(struct sockopt *sopt)
882 {
883 	int error = 0;
884 	switch (sopt->sopt_name) {
885 	case IP_FW_NAT_ADD:
886 		error = ip_fw3_ctl_nat_add(sopt);
887 		break;
888 	case IP_FW_NAT_DEL:
889 		error = ip_fw3_ctl_nat_del(sopt);
890 		break;
891 	case IP_FW_NAT_FLUSH:
892 		error = ip_fw3_ctl_nat_flush(sopt);
893 		break;
894 	case IP_FW_NAT_GET:
895 		error = ip_fw3_ctl_nat_get_cfg(sopt);
896 		break;
897 	case IP_FW_NAT_GET_RECORD:
898 		error = ip_fw3_ctl_nat_get_record(sopt);
899 		break;
900 	default:
901 		kprintf("ipfw3 nat invalid socket option %d\n",
902 				sopt->sopt_name);
903 	}
904 	return error;
905 }
906 
907 void
908 nat_init_ctx_dispatch(netmsg_t msg)
909 {
910 	struct ip_fw3_nat_context *tmp;
911 	tmp = kmalloc(sizeof(struct ip_fw3_nat_context),
912 				M_IPFW3_NAT, M_WAITOK | M_ZERO);
913 
914 	ip_fw3_nat_ctx[mycpuid] = tmp;
915 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
916 }
917 
918 void
919 nat_fnit_ctx_dispatch(netmsg_t msg)
920 {
921 	kfree(ip_fw3_nat_ctx[mycpuid], M_IPFW3_NAT);
922 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
923 }
924 
925 static void
926 nat_cleanup_func_dispatch(netmsg_t nmsg)
927 {
928 	struct nat_state *s, *tmp;
929 	struct ip_fw3_nat_context *nat_ctx;
930 	struct cfg_nat *nat;
931 	struct cfg_alias *a1, *tmp2;
932 	struct nat_state2 *s2;
933 	int i, j;
934 
935 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
936 	for (j = 0; j < NAT_ID_MAX; j++) {
937 		nat = nat_ctx->nats[j];
938 		if (nat == NULL)
939 			continue;
940 		/* check the nat_states, remove the expired state */
941 		/* the icmp states will only stored in cpu 0 */
942 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_out, tmp) {
943 			if (time_uptime - s->timestamp > sysctl_var_icmp_timeout) {
944 				RB_REMOVE(state_tree, &nat->rb_icmp_out, s);
945 				kfree(s, M_IPFW3_NAT);
946 			}
947 		}
948 		LIST_FOREACH_MUTABLE(a1, &nat->alias, next, tmp2) {
949 			for (i = 0; i < ALIAS_RANGE; i++) {
950 				s2 = a1->icmp_in[i];
951 				if (s2 != NULL) {
952 					if (time_uptime - s2->timestamp > sysctl_var_icmp_timeout) {
953 						a1->icmp_in[i] = NULL;
954 						kfree(s2, M_IPFW3_NAT);
955 					}
956 				}
957 
958 			}
959 		}
960 
961 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_out, tmp) {
962 			if (time_uptime - s->timestamp > sysctl_var_tcp_timeout) {
963 				RB_REMOVE(state_tree, &nat->rb_tcp_out, s);
964 				kfree(s, M_IPFW3_NAT);
965 			}
966 		}
967 		LIST_FOREACH_MUTABLE(a1, &nat->alias, next, tmp2) {
968 			for (i = 0; i < ALIAS_RANGE; i++) {
969 				s2 = a1->tcp_in[i];
970 				if (s2 != NULL) {
971 					if (time_uptime - s2->timestamp > sysctl_var_icmp_timeout) {
972 						a1->tcp_in[i] = NULL;
973 						kfree(s2, M_IPFW3_NAT);
974 					}
975 				}
976 
977 			}
978 		}
979 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_out, tmp) {
980 			if (time_uptime - s->timestamp > sysctl_var_udp_timeout) {
981 				RB_REMOVE(state_tree, &nat->rb_udp_out, s);
982 				kfree(s, M_IPFW3_NAT);
983 			}
984 		}
985 		LIST_FOREACH_MUTABLE(a1, &nat->alias, next, tmp2) {
986 			for (i = 0; i < ALIAS_RANGE; i++) {
987 				s2 = a1->udp_in[i];
988 				if (s2 != NULL) {
989 					if (time_uptime - s2->timestamp > sysctl_var_icmp_timeout) {
990 						a1->udp_in[i] = NULL;
991 						kfree(s2, M_IPFW3_NAT);
992 					}
993 				}
994 
995 			}
996 		}
997 	}
998 	netisr_forwardmsg_all(&nmsg->base, mycpuid + 1);
999 }
1000 
1001 static void
1002 ip_fw3_nat_cleanup_func(void *dummy __unused)
1003 {
1004 	struct netmsg_base msg;
1005 	netmsg_init(&msg, NULL, &curthread->td_msgport, 0,
1006 			nat_cleanup_func_dispatch);
1007 	netisr_domsg(&msg, 0);
1008 
1009 	callout_reset(&ip_fw3_nat_cleanup_callout,
1010 			sysctl_var_cleanup_interval * hz,
1011 			ip_fw3_nat_cleanup_func, NULL);
1012 }
1013 
1014 static
1015 int ip_fw3_nat_init(void)
1016 {
1017 	struct netmsg_base msg;
1018 	ip_fw3_register_module(MODULE_NAT_ID, MODULE_NAT_NAME);
1019 	ip_fw3_register_filter_funcs(MODULE_NAT_ID, O_NAT_NAT,
1020 			(filter_func)check_nat);
1021 	ip_fw3_ctl_nat_ptr = ip_fw3_ctl_nat_sockopt;
1022 	netmsg_init(&msg, NULL, &curthread->td_msgport,
1023 			0, nat_init_ctx_dispatch);
1024 	netisr_domsg(&msg, 0);
1025 
1026 	callout_init_mp(&ip_fw3_nat_cleanup_callout);
1027 	callout_reset(&ip_fw3_nat_cleanup_callout,
1028 			sysctl_var_cleanup_interval * hz,
1029 			ip_fw3_nat_cleanup_func,
1030 			NULL);
1031 	return 0;
1032 }
1033 
1034 static int
1035 ip_fw3_nat_fini(void)
1036 {
1037 	struct netmsg_base msg;
1038 	struct netmsg_nat_del nat_del_msg, *msg1;
1039 	int i;
1040 
1041 	callout_stop(&ip_fw3_nat_cleanup_callout);
1042 
1043 	msg1 = &nat_del_msg;
1044 	for (i = 0; i < NAT_ID_MAX; i++) {
1045 		msg1->id = i + 1;
1046 		netmsg_init(&msg1->base, NULL, &curthread->td_msgport,
1047 				0, nat_del_dispatch);
1048 
1049 		netisr_domsg(&msg1->base, 0);
1050 	}
1051 
1052 	netmsg_init(&msg, NULL, &curthread->td_msgport,
1053 			0, nat_fnit_ctx_dispatch);
1054 	netisr_domsg(&msg, 0);
1055 
1056 	return ip_fw3_unregister_module(MODULE_NAT_ID);
1057 }
1058 
1059 static int
1060 ip_fw3_nat_modevent(module_t mod, int type, void *data)
1061 {
1062 	switch (type) {
1063 	case MOD_LOAD:
1064 		return ip_fw3_nat_init();
1065 	case MOD_UNLOAD:
1066 		return ip_fw3_nat_fini();
1067 	default:
1068 		break;
1069 	}
1070 	return 0;
1071 }
1072 
1073 moduledata_t ip_fw3_nat_mod = {
1074 	"ipfw3_nat",
1075 	ip_fw3_nat_modevent,
1076 	NULL
1077 };
1078 
1079 DECLARE_MODULE(ipfw3_nat, ip_fw3_nat_mod,
1080 		SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
1081 MODULE_DEPEND(ipfw3_nat, ipfw3_basic, 1, 1, 1);
1082 MODULE_VERSION(ipfw3_nat, 1);
1083