xref: /dragonfly/sys/net/ipfw3_nat/ip_fw3_nat.c (revision b0ae6fbc)
1 /*
2  * Copyright (c) 2014 - 2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Bill Yuan <bycn82@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "opt_ipfw.h"
36 #include "opt_inet.h"
37 #ifndef INET
38 #error IPFIREWALL3 requires INET.
39 #endif /* INET */
40 
41 #include <sys/param.h>
42 #include <sys/kernel.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/systimer.h>
48 #include <sys/thread2.h>
49 #include <sys/in_cksum.h>
50 #include <sys/systm.h>
51 #include <sys/proc.h>
52 #include <sys/socket.h>
53 #include <sys/syslog.h>
54 #include <sys/ucred.h>
55 #include <sys/lock.h>
56 #include <sys/mplock2.h>
57 
58 #include <net/ethernet.h>
59 #include <net/netmsg2.h>
60 #include <net/netisr2.h>
61 #include <net/route.h>
62 #include <net/if.h>
63 
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/ip_icmp.h>
67 #include <netinet/tcp.h>
68 #include <netinet/tcp_timer.h>
69 #include <netinet/tcp_var.h>
70 #include <netinet/tcpip.h>
71 #include <netinet/udp.h>
72 #include <netinet/udp_var.h>
73 #include <netinet/in_systm.h>
74 #include <netinet/in_var.h>
75 #include <netinet/in_pcb.h>
76 #include <netinet/ip_var.h>
77 #include <netinet/ip_divert.h>
78 #include <net/ipfw3/ip_fw.h>
79 
80 #include "ip_fw3_nat.h"
81 
82 MALLOC_DEFINE(M_IPFW3_NAT, "IP_FW3_NAT", "ipfw3_nat module");
83 
/*
 * Highspeed Lockless Kernel NAT
 *
 * Kernel NAT
 * The network address translation (NAT) will replace the `src` of the packet
 * with an `alias` (alias_addr & alias_port). According to the configuration,
 * the alias will be randomly picked from the configured range.
 *
 * Highspeed
 * The first outgoing packet should trigger the creation of the `nat_state`,
 * and the `nat_state` will be kept in a RB-Tree for the subsequent outgoing
 * packets.
 * The first returning packet will trigger the creation of the `nat_state2`,
 * which will be stored in a multidimensional array of pointers (to nat_state2).
 *
 * Lockless
 * The `nat_state` for the outgoing packet will be stored in the nat_context of
 * the current CPU. But due to the nature of the NAT, the returning packet may
 * be handled by another CPU. Hence, the `nat_state2` for the returning packet
 * will be prepared and stored into the nat_context of the right CPU.
 */
105 
106 struct ip_fw3_nat_context	*ip_fw3_nat_ctx[MAXCPU];
107 static struct callout 		ip_fw3_nat_cleanup_callout;
108 extern struct ipfw3_context 	*fw3_ctx[MAXCPU];
109 extern ip_fw_ctl_t 		*ip_fw3_ctl_nat_ptr;
110 
111 static int 			sysctl_var_cleanup_interval = 1;
112 static int 			sysctl_var_icmp_timeout = 10;
113 static int 			sysctl_var_tcp_timeout = 60;
114 static int 			sysctl_var_udp_timeout = 30;
115 
116 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw3_nat, CTLFLAG_RW, 0, "ipfw3 NAT");
117 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, cleanup_interval, CTLFLAG_RW,
118 		&sysctl_var_cleanup_interval, 0, "default life time");
119 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, icmp_timeout, CTLFLAG_RW,
120 		&sysctl_var_icmp_timeout, 0, "default icmp state life time");
121 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, tcp_timeout, CTLFLAG_RW,
122 		&sysctl_var_tcp_timeout, 0, "default tcp state life time");
123 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, udp_timeout, CTLFLAG_RW,
124 		&sysctl_var_udp_timeout, 0, "default udp state life time");
125 
126 RB_PROTOTYPE(state_tree, nat_state, entries, ip_fw3_nat_state_cmp);
127 RB_GENERATE(state_tree, nat_state, entries, ip_fw3_nat_state_cmp);
128 
/*
 * Incrementally adjust a 16-bit one's-complement checksum after a single
 * 16-bit word it covers changed from old_info to new_info.
 *
 * cksum     current checksum field value
 * old_info  previous value of the changed word
 * new_info  replacement value of the changed word
 * is_udp    non-zero to apply the UDP zero-checksum rules
 *
 * Returns the adjusted checksum.  With is_udp set, a checksum of 0 is
 * returned untouched (UDP uses 0 for "no checksum", RFC 768) and a result
 * of 0 is reported as 0xFFFF.
 */
static __inline uint16_t
fix_cksum(uint16_t cksum, uint16_t old_info, uint16_t new_info, uint8_t is_udp)
{
	uint32_t acc;

	if (is_udp && cksum == 0)
		return (0x0000);

	/* may wrap below zero; the fold below turns that into a borrow */
	acc = (uint32_t)cksum + old_info - new_info;
	acc = ((acc >> 16) + (acc & 0xFFFF)) & 0xFFFF;

	if (is_udp && acc == 0)
		return (0xFFFF);
	return (uint16_t)acc;
}
143 
144 void
145 check_nat(int *cmd_ctl, int *cmd_val, struct ip_fw_args **args,
146 		struct ip_fw **f, ipfw_insn *cmd, uint16_t ip_len)
147 {
148 	if ((*args)->eh != NULL) {
149 		*cmd_ctl = IP_FW_CTL_NO;
150 		*cmd_val = IP_FW_NOT_MATCH;
151 		return;
152 	}
153 
154 	struct ip_fw3_nat_context *nat_ctx;
155 	struct cfg_nat *nat;
156 	int nat_id;
157 
158 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
159 	(*args)->rule = *f;
160 	nat = ((ipfw_insn_nat *)cmd)->nat;
161 	if (nat == NULL) {
162 		nat_id = cmd->arg1;
163 		nat = nat_ctx->nats[nat_id - 1];
164 		if (nat == NULL) {
165 			*cmd_val = IP_FW_DENY;
166 			*cmd_ctl = IP_FW_CTL_DONE;
167 			return;
168 		}
169 		((ipfw_insn_nat *)cmd)->nat = nat;
170 	}
171 	*cmd_val = ip_fw3_nat(*args, nat, (*args)->m);
172 	*cmd_ctl = IP_FW_CTL_NAT;
173 }
174 
175 int
176 ip_fw3_nat(struct ip_fw_args *args, struct cfg_nat *nat, struct mbuf *m)
177 {
178 	struct state_tree *tree_out = NULL;
179 	struct nat_state *s = NULL, *dup, *k, key;
180 	struct nat_state2 *s2 = NULL;
181 	struct ip *ip = mtod(m, struct ip *);
182 	struct in_addr *old_addr = NULL, new_addr;
183 	uint16_t *old_port = NULL, new_port;
184 	uint16_t *csum = NULL, dlen = 0;
185 	uint8_t udp = 0;
186 	boolean_t pseudo = FALSE, need_return_state = FALSE;
187 	struct cfg_alias *alias;
188 	int i = 0, rand_n = 0;
189 
190 	k = &key;
191 	memset(k, 0, LEN_NAT_STATE);
192 	if (args->oif == NULL) {
193 		old_addr = &ip->ip_dst;
194 		k->dst_addr = ntohl(args->f_id.dst_ip);
195 		LIST_FOREACH(alias, &nat->alias, next) {
196 			if (alias->ip.s_addr == ntohl(args->f_id.dst_ip)) {
197 				break;
198 			}
199 		}
200 		if (alias == NULL) {
201 			goto oops;
202 		}
203 		switch (ip->ip_p) {
204 		case IPPROTO_TCP:
205 			old_port = &L3HDR(struct tcphdr, ip)->th_dport;
206 			s2 = alias->tcp_in[*old_port - ALIAS_BEGIN];
207 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
208 			break;
209 		case IPPROTO_UDP:
210 			old_port = &L3HDR(struct udphdr, ip)->uh_dport;
211 			s2 = alias->udp_in[*old_port - ALIAS_BEGIN];
212 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
213 			udp = 1;
214 			break;
215 		case IPPROTO_ICMP:
216 			old_port = &L3HDR(struct icmp, ip)->icmp_id;
217 			s2 = alias->icmp_in[*old_port];
218 			csum = &L3HDR(struct icmp, ip)->icmp_cksum;
219 			break;
220 		default:
221 			panic("ipfw3: unsupported proto %u", ip->ip_p);
222 		}
223 		if (s2 == NULL) {
224 			goto oops;
225 		}
226 	} else {
227 		old_addr = &ip->ip_src;
228 		k->src_addr = args->f_id.src_ip;
229 		k->dst_addr = args->f_id.dst_ip;
230 		switch (ip->ip_p) {
231 		case IPPROTO_TCP:
232 			k->src_port = args->f_id.src_port;
233 			k->dst_port = args->f_id.dst_port;
234 			m->m_pkthdr.csum_flags = CSUM_TCP;
235 			tree_out = &nat->rb_tcp_out;
236 			old_port = &L3HDR(struct tcphdr, ip)->th_sport;
237 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
238 			break;
239 		case IPPROTO_UDP:
240 			k->src_port = args->f_id.src_port;
241 			k->dst_port = args->f_id.dst_port;
242 			m->m_pkthdr.csum_flags = CSUM_UDP;
243 			tree_out = &nat->rb_udp_out;
244 			old_port = &L3HDR(struct udphdr, ip)->uh_sport;
245 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
246 			udp = 1;
247 			break;
248 		case IPPROTO_ICMP:
249 			k->src_port = L3HDR(struct icmp, ip)->icmp_id;
250 			k->dst_port = k->src_port;
251 			tree_out = &nat->rb_icmp_out;
252 			old_port = &L3HDR(struct icmp, ip)->icmp_id;
253 			csum = &L3HDR(struct icmp, ip)->icmp_cksum;
254 			break;
255 		default:
256 			panic("ipfw3: unsupported proto %u", ip->ip_p);
257 		}
258 		s = RB_FIND(state_tree, tree_out, k);
259 		if (s == NULL) {
260 			/* pick an alias ip randomly when there are multiple */
261 			if (nat->count > 1) {
262 				rand_n = krandom() % nat->count;
263 			}
264 			LIST_FOREACH(alias, &nat->alias, next) {
265 				if (i++ == rand_n) {
266 					break;
267 				}
268 			}
269 			switch  (ip->ip_p) {
270 			case IPPROTO_TCP:
271 				m->m_pkthdr.csum_flags = CSUM_TCP;
272 				s = kmalloc(LEN_NAT_STATE, M_IPFW3_NAT,
273 						M_INTWAIT | M_NULLOK | M_ZERO);
274 
275 				s->src_addr = args->f_id.src_ip;
276 				s->src_port = args->f_id.src_port;
277 
278 				s->dst_addr = args->f_id.dst_ip;
279 				s->dst_port = args->f_id.dst_port;
280 
281 				s->alias_addr = alias->ip.s_addr;
282 				pick_alias_port(s, tree_out);
283 				dup = RB_INSERT(state_tree, tree_out, s);
284 				need_return_state = TRUE;
285 				break;
286 			case IPPROTO_UDP:
287 				m->m_pkthdr.csum_flags = CSUM_UDP;
288 				s = kmalloc(LEN_NAT_STATE, M_IPFW3_NAT,
289 						M_INTWAIT | M_NULLOK | M_ZERO);
290 
291 				s->src_addr = args->f_id.src_ip;
292 				s->src_port = args->f_id.src_port;
293 
294 				s->dst_addr = args->f_id.dst_ip;
295 				s->dst_port = args->f_id.dst_port;
296 
297 				s->alias_addr = alias->ip.s_addr;
298 				pick_alias_port(s, tree_out);
299 				dup = RB_INSERT(state_tree, tree_out, s);
300 				need_return_state = TRUE;
301 				break;
302 			case IPPROTO_ICMP:
303 				s = kmalloc(LEN_NAT_STATE, M_IPFW3_NAT,
304 						M_INTWAIT | M_NULLOK | M_ZERO);
305 				s->src_addr = args->f_id.src_ip;
306 				s->dst_addr = args->f_id.dst_ip;
307 
308 				s->src_port = *old_port;
309 				s->dst_port = *old_port;
310 
311 				s->alias_addr = alias->ip.s_addr;
312 				s->alias_port = htons(s->src_addr % ALIAS_RANGE);
313 				dup = RB_INSERT(state_tree, tree_out, s);
314 
315 				s2 = kmalloc(LEN_NAT_STATE2, M_IPFW3_NAT,
316 						M_INTWAIT | M_NULLOK | M_ZERO);
317 
318 				s2->src_addr = args->f_id.dst_ip;
319 				s2->dst_addr = alias->ip.s_addr;
320 
321 				s2->src_port = s->alias_port;
322 				s2->dst_port = s->alias_port;
323 
324 				s2->alias_addr = htonl(args->f_id.src_ip);
325 				s2->alias_port = *old_port;
326 
327 				alias->icmp_in[s->alias_port] = s2;
328 				break;
329 			default :
330 				goto oops;
331 			}
332 		}
333 	}
334 	if (args->oif == NULL) {
335 		new_addr.s_addr = s2->src_addr;
336 		new_port = s2->src_port;
337 		s2->timestamp = time_uptime;
338 	} else {
339 		new_addr.s_addr = s->alias_addr;
340 		new_port = s->alias_port;
341 		s->timestamp = time_uptime;
342 	}
343 
344 	/* replace src/dst and fix the checksum */
345 	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
346 		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
347 			dlen = ip->ip_len - (ip->ip_hl << 2);
348 		}
349 		pseudo = TRUE;
350 	}
351 	if (!pseudo) {
352 		const uint16_t *oaddr, *naddr;
353 		oaddr = (const uint16_t *)&old_addr->s_addr;
354 		naddr = (const uint16_t *)&new_addr.s_addr;
355 		ip->ip_sum = fix_cksum(ip->ip_sum, oaddr[0], naddr[0], 0);
356 		ip->ip_sum = fix_cksum(ip->ip_sum, oaddr[1], naddr[1], 0);
357 		if (ip->ip_p != IPPROTO_ICMP) {
358 			*csum = fix_cksum(*csum, oaddr[0], naddr[0], udp);
359 			*csum = fix_cksum(*csum, oaddr[1], naddr[1], udp);
360 		}
361 	}
362 	old_addr->s_addr = new_addr.s_addr;
363 	if (!pseudo) {
364 		*csum = fix_cksum(*csum, *old_port, new_port, udp);
365 	}
366 	*old_port = new_port;
367 
368 	if (pseudo) {
369 		*csum = in_pseudo(ip->ip_src.s_addr,
370 				ip->ip_dst.s_addr, htons(dlen + ip->ip_p));
371 	}
372 
373 	/* prepare the state for return traffic */
374 	if (need_return_state) {
375 		ip->ip_len = htons(ip->ip_len);
376 		ip->ip_off = htons(ip->ip_off);
377 
378 		m->m_flags &= ~M_HASH;
379 		ip_hashfn(&m, 0);
380 
381 		ip->ip_len = ntohs(ip->ip_len);
382 		ip->ip_off = ntohs(ip->ip_off);
383 
384 		int nextcpu = netisr_hashcpu(m->m_pkthdr.hash);
385 		if (nextcpu != mycpuid) {
386 			struct netmsg_nat_state_add *msg;
387 			msg = kmalloc(LEN_NMSG_NAT_STATE_ADD,
388 					M_LWKTMSG, M_NOWAIT | M_ZERO);
389 			netmsg_init(&msg->base, NULL, &curthread->td_msgport,
390 					0, nat_state_add_dispatch);
391 			s2 = kmalloc(LEN_NAT_STATE2, M_IPFW3_NAT,
392 					M_INTWAIT | M_NULLOK | M_ZERO);
393 
394 			s2->src_addr = args->f_id.dst_ip;
395 			s2->src_port = args->f_id.dst_port;
396 
397 			s2->dst_addr = alias->ip.s_addr;
398 			s2->dst_port = s->alias_port;
399 
400 			s2->src_addr = htonl(args->f_id.src_ip);
401 			s2->src_port = htons(args->f_id.src_port);
402 
403 			s2->timestamp = s->timestamp;
404 			msg->alias_addr.s_addr = alias->ip.s_addr;
405 			msg->alias_port = s->alias_port;
406 			msg->state = s2;
407 			msg->nat_id = nat->id;
408 			msg->proto = ip->ip_p;
409 			netisr_sendmsg(&msg->base, nextcpu);
410 		} else {
411 			s2 = kmalloc(LEN_NAT_STATE2, M_IPFW3_NAT,
412 					M_INTWAIT | M_NULLOK | M_ZERO);
413 
414 			s2->src_addr = args->f_id.dst_ip;
415 			s2->dst_addr = alias->ip.s_addr;
416 
417 			s2->src_port = s->alias_port;
418 			s2->dst_port = s->alias_port;
419 
420 			s2->src_addr = htonl(args->f_id.src_ip);
421 			s2->src_port = htons(args->f_id.src_port);
422 
423 			s2->timestamp = s->timestamp;
424 			if (ip->ip_p == IPPROTO_TCP) {
425 				alias->tcp_in[s->alias_port - ALIAS_BEGIN] = s2;
426 			} else {
427 				alias->udp_in[s->alias_port - ALIAS_BEGIN] = s2;
428 			}
429 		}
430 	}
431 	return IP_FW_NAT;
432 oops:
433 	DEBUG1("oops\n");
434 	return IP_FW_DENY;
435 }
436 
437 void
438 pick_alias_port(struct nat_state *s, struct state_tree *tree)
439 {
440 	do {
441 		s->alias_port = htons(krandom() % ALIAS_RANGE + ALIAS_BEGIN);
442 	} while (RB_FIND(state_tree, tree, s) != NULL);
443 }
444 
445 int
446 ip_fw3_nat_state_cmp(struct nat_state *s1, struct nat_state *s2)
447 {
448 	if (s1->src_addr > s2->src_addr)
449 		return 1;
450 	if (s1->src_addr < s2->src_addr)
451 		return -1;
452 
453 	if (s1->dst_addr > s2->dst_addr)
454 		return 1;
455 	if (s1->dst_addr < s2->dst_addr)
456 		return -1;
457 
458 	if (s1->src_port > s2->src_port)
459 		return 1;
460 	if (s1->src_port < s2->src_port)
461 		return -1;
462 
463 	if (s1->dst_port > s2->dst_port)
464 		return 1;
465 	if (s1->dst_port < s2->dst_port)
466 		return -1;
467 
468 	return 0;
469 }
470 
471 int
472 ip_fw3_ctl_nat_get_cfg(struct sockopt *sopt)
473 {
474 	struct ip_fw3_nat_context *nat_ctx;
475 	struct ioc_nat *ioc;
476 	struct cfg_nat *nat;
477 	struct cfg_alias *alias;
478 	struct in_addr *ip;
479 	size_t valsize;
480 	int i, len;
481 
482 	len = 0;
483 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
484 	valsize = sopt->sopt_valsize;
485 	ioc = (struct ioc_nat *)sopt->sopt_val;
486 
487 	for (i = 0; i < NAT_ID_MAX; i++) {
488 		nat = nat_ctx->nats[i];
489 		if (nat != NULL) {
490 			len += LEN_IOC_NAT;
491 			if (len >= valsize) {
492 				goto nospace;
493 			}
494 			ioc->id = nat->id;
495 			ioc->count = nat->count;
496 			ip = &ioc->ip;
497 			LIST_FOREACH(alias, &nat->alias, next) {
498 				len += LEN_IN_ADDR;
499 				if (len > valsize) {
500 					goto nospace;
501 				}
502 				bcopy(&alias->ip, ip, LEN_IN_ADDR);
503 				ip++;
504 			}
505 		}
506 	}
507 	sopt->sopt_valsize = len;
508 	return 0;
509 nospace:
510 	bzero(sopt->sopt_val, sopt->sopt_valsize);
511 	sopt->sopt_valsize = 0;
512 	return 0;
513 }
514 
515 int
516 ip_fw3_ctl_nat_get_record(struct sockopt *sopt)
517 {
518 	struct ip_fw3_nat_context *nat_ctx;
519 	struct cfg_nat *the;
520 	size_t sopt_size, total_len = 0;
521 	struct ioc_nat_state *ioc;
522 	int ioc_nat_id, i, n, cpu;
523 	struct nat_state 	*s;
524 	struct nat_state2 	*s2;
525 	struct cfg_alias	*a1;
526 
527 	ioc_nat_id = *((int *)(sopt->sopt_val));
528 	sopt_size = sopt->sopt_valsize;
529 	ioc = (struct ioc_nat_state *)sopt->sopt_val;
530 	/* icmp states only in CPU 0 */
531 	cpu = 0;
532 	nat_ctx = ip_fw3_nat_ctx[cpu];
533 	for (n = 0; n < NAT_ID_MAX; n++) {
534 		if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
535 			if (nat_ctx->nats[n] == NULL)
536 				break;
537 			the = nat_ctx->nats[n];
538 			RB_FOREACH(s, state_tree, &the->rb_icmp_out) {
539 				total_len += LEN_IOC_NAT_STATE;
540 				if (total_len > sopt_size)
541 					goto nospace;
542 				ioc->src_addr.s_addr = ntohl(s->src_addr);
543 				ioc->dst_addr.s_addr = s->dst_addr;
544 				ioc->alias_addr.s_addr = s->alias_addr;
545 				ioc->src_port = s->src_port;
546 				ioc->dst_port = s->dst_port;
547 				ioc->alias_port = s->alias_port;
548 				ioc->nat_id = n + 1;
549 				ioc->cpu_id = cpu;
550 				ioc->proto = IPPROTO_ICMP;
551 				ioc->direction = 1;
552 				ioc->life = s->timestamp +
553 					sysctl_var_icmp_timeout - time_uptime;
554 				ioc++;
555 			}
556 
557 			LIST_FOREACH(a1, &the->alias, next) {
558 			for (i = 0; i < ALIAS_RANGE; i++) {
559 				s2 = a1->icmp_in[i];
560 				if (s2 == NULL) {
561 					continue;
562 				}
563 
564 				total_len += LEN_IOC_NAT_STATE;
565 				if (total_len > sopt_size)
566 					goto nospace;
567 
568 				ioc->src_addr.s_addr = ntohl(s2->src_addr);
569 				ioc->dst_addr.s_addr = s2->dst_addr;
570 				ioc->alias_addr.s_addr = s2->alias_addr;
571 				ioc->src_port = s2->src_port;
572 				ioc->dst_port = s2->dst_port;
573 				ioc->alias_port = s2->alias_port;
574 				ioc->nat_id = n + 1;
575 				ioc->cpu_id = cpu;
576 				ioc->proto = IPPROTO_ICMP;
577 				ioc->direction = 0;
578 				ioc->life = s2->timestamp +
579 					sysctl_var_icmp_timeout - time_uptime;
580 				ioc++;
581 			}
582 			}
583 		}
584 	}
585 
586 	/* tcp states */
587 	for (cpu = 0; cpu < ncpus; cpu++) {
588 		nat_ctx = ip_fw3_nat_ctx[cpu];
589 		for (n = 0; n < NAT_ID_MAX; n++) {
590 			if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
591 				if (nat_ctx->nats[n] == NULL)
592 					break;
593 				the = nat_ctx->nats[n];
594 				RB_FOREACH(s, state_tree, &the->rb_tcp_out) {
595 					total_len += LEN_IOC_NAT_STATE;
596 					if (total_len > sopt_size)
597 						goto nospace;
598 					ioc->src_addr.s_addr = ntohl(s->src_addr);
599 					ioc->dst_addr.s_addr = ntohl(s->dst_addr);
600 					ioc->alias_addr.s_addr = s->alias_addr;
601 					ioc->src_port = ntohs(s->src_port);
602 					ioc->dst_port = ntohs(s->dst_port);
603 					ioc->alias_port = s->alias_port;
604 					ioc->nat_id = n + 1;
605 					ioc->cpu_id = cpu;
606 					ioc->proto = IPPROTO_TCP;
607 					ioc->direction = 1;
608 					ioc->life = s->timestamp +
609 						sysctl_var_tcp_timeout - time_uptime;
610 					ioc++;
611 				}
612 				LIST_FOREACH(a1, &the->alias, next) {
613 					for (i = 0; i < ALIAS_RANGE; i++) {
614 						s2 = a1->tcp_in[i];
615 						if (s2 == NULL) {
616 							continue;
617 						}
618 
619 						total_len += LEN_IOC_NAT_STATE;
620 						if (total_len > sopt_size)
621 							goto nospace;
622 
623 						ioc->src_addr.s_addr = ntohl(s2->src_addr);
624 						ioc->dst_addr.s_addr = s2->dst_addr;
625 						ioc->alias_addr.s_addr = s2->alias_addr;
626 						ioc->src_port = s2->src_port;
627 						ioc->dst_port = s2->dst_port;
628 						ioc->alias_port = s2->alias_port;
629 						ioc->nat_id = n + 1;
630 						ioc->cpu_id = cpu;
631 						ioc->proto = IPPROTO_TCP;
632 						ioc->direction = 0;
633 						ioc->life = s2->timestamp +
634 							sysctl_var_icmp_timeout - time_uptime;
635 						ioc++;
636 					}
637 				}
638 			}
639 		}
640 	}
641 
642 	/* udp states */
643 	for (cpu = 0; cpu < ncpus; cpu++) {
644 		nat_ctx = ip_fw3_nat_ctx[cpu];
645 		for (n = 0; n < NAT_ID_MAX; n++) {
646 			if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
647 				if (nat_ctx->nats[n] == NULL)
648 					break;
649 				the = nat_ctx->nats[n];
650 				RB_FOREACH(s, state_tree, &the->rb_udp_out) {
651 					total_len += LEN_IOC_NAT_STATE;
652 					if (total_len > sopt_size)
653 						goto nospace;
654 					ioc->src_addr.s_addr = ntohl(s->src_addr);
655 					ioc->dst_addr.s_addr = s->dst_addr;
656 					ioc->alias_addr.s_addr = s->alias_addr;
657 					ioc->src_port = s->src_port;
658 					ioc->dst_port = s->dst_port;
659 					ioc->alias_port = s->alias_port;
660 					ioc->nat_id = n + 1;
661 					ioc->cpu_id = cpu;
662 					ioc->proto = IPPROTO_UDP;
663 					ioc->direction = 1;
664 					ioc->life = s->timestamp +
665 						sysctl_var_udp_timeout - time_uptime;
666 					ioc++;
667 				}
668 				LIST_FOREACH(a1, &the->alias, next) {
669 					for (i = 0; i < ALIAS_RANGE; i++) {
670 						s2 = a1->udp_in[i];
671 						if (s2 == NULL) {
672 							continue;
673 						}
674 
675 						total_len += LEN_IOC_NAT_STATE;
676 						if (total_len > sopt_size)
677 							goto nospace;
678 
679 						ioc->src_addr.s_addr = ntohl(s2->src_addr);
680 						ioc->dst_addr.s_addr = s2->dst_addr;
681 						ioc->alias_addr.s_addr = s2->alias_addr;
682 						ioc->src_port = s2->src_port;
683 						ioc->dst_port = s2->dst_port;
684 						ioc->alias_port = s2->alias_port;
685 						ioc->nat_id = n + 1;
686 						ioc->cpu_id = cpu;
687 						ioc->proto = IPPROTO_UDP;
688 						ioc->direction = 0;
689 						ioc->life = s2->timestamp +
690 							sysctl_var_icmp_timeout - time_uptime;
691 						ioc++;
692 					}
693 				}
694 			}
695 		}
696 	}
697 	sopt->sopt_valsize = total_len;
698 	return 0;
699 nospace:
700 	return 0;
701 }
702 
703 void
704 nat_state_add_dispatch(netmsg_t add_msg)
705 {
706 	struct ip_fw3_nat_context *nat_ctx;
707 	struct netmsg_nat_state_add *msg;
708 	struct cfg_nat *nat;
709 	struct nat_state2 *s2;
710 	struct cfg_alias *alias;
711 
712 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
713 	msg = (struct netmsg_nat_state_add *)add_msg;
714 	nat = nat_ctx->nats[msg->nat_id - 1];
715 
716 	LIST_FOREACH(alias, &nat->alias, next) {
717 		if (alias->ip.s_addr == msg->alias_addr.s_addr) {
718 			break;
719 		}
720 	}
721 	s2 = msg->state;
722 	if (msg->proto == IPPROTO_TCP) {
723 		alias->tcp_in[msg->alias_port - ALIAS_BEGIN] = s2;
724 	} else {
725 		alias->udp_in[msg->alias_port - ALIAS_BEGIN] = s2;
726 	}
727 }
728 
729 /*
730  * Init the RB trees only when the NAT is configured.
731  */
732 void
733 nat_add_dispatch(netmsg_t nat_add_msg)
734 {
735 	struct ip_fw3_nat_context *nat_ctx;
736 	struct netmsg_nat_add *msg;
737 	struct ioc_nat *ioc;
738 	struct cfg_nat *nat;
739 	struct cfg_alias *alias;
740 	struct in_addr *ip;
741 	int n;
742 
743 	msg = (struct netmsg_nat_add *)nat_add_msg;
744 	ioc = &msg->ioc_nat;
745 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
746 
747 	if (nat_ctx->nats[ioc->id - 1] == NULL) {
748 		/* op = set, and nat not exists */
749 		nat = kmalloc(LEN_CFG_NAT, M_IPFW3_NAT, M_WAITOK | M_ZERO);
750 		LIST_INIT(&nat->alias);
751 		RB_INIT(&nat->rb_tcp_out);
752 		RB_INIT(&nat->rb_udp_out);
753 		if (mycpuid == 0) {
754 			RB_INIT(&nat->rb_icmp_out);
755 		}
756 		nat->id = ioc->id;
757 		nat->count = ioc->count;
758 		ip = &ioc->ip;
759 		for (n = 0; n < ioc->count; n++) {
760 			alias = kmalloc(LEN_CFG_ALIAS,
761 					M_IPFW3_NAT, M_WAITOK | M_ZERO);
762 			memcpy(&alias->ip, ip, LEN_IN_ADDR);
763 			LIST_INSERT_HEAD((&nat->alias), alias, next);
764 			ip++;
765 		}
766 		nat_ctx->nats[ioc->id - 1] = nat;
767 	}
768 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
769 }
770 
771 int
772 ip_fw3_ctl_nat_add(struct sockopt *sopt)
773 {
774 	struct netmsg_nat_add nat_add_msg, *msg;
775 	struct ioc_nat *ioc;
776 	msg = &nat_add_msg;
777 
778 	ioc = (struct ioc_nat *)(sopt->sopt_val);
779 	sooptcopyin(sopt, &msg->ioc_nat, sopt->sopt_valsize,
780 			sizeof(struct ioc_nat));
781 	netmsg_init(&msg->base, NULL, &curthread->td_msgport, 0,
782 			nat_add_dispatch);
783 	netisr_domsg(&msg->base, 0);
784 	return 0;
785 }
786 
787 void
788 nat_del_dispatch(netmsg_t nat_del_msg)
789 {
790 	struct ip_fw3_nat_context *nat_ctx;
791 	struct netmsg_nat_del *msg;
792 	struct cfg_nat *nat;
793 	struct nat_state *s, *tmp;
794 	struct cfg_alias *alias, *tmp3;
795 
796 	msg = (struct netmsg_nat_del *)nat_del_msg;
797 
798 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
799 	nat = nat_ctx->nats[msg->id - 1];
800 	if (nat != NULL) {
801 		/* the icmp states will only stored in cpu 0 */
802 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_out, tmp) {
803 			RB_REMOVE(state_tree, &nat->rb_icmp_out, s);
804 			if (s != NULL) {
805 				kfree(s, M_IPFW3_NAT);
806 			}
807 		}
808 		/*
809 		LIST_FOREACH_MUTABLE(s2, &nat->alias->icmp_in, next, tmp2) {
810 			LIST_REMOVE(s2, next);
811 			if (s != NULL) {
812 				kfree(s, M_IPFW3_NAT);
813 			}
814 		}
815 		*/
816 
817 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_out, tmp) {
818 			RB_REMOVE(state_tree, &nat->rb_tcp_out, s);
819 			if (s != NULL) {
820 				kfree(s, M_IPFW3_NAT);
821 			}
822 		}
823 		/*
824 		LIST_FOREACH_MUTABLE(s2, &nat->alias->tcp_in, next, tmp2) {
825 			LIST_REMOVE(s2, next);
826 			if (s != NULL) {
827 				kfree(s, M_IPFW3_NAT);
828 			}
829 		}
830 		*/
831 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_out, tmp) {
832 			RB_REMOVE(state_tree, &nat->rb_udp_out, s);
833 			if (s != NULL) {
834 				kfree(s, M_IPFW3_NAT);
835 			}
836 		}
837 		/*
838 		LIST_FOREACH_MUTABLE(s2, &nat->alias->udp_in, next, tmp2) {
839 			LIST_REMOVE(s2, next);
840 			if (s != NULL) {
841 				kfree(s, M_IPFW3_NAT);
842 			}
843 		}
844 		*/
845 		LIST_FOREACH_MUTABLE(alias, &nat->alias, next, tmp3) {
846 			kfree(alias, M_IPFW3_NAT);
847 		}
848 		kfree(nat, M_IPFW3_NAT);
849 		nat_ctx->nats[msg->id - 1] = NULL;
850 	}
851 	netisr_forwardmsg_all(&nat_del_msg->base, mycpuid + 1);
852 }
853 int
854 ip_fw3_ctl_nat_del(struct sockopt *sopt)
855 {
856 	struct netmsg_nat_del nat_del_msg, *msg;
857 
858 	msg = &nat_del_msg;
859 	msg->id = *((int *)sopt->sopt_val);
860 	netmsg_init(&msg->base, NULL, &curthread->td_msgport,
861 			0, nat_del_dispatch);
862 
863 	netisr_domsg(&msg->base, 0);
864 	return 0;
865 }
866 int
867 ip_fw3_ctl_nat_flush(struct sockopt *sopt)
868 {
869 	struct netmsg_nat_del nat_del_msg, *msg;
870 	int i;
871 	msg = &nat_del_msg;
872 	for (i = 0; i < NAT_ID_MAX; i++) {
873 		msg->id = i + 1;
874 		netmsg_init(&msg->base, NULL, &curthread->td_msgport,
875 				0, nat_del_dispatch);
876 
877 		netisr_domsg(&msg->base, 0);
878 	}
879 	return 0;
880 }
881 
882 int
883 ip_fw3_ctl_nat_sockopt(struct sockopt *sopt)
884 {
885 	int error = 0;
886 	switch (sopt->sopt_name) {
887 	case IP_FW_NAT_ADD:
888 		error = ip_fw3_ctl_nat_add(sopt);
889 		break;
890 	case IP_FW_NAT_DEL:
891 		error = ip_fw3_ctl_nat_del(sopt);
892 		break;
893 	case IP_FW_NAT_FLUSH:
894 		error = ip_fw3_ctl_nat_flush(sopt);
895 		break;
896 	case IP_FW_NAT_GET:
897 		error = ip_fw3_ctl_nat_get_cfg(sopt);
898 		break;
899 	case IP_FW_NAT_GET_RECORD:
900 		error = ip_fw3_ctl_nat_get_record(sopt);
901 		break;
902 	default:
903 		kprintf("ipfw3 nat invalid socket option %d\n",
904 				sopt->sopt_name);
905 	}
906 	return error;
907 }
908 
909 void
910 nat_init_ctx_dispatch(netmsg_t msg)
911 {
912 	struct ip_fw3_nat_context *tmp;
913 	tmp = kmalloc(sizeof(struct ip_fw3_nat_context),
914 				M_IPFW3_NAT, M_WAITOK | M_ZERO);
915 
916 	ip_fw3_nat_ctx[mycpuid] = tmp;
917 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
918 }
919 
920 void
921 nat_fnit_ctx_dispatch(netmsg_t msg)
922 {
923 	kfree(ip_fw3_nat_ctx[mycpuid], M_IPFW3_NAT);
924 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
925 }
926 
927 static void
928 nat_cleanup_func_dispatch(netmsg_t nmsg)
929 {
930 	struct nat_state *s, *tmp;
931 	struct ip_fw3_nat_context *nat_ctx;
932 	struct cfg_nat *nat;
933 	struct cfg_alias *a1, *tmp2;
934 	struct nat_state2 *s2;
935 	int i, j;
936 
937 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
938 	for (j = 0; j < NAT_ID_MAX; j++) {
939 		nat = nat_ctx->nats[j];
940 		if (nat == NULL)
941 			continue;
942 		/* check the nat_states, remove the expired state */
943 		/* the icmp states will only stored in cpu 0 */
944 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_out, tmp) {
945 			if (time_uptime - s->timestamp > sysctl_var_icmp_timeout) {
946 				RB_REMOVE(state_tree, &nat->rb_icmp_out, s);
947 				kfree(s, M_IPFW3_NAT);
948 			}
949 		}
950 		LIST_FOREACH_MUTABLE(a1, &nat->alias, next, tmp2) {
951 			for (i = 0; i < ALIAS_RANGE; i++) {
952 				s2 = a1->icmp_in[i];
953 				if (s2 != NULL) {
954 					if (time_uptime - s2->timestamp > sysctl_var_icmp_timeout) {
955 						a1->icmp_in[i] = NULL;
956 						kfree(s2, M_IPFW3_NAT);
957 					}
958 				}
959 
960 			}
961 		}
962 
963 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_out, tmp) {
964 			if (time_uptime - s->timestamp > sysctl_var_tcp_timeout) {
965 				RB_REMOVE(state_tree, &nat->rb_tcp_out, s);
966 				kfree(s, M_IPFW3_NAT);
967 			}
968 		}
969 		LIST_FOREACH_MUTABLE(a1, &nat->alias, next, tmp2) {
970 			for (i = 0; i < ALIAS_RANGE; i++) {
971 				s2 = a1->tcp_in[i];
972 				if (s2 != NULL) {
973 					if (time_uptime - s2->timestamp > sysctl_var_icmp_timeout) {
974 						a1->tcp_in[i] = NULL;
975 						kfree(s2, M_IPFW3_NAT);
976 					}
977 				}
978 
979 			}
980 		}
981 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_out, tmp) {
982 			if (time_uptime - s->timestamp > sysctl_var_udp_timeout) {
983 				RB_REMOVE(state_tree, &nat->rb_udp_out, s);
984 				kfree(s, M_IPFW3_NAT);
985 			}
986 		}
987 		LIST_FOREACH_MUTABLE(a1, &nat->alias, next, tmp2) {
988 			for (i = 0; i < ALIAS_RANGE; i++) {
989 				s2 = a1->udp_in[i];
990 				if (s2 != NULL) {
991 					if (time_uptime - s2->timestamp > sysctl_var_icmp_timeout) {
992 						a1->udp_in[i] = NULL;
993 						kfree(s2, M_IPFW3_NAT);
994 					}
995 				}
996 
997 			}
998 		}
999 	}
1000 	netisr_forwardmsg_all(&nmsg->base, mycpuid + 1);
1001 }
1002 
1003 static void
1004 ip_fw3_nat_cleanup_func(void *dummy __unused)
1005 {
1006 	struct netmsg_base msg;
1007 	netmsg_init(&msg, NULL, &curthread->td_msgport, 0,
1008 			nat_cleanup_func_dispatch);
1009 	netisr_domsg(&msg, 0);
1010 
1011 	callout_reset(&ip_fw3_nat_cleanup_callout,
1012 			sysctl_var_cleanup_interval * hz,
1013 			ip_fw3_nat_cleanup_func, NULL);
1014 }
1015 
1016 static
1017 int ip_fw3_nat_init(void)
1018 {
1019 	struct netmsg_base msg;
1020 	ip_fw3_register_module(MODULE_NAT_ID, MODULE_NAT_NAME);
1021 	ip_fw3_register_filter_funcs(MODULE_NAT_ID, O_NAT_NAT,
1022 			(filter_func)check_nat);
1023 	ip_fw3_ctl_nat_ptr = ip_fw3_ctl_nat_sockopt;
1024 	netmsg_init(&msg, NULL, &curthread->td_msgport,
1025 			0, nat_init_ctx_dispatch);
1026 	netisr_domsg(&msg, 0);
1027 
1028 	callout_init_mp(&ip_fw3_nat_cleanup_callout);
1029 	callout_reset(&ip_fw3_nat_cleanup_callout,
1030 			sysctl_var_cleanup_interval * hz,
1031 			ip_fw3_nat_cleanup_func,
1032 			NULL);
1033 	return 0;
1034 }
1035 
1036 static int
1037 ip_fw3_nat_fini(void)
1038 {
1039 	struct netmsg_base msg;
1040 	struct netmsg_nat_del nat_del_msg, *msg1;
1041 	int i;
1042 
1043 	callout_stop(&ip_fw3_nat_cleanup_callout);
1044 
1045 	msg1 = &nat_del_msg;
1046 	for (i = 0; i < NAT_ID_MAX; i++) {
1047 		msg1->id = i + 1;
1048 		netmsg_init(&msg1->base, NULL, &curthread->td_msgport,
1049 				0, nat_del_dispatch);
1050 
1051 		netisr_domsg(&msg1->base, 0);
1052 	}
1053 
1054 	netmsg_init(&msg, NULL, &curthread->td_msgport,
1055 			0, nat_fnit_ctx_dispatch);
1056 	netisr_domsg(&msg, 0);
1057 
1058 	return ip_fw3_unregister_module(MODULE_NAT_ID);
1059 }
1060 
1061 static int
1062 ip_fw3_nat_modevent(module_t mod, int type, void *data)
1063 {
1064 	switch (type) {
1065 	case MOD_LOAD:
1066 		return ip_fw3_nat_init();
1067 	case MOD_UNLOAD:
1068 		return ip_fw3_nat_fini();
1069 	default:
1070 		break;
1071 	}
1072 	return 0;
1073 }
1074 
1075 moduledata_t ip_fw3_nat_mod = {
1076 	"ipfw3_nat",
1077 	ip_fw3_nat_modevent,
1078 	NULL
1079 };
1080 
1081 DECLARE_MODULE(ipfw3_nat, ip_fw3_nat_mod,
1082 		SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
1083 MODULE_DEPEND(ipfw3_nat, ipfw3_basic, 1, 1, 1);
1084 MODULE_VERSION(ipfw3_nat, 1);
1085