xref: /dragonfly/sys/net/ipfw3_nat/ip_fw3_nat.c (revision 70344474)
1 /*
2  * Copyright (c) 2014 - 2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Bill Yuan <bycn82@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/param.h>
36 #include <sys/kernel.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/socketvar.h>
40 #include <sys/sysctl.h>
41 #include <sys/systimer.h>
42 #include <sys/thread2.h>
43 #include <sys/in_cksum.h>
44 #include <sys/systm.h>
45 #include <sys/proc.h>
46 #include <sys/socket.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/lock.h>
50 #include <sys/mplock2.h>
51 
52 #include <net/ethernet.h>
53 #include <net/netmsg2.h>
54 #include <net/netisr2.h>
55 #include <net/route.h>
56 #include <net/if.h>
57 
58 #include <netinet/in.h>
59 #include <netinet/ip.h>
60 #include <netinet/ip_icmp.h>
61 #include <netinet/tcp.h>
62 #include <netinet/tcp_timer.h>
63 #include <netinet/tcp_var.h>
64 #include <netinet/tcpip.h>
65 #include <netinet/udp.h>
66 #include <netinet/udp_var.h>
67 #include <netinet/in_systm.h>
68 #include <netinet/in_var.h>
69 #include <netinet/in_pcb.h>
70 #include <netinet/ip_var.h>
71 #include <netinet/ip_divert.h>
72 #include <net/ipfw3/ip_fw.h>
73 
74 #include "ip_fw3_nat.h"
75 
/*
 * Lockless Kernel NAT
 *
 * The `src` is replaced by `alias` when a packet leaves the system.
 * Hence, the packet goes from `src` to `dst` before being translated, and
 * from `alias` to `dst` after being translated.
 *
 * The state for an outgoing packet is stored in the nat_context of the
 * current CPU. But due to the nature of NAT, the returning packet may be
 * handled by another CPU. Hence, a state for the returning packet is
 * prepared and stored into the nat_context of the right CPU.
 */
88 
89 struct ip_fw3_nat_context	*ip_fw3_nat_ctx[MAXCPU];
90 static struct callout 		ip_fw3_nat_cleanup_callout;
91 extern struct ipfw_context 	*ipfw_ctx[MAXCPU];
92 extern ip_fw_ctl_t 		*ipfw_ctl_nat_ptr;
93 
94 static int 			sysctl_var_cleanup_interval = 1;
95 static int 			sysctl_var_icmp_timeout = 10;
96 static int 			sysctl_var_tcp_timeout = 60;
97 static int 			sysctl_var_udp_timeout = 30;
98 
99 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw3_nat, CTLFLAG_RW, 0, "ipfw3 NAT");
100 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, cleanup_interval, CTLFLAG_RW,
101 		&sysctl_var_cleanup_interval, 0, "default life time");
102 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, icmp_timeout, CTLFLAG_RW,
103 		&sysctl_var_icmp_timeout, 0, "default icmp state life time");
104 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, tcp_timeout, CTLFLAG_RW,
105 		&sysctl_var_tcp_timeout, 0, "default tcp state life time");
106 SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, udp_timeout, CTLFLAG_RW,
107 		&sysctl_var_udp_timeout, 0, "default udp state life time");
108 
109 RB_PROTOTYPE(state_tree, nat_state, entries, nat_state_cmp);
110 RB_GENERATE(state_tree, nat_state, entries, nat_state_cmp);
111 
/*
 * Incrementally adjust a 16-bit Internet checksum after one 16-bit word of
 * the covered data changed from old_info to new_info.
 *
 * For UDP (is_udp != 0) a stored checksum of zero means "no checksum" and
 * is returned untouched; a computed result of zero is emitted as 0xFFFF
 * so that it is not mistaken for "no checksum".
 */
static __inline uint16_t
fix_cksum(uint16_t cksum, uint16_t old_info, uint16_t new_info, uint8_t is_udp)
{
	uint32_t sum;

	if (is_udp && cksum == 0)
		return (0x0000);

	/*
	 * The subtraction may wrap below zero; folding the upper half into
	 * the lower half and truncating to 16 bits compensates for that.
	 */
	sum = (uint32_t)cksum + old_info - new_info;
	sum = ((sum >> 16) + (sum & 0xFFFF)) & 0xFFFF;

	if (is_udp && sum == 0)
		return (0xFFFF);
	return (uint16_t)sum;
}
126 
127 void
128 check_nat(int *cmd_ctl, int *cmd_val, struct ip_fw_args **args,
129 		struct ip_fw **f, ipfw_insn *cmd, uint16_t ip_len)
130 {
131 	if ((*args)->eh != NULL) {
132 		*cmd_ctl = IP_FW_CTL_NO;
133 		*cmd_val = IP_FW_NOT_MATCH;
134 		return;
135 	}
136 
137 	struct ip_fw3_nat_context *nat_ctx;
138 	struct cfg_nat *nat;
139 	int nat_id;
140 
141 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
142 	(*args)->rule = *f;
143 	nat = ((ipfw_insn_nat *)cmd)->nat;
144 	if (nat == NULL) {
145 		nat_id = cmd->arg1;
146 		nat = nat_ctx->nats[nat_id - 1];
147 		if (nat == NULL) {
148 			*cmd_val = IP_FW_DENY;
149 			*cmd_ctl = IP_FW_CTL_DONE;
150 			return;
151 		}
152 		((ipfw_insn_nat *)cmd)->nat = nat;
153 	}
154 	*cmd_val = ip_fw3_nat(*args, nat, (*args)->m);
155 	*cmd_ctl = IP_FW_CTL_NAT;
156 }
157 
158 int
159 ip_fw3_nat(struct ip_fw_args *args, struct cfg_nat *nat, struct mbuf *m)
160 {
161 	struct state_tree *tree_in = NULL, *tree_out = NULL;
162 	struct nat_state *s, *s2, *dup, *k, key;
163 	struct ip *ip = mtod(m, struct ip *);
164 	struct in_addr *old_addr = NULL, new_addr;
165 	uint16_t *old_port = NULL, new_port;
166 	uint16_t *csum = NULL, dlen = 0;
167 	uint8_t udp = 0;
168 	boolean_t pseudo = FALSE, need_return_state = FALSE;
169 	struct cfg_alias *alias;
170 	int i = 0, rand_n = 0;
171 
172 	k = &key;
173 	memset(k, 0, LEN_NAT_STATE);
174 	if (args->oif == NULL) {
175 		old_addr = &ip->ip_dst;
176 		k->src_addr = args->f_id.src_ip;
177 		k->dst_addr = ntohl(args->f_id.dst_ip);
178 		switch (ip->ip_p) {
179 		case IPPROTO_TCP:
180 			k->src_port = args->f_id.src_port;
181 			k->dst_port = ntohs(args->f_id.dst_port);
182 			tree_in = &nat->rb_tcp_in;
183 			old_port = &L3HDR(struct tcphdr, ip)->th_dport;
184 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
185 			break;
186 		case IPPROTO_UDP:
187 			k->src_port = args->f_id.src_port;
188 			k->dst_port = ntohs(args->f_id.dst_port);
189 			tree_in = &nat->rb_udp_in;
190 			old_port = &L3HDR(struct udphdr, ip)->uh_dport;
191 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
192 			udp = 1;
193 			break;
194 		case IPPROTO_ICMP:
195 			k->src_port = L3HDR(struct icmp, ip)->icmp_id;;
196 			k->dst_port = L3HDR(struct icmp, ip)->icmp_id;;
197 			tree_in = &nat->rb_icmp_in;
198 			old_port = &L3HDR(struct icmp, ip)->icmp_id;
199 			csum = &L3HDR(struct icmp, ip)->icmp_cksum;
200 			break;
201 		default:
202 			panic("ipfw3: unsupported proto %u", ip->ip_p);
203 		}
204 		s = RB_FIND(state_tree, tree_in, k);
205 		if (s == NULL) {
206 			goto oops;
207 		}
208 	} else {
209 		old_addr = &ip->ip_src;
210 		k->src_addr = args->f_id.src_ip;
211 		k->dst_addr = args->f_id.dst_ip;
212 		switch (ip->ip_p) {
213 		case IPPROTO_TCP:
214 			k->src_port = args->f_id.src_port;
215 			k->dst_port = args->f_id.dst_port;
216 			m->m_pkthdr.csum_flags = CSUM_TCP;
217 			tree_out = &nat->rb_tcp_out;
218 			old_port = &L3HDR(struct tcphdr, ip)->th_sport;
219 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
220 			break;
221 		case IPPROTO_UDP:
222 			k->src_port = args->f_id.src_port;
223 			k->dst_port = args->f_id.dst_port;
224 			m->m_pkthdr.csum_flags = CSUM_UDP;
225 			tree_out = &nat->rb_udp_out;
226 			old_port = &L3HDR(struct udphdr, ip)->uh_sport;
227 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
228 			udp = 1;
229 			break;
230 		case IPPROTO_ICMP:
231 			k->src_port = L3HDR(struct icmp, ip)->icmp_id;
232 			k->dst_port = k->src_port;
233 			tree_out = &nat->rb_icmp_out;
234 			old_port = &L3HDR(struct icmp, ip)->icmp_id;
235 			csum = &L3HDR(struct icmp, ip)->icmp_cksum;
236 			break;
237 		default:
238 			panic("ipfw3: unsupported proto %u", ip->ip_p);
239 		}
240 		s = RB_FIND(state_tree, tree_out, k);
241 		if (s == NULL) {
242 			/* pick an alias ip randomly when there are multiple */
243 			if (nat->count > 1) {
244 				rand_n = krandom() % nat->count;
245 			}
246 			LIST_FOREACH(alias, &nat->alias, next) {
247 				if (i++ == rand_n) {
248 					break;
249 				}
250 			}
251 			switch  (ip->ip_p) {
252 			case IPPROTO_TCP:
253 				m->m_pkthdr.csum_flags = CSUM_TCP;
254 				s = kmalloc(LEN_NAT_STATE, M_IP_FW3_NAT,
255 						M_INTWAIT | M_NULLOK | M_ZERO);
256 
257 				s->src_addr = args->f_id.src_ip;
258 				s->src_port = args->f_id.src_port;
259 
260 				s->dst_addr = args->f_id.dst_ip;
261 				s->dst_port = args->f_id.dst_port;
262 
263 				s->alias_addr = alias->ip.s_addr;
264 				pick_alias_port(s, tree_out);
265 				dup = RB_INSERT(state_tree, tree_out, s);
266 				need_return_state = TRUE;
267 				break;
268 			case IPPROTO_UDP:
269 				m->m_pkthdr.csum_flags = CSUM_UDP;
270 				s = kmalloc(LEN_NAT_STATE, M_IP_FW3_NAT,
271 						M_INTWAIT | M_NULLOK | M_ZERO);
272 
273 				s->src_addr = args->f_id.src_ip;
274 				s->src_port = args->f_id.src_port;
275 
276 				s->dst_addr = args->f_id.dst_ip;
277 				s->dst_port = args->f_id.dst_port;
278 
279 				s->alias_addr = alias->ip.s_addr;
280 				pick_alias_port(s, tree_out);
281 				dup = RB_INSERT(state_tree, tree_out, s);
282 				need_return_state = TRUE;
283 				break;
284 			case IPPROTO_ICMP:
285 				s = kmalloc(LEN_NAT_STATE, M_IP_FW3_NAT,
286 						M_INTWAIT | M_NULLOK | M_ZERO);
287 				s->src_addr = args->f_id.src_ip;
288 				s->dst_addr = args->f_id.dst_ip;
289 
290 				s->src_port = *old_port;
291 				s->dst_port = *old_port;
292 
293 				s->alias_addr = alias->ip.s_addr;
294 				s->alias_port = htons(s->src_addr % ALIAS_RANGE);
295 				dup = RB_INSERT(state_tree, tree_out, s);
296 
297 				s2 = kmalloc(LEN_NAT_STATE, M_IP_FW3_NAT,
298 						M_INTWAIT | M_NULLOK | M_ZERO);
299 
300 				s2->src_addr = args->f_id.dst_ip;
301 				s2->dst_addr = alias->ip.s_addr;
302 
303 				s2->src_port = s->alias_port;
304 				s2->dst_port = s->alias_port;
305 
306 				s2->alias_addr = htonl(args->f_id.src_ip);
307 				s2->alias_port = *old_port;
308 				tree_in = &nat->rb_icmp_in;
309 				dup = RB_INSERT(state_tree, tree_in, s2);
310 				break;
311 			default :
312 				goto oops;
313 			}
314 		}
315 	}
316 	new_addr.s_addr = s->alias_addr;
317 	new_port = s->alias_port;
318 	s->timestamp = time_uptime;
319 
320 	/* replace src/dst and fix the checksum */
321 	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
322 		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
323 			dlen = ip->ip_len - (ip->ip_hl << 2);
324 		}
325 		pseudo = TRUE;
326 	}
327 	if (!pseudo) {
328 		const uint16_t *oaddr, *naddr;
329 		oaddr = (const uint16_t *)&old_addr->s_addr;
330 		naddr = (const uint16_t *)&new_addr.s_addr;
331 		ip->ip_sum = fix_cksum(ip->ip_sum, oaddr[0], naddr[0], 0);
332 		ip->ip_sum = fix_cksum(ip->ip_sum, oaddr[1], naddr[1], 0);
333 		if (ip->ip_p != IPPROTO_ICMP) {
334 			*csum = fix_cksum(*csum, oaddr[0], naddr[0], udp);
335 			*csum = fix_cksum(*csum, oaddr[1], naddr[1], udp);
336 		}
337 	}
338 	old_addr->s_addr = new_addr.s_addr;
339 	if (!pseudo) {
340 		*csum = fix_cksum(*csum, *old_port, new_port, udp);
341 	}
342 	*old_port = new_port;
343 
344 	if (pseudo) {
345 		*csum = in_pseudo(ip->ip_src.s_addr,
346 				ip->ip_dst.s_addr, htons(dlen + ip->ip_p));
347 	}
348 
349 	/* prepare the state for return traffic */
350 	if (need_return_state) {
351 		ip->ip_len = htons(ip->ip_len);
352 		ip->ip_off = htons(ip->ip_off);
353 
354 		m->m_flags &= ~M_HASH;
355 		ip_hashfn(&m, 0);
356 
357 		ip->ip_len = ntohs(ip->ip_len);
358 		ip->ip_off = ntohs(ip->ip_off);
359 
360 		int nextcpu = netisr_hashcpu(m->m_pkthdr.hash);
361 		if (nextcpu != mycpuid) {
362 			struct netmsg_nat_state_add *msg;
363 			msg = kmalloc(LEN_NMSG_NAT_STATE_ADD,
364 					M_LWKTMSG, M_NOWAIT | M_ZERO);
365 			netmsg_init(&msg->base, NULL, &curthread->td_msgport,
366 					0, nat_state_add_dispatch);
367 			s2 = kmalloc(LEN_NAT_STATE, M_IP_FW3_NAT,
368 					M_INTWAIT | M_NULLOK | M_ZERO);
369 
370 			s2->src_addr = args->f_id.dst_ip;
371 			s2->src_port = s->dst_port;
372 
373 			s2->dst_addr = alias->ip.s_addr;
374 			s2->dst_port = s->alias_port;
375 
376 			s2->alias_addr = htonl(args->f_id.src_ip);
377 			s2->alias_port = htons(args->f_id.src_port);
378 
379 			s2->timestamp = s->timestamp;
380 
381 			msg->state = s2;
382 			msg->nat_id = nat->id;
383 			msg->proto = ip->ip_p;
384 			netisr_sendmsg(&msg->base, nextcpu);
385 		} else {
386 			s2 = kmalloc(LEN_NAT_STATE, M_IP_FW3_NAT,
387 					M_INTWAIT | M_NULLOK | M_ZERO);
388 
389 			s2->src_addr = args->f_id.dst_ip;
390 			s2->src_port = s->dst_port;
391 
392 			s2->dst_addr = alias->ip.s_addr;
393 			s2->dst_port = s->alias_port;
394 
395 			s2->alias_addr = htonl(args->f_id.src_ip);
396 			s2->alias_port = htons(args->f_id.src_port);
397 
398 			s2->timestamp = s->timestamp;
399 			if (ip->ip_p == IPPROTO_TCP) {
400 				tree_in = &nat->rb_tcp_in;
401 			} else {
402 				tree_in = &nat->rb_udp_in;
403 			}
404 			dup = RB_INSERT(state_tree, tree_in, s2);
405 		}
406 	}
407 	return IP_FW_NAT;
408 oops:
409 	return IP_FW_DENY;
410 }
411 
412 void
413 pick_alias_port(struct nat_state *s, struct state_tree *tree)
414 {
415 	do {
416 		s->alias_port = htons(krandom() % ALIAS_RANGE + ALIAS_BEGIN);
417 	} while (RB_FIND(state_tree, tree, s) != NULL);
418 }
419 
420 int
421 nat_state_cmp(struct nat_state *s1, struct nat_state *s2)
422 {
423 	if (s1->src_addr > s2->src_addr)
424 		return 1;
425 	if (s1->src_addr < s2->src_addr)
426 		return -1;
427 
428 	if (s1->dst_addr > s2->dst_addr)
429 		return 1;
430 	if (s1->dst_addr < s2->dst_addr)
431 		return -1;
432 
433 	if (s1->src_port > s2->src_port)
434 		return 1;
435 	if (s1->src_port < s2->src_port)
436 		return -1;
437 
438 	if (s1->dst_port > s2->dst_port)
439 		return 1;
440 	if (s1->dst_port < s2->dst_port)
441 		return -1;
442 
443 	return 0;
444 }
445 
446 int
447 ip_fw3_ctl_nat_get_cfg(struct sockopt *sopt)
448 {
449 	struct ip_fw3_nat_context *nat_ctx;
450 	struct ioc_nat *ioc;
451 	struct cfg_nat *nat;
452 	struct cfg_alias *alias;
453 	struct in_addr *ip;
454 	size_t valsize;
455 	int i, len;
456 
457 	len = 0;
458 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
459 	valsize = sopt->sopt_valsize;
460 	ioc = (struct ioc_nat *)sopt->sopt_val;
461 
462 	for (i = 0; i < NAT_ID_MAX; i++) {
463 		nat = nat_ctx->nats[i];
464 		if (nat != NULL) {
465 			len += LEN_IOC_NAT;
466 			if (len >= valsize) {
467 				goto nospace;
468 			}
469 			ioc->id = nat->id;
470 			ioc->count = nat->count;
471 			ip = &ioc->ip;
472 			LIST_FOREACH(alias, &nat->alias, next) {
473 				len += LEN_IN_ADDR;
474 				if (len > valsize) {
475 					goto nospace;
476 				}
477 				bcopy(&alias->ip, ip, LEN_IN_ADDR);
478 				ip++;
479 			}
480 		}
481 	}
482 	sopt->sopt_valsize = len;
483 	return 0;
484 nospace:
485 	bzero(sopt->sopt_val, sopt->sopt_valsize);
486 	sopt->sopt_valsize = 0;
487 	return 0;
488 }
489 
490 int
491 ip_fw3_ctl_nat_get_record(struct sockopt *sopt)
492 {
493 	struct ip_fw3_nat_context *nat_ctx;
494 	struct cfg_nat *the;
495 	size_t sopt_size, total_len = 0;
496 	struct ioc_nat_state *ioc;
497 	int ioc_nat_id, n, cpu;
498 	struct nat_state *s;
499 
500 	ioc_nat_id = *((int *)(sopt->sopt_val));
501 	sopt_size = sopt->sopt_valsize;
502 	ioc = (struct ioc_nat_state *)sopt->sopt_val;
503 	/* icmp states only in CPU 0 */
504 	cpu = 0;
505 	nat_ctx = ip_fw3_nat_ctx[cpu];
506 	for (n = 0; n < NAT_ID_MAX; n++) {
507 		if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
508 			if (nat_ctx->nats[n] == NULL)
509 				break;
510 			the = nat_ctx->nats[n];
511 			RB_FOREACH(s, state_tree, &the->rb_icmp_out) {
512 				total_len += LEN_IOC_NAT_STATE;
513 				if (total_len > sopt_size)
514 					goto nospace;
515 				ioc->src_addr.s_addr = ntohl(s->src_addr);
516 				ioc->dst_addr.s_addr = s->dst_addr;
517 				ioc->alias_addr.s_addr = s->alias_addr;
518 				ioc->src_port = s->src_port;
519 				ioc->dst_port = s->dst_port;
520 				ioc->alias_port = s->alias_port;
521 				ioc->nat_id = n + 1;
522 				ioc->cpu_id = cpu;
523 				ioc->proto = IPPROTO_ICMP;
524 				ioc->direction = 1;
525 				ioc->life = s->timestamp +
526 					sysctl_var_icmp_timeout - time_uptime;
527 				ioc++;
528 			}
529 			RB_FOREACH(s, state_tree, &the->rb_icmp_in) {
530 				total_len += LEN_IOC_NAT_STATE;
531 				if (total_len > sopt_size)
532 					goto nospace;
533 				ioc->src_addr.s_addr = ntohl(s->src_addr);
534 				ioc->dst_addr.s_addr = s->dst_addr;
535 				ioc->alias_addr.s_addr = s->alias_addr;
536 				ioc->src_port = s->src_port;
537 				ioc->dst_port = s->dst_port;
538 				ioc->alias_port = s->alias_port;
539 				ioc->nat_id = n + 1;
540 				ioc->cpu_id = cpu;
541 				ioc->proto = IPPROTO_ICMP;
542 				ioc->direction = 0;
543 				ioc->life = s->timestamp +
544 					sysctl_var_icmp_timeout - time_uptime;
545 				ioc++;
546 			}
547 		}
548 	}
549 
550 	/* tcp states */
551 	for (cpu = 0; cpu < ncpus; cpu++) {
552 		nat_ctx = ip_fw3_nat_ctx[cpu];
553 		for (n = 0; n < NAT_ID_MAX; n++) {
554 			if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
555 				if (nat_ctx->nats[n] == NULL)
556 					break;
557 				the = nat_ctx->nats[n];
558 				RB_FOREACH(s, state_tree, &the->rb_tcp_out) {
559 					total_len += LEN_IOC_NAT_STATE;
560 					if (total_len > sopt_size)
561 						goto nospace;
562 					ioc->src_addr.s_addr = ntohl(s->src_addr);
563 					ioc->dst_addr.s_addr = ntohl(s->dst_addr);
564 					ioc->alias_addr.s_addr = s->alias_addr;
565 					ioc->src_port = ntohs(s->src_port);
566 					ioc->dst_port = ntohs(s->dst_port);
567 					ioc->alias_port = s->alias_port;
568 					ioc->nat_id = n + 1;
569 					ioc->cpu_id = cpu;
570 					ioc->proto = IPPROTO_TCP;
571 					ioc->direction = 1;
572 					ioc->life = s->timestamp +
573 						sysctl_var_tcp_timeout - time_uptime;
574 					ioc++;
575 				}
576 				RB_FOREACH(s, state_tree, &the->rb_tcp_in) {
577 					total_len += LEN_IOC_NAT_STATE;
578 					if (total_len > sopt_size)
579 						goto nospace;
580 					ioc->src_addr.s_addr = ntohl(s->src_addr);
581 					ioc->dst_addr.s_addr = s->dst_addr;
582 					ioc->alias_addr.s_addr = s->alias_addr;
583 					ioc->src_port = ntohs(s->src_port);
584 					ioc->dst_port = s->dst_port;
585 					ioc->alias_port = s->alias_port;
586 					ioc->nat_id = n + 1;
587 					ioc->cpu_id = cpu;
588 					ioc->proto = IPPROTO_TCP;
589 					ioc->direction = 0;
590 					ioc->life = s->timestamp +
591 						sysctl_var_tcp_timeout - time_uptime;
592 					ioc++;
593 				}
594 			}
595 		}
596 	}
597 
598 	/* udp states */
599 	for (cpu = 0; cpu < ncpus; cpu++) {
600 		nat_ctx = ip_fw3_nat_ctx[cpu];
601 		for (n = 0; n < NAT_ID_MAX; n++) {
602 			if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
603 				if (nat_ctx->nats[n] == NULL)
604 					break;
605 				the = nat_ctx->nats[n];
606 				RB_FOREACH(s, state_tree, &the->rb_udp_out) {
607 					total_len += LEN_IOC_NAT_STATE;
608 					if (total_len > sopt_size)
609 						goto nospace;
610 					ioc->src_addr.s_addr = ntohl(s->src_addr);
611 					ioc->dst_addr.s_addr = s->dst_addr;
612 					ioc->alias_addr.s_addr = s->alias_addr;
613 					ioc->src_port = s->src_port;
614 					ioc->dst_port = s->dst_port;
615 					ioc->alias_port = s->alias_port;
616 					ioc->nat_id = n + 1;
617 					ioc->cpu_id = cpu;
618 					ioc->proto = IPPROTO_UDP;
619 					ioc->direction = 1;
620 					ioc->life = s->timestamp +
621 						sysctl_var_udp_timeout - time_uptime;
622 					ioc++;
623 				}
624 				RB_FOREACH(s, state_tree, &the->rb_udp_in) {
625 					total_len += LEN_IOC_NAT_STATE;
626 					if (total_len > sopt_size)
627 						goto nospace;
628 					ioc->src_addr.s_addr = ntohl(s->src_addr);
629 					ioc->dst_addr.s_addr = s->dst_addr;
630 					ioc->alias_addr.s_addr = s->alias_addr;
631 					ioc->src_port = s->src_port;
632 					ioc->dst_port = s->dst_port;
633 					ioc->alias_port = s->alias_port;
634 					ioc->nat_id = n + 1;
635 					ioc->cpu_id = cpu;
636 					ioc->proto = IPPROTO_UDP;
637 					ioc->direction = 0;
638 					ioc->life = s->timestamp +
639 						sysctl_var_udp_timeout - time_uptime;
640 					ioc++;
641 				}
642 			}
643 		}
644 	}
645 	sopt->sopt_valsize = total_len;
646 	return 0;
647 nospace:
648 	return 0;
649 }
650 
651 void
652 nat_state_add_dispatch(netmsg_t add_msg)
653 {
654 	struct ip_fw3_nat_context *nat_ctx;
655 	struct netmsg_nat_state_add *msg;
656 	struct cfg_nat *nat;
657 	struct state_tree *tree_in = NULL;
658 	struct nat_state *s2;
659 
660 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
661 	msg = (struct netmsg_nat_state_add *)add_msg;
662 	nat = nat_ctx->nats[msg->nat_id - 1];
663 	if (msg->proto == IPPROTO_TCP) {
664 		tree_in = &nat->rb_tcp_in;
665 	} else {
666 		tree_in = &nat->rb_udp_in;
667 	}
668 	s2 = msg->state;
669 	RB_INSERT(state_tree, tree_in, msg->state);
670 }
671 
672 /*
673  * Init the RB trees only when the NAT is configured.
674  */
675 void
676 nat_add_dispatch(netmsg_t nat_add_msg)
677 {
678 	struct ip_fw3_nat_context *nat_ctx;
679 	struct netmsg_nat_add *msg;
680 	struct ioc_nat *ioc;
681 	struct cfg_nat *nat;
682 	struct cfg_alias *alias;
683 	struct in_addr *ip;
684 	int n;
685 
686 	msg = (struct netmsg_nat_add *)nat_add_msg;
687 	ioc = &msg->ioc_nat;
688 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
689 
690 	if (nat_ctx->nats[ioc->id - 1] == NULL) {
691 		/* op = set, and nat not exists */
692 		nat = kmalloc(LEN_CFG_NAT, M_IP_FW3_NAT, M_WAITOK | M_ZERO);
693 		LIST_INIT(&nat->alias);
694 		RB_INIT(&nat->rb_tcp_in);
695 		RB_INIT(&nat->rb_tcp_out);
696 		RB_INIT(&nat->rb_udp_in);
697 		RB_INIT(&nat->rb_udp_out);
698 		if (mycpuid == 0) {
699 			RB_INIT(&nat->rb_icmp_in);
700 			RB_INIT(&nat->rb_icmp_out);
701 		}
702 		nat->id = ioc->id;
703 		nat->count = ioc->count;
704 		ip = &ioc->ip;
705 		for (n = 0; n < ioc->count; n++) {
706 			alias = kmalloc(LEN_CFG_ALIAS,
707 					M_IP_FW3_NAT, M_WAITOK | M_ZERO);
708 			memcpy(&alias->ip, ip, LEN_IN_ADDR);
709 			LIST_INSERT_HEAD((&nat->alias), alias, next);
710 			ip++;
711 		}
712 		nat_ctx->nats[ioc->id - 1] = nat;
713 	}
714 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
715 }
716 
717 int
718 ip_fw3_ctl_nat_add(struct sockopt *sopt)
719 {
720 	struct netmsg_nat_add nat_add_msg, *msg;
721 	struct ioc_nat *ioc;
722 	msg = &nat_add_msg;
723 
724 	ioc = (struct ioc_nat *)(sopt->sopt_val);
725 	sooptcopyin(sopt, &msg->ioc_nat, sopt->sopt_valsize,
726 			sizeof(struct ioc_nat));
727 	netmsg_init(&msg->base, NULL, &curthread->td_msgport, 0,
728 			nat_add_dispatch);
729 	netisr_domsg(&msg->base, 0);
730 	return 0;
731 }
732 
733 void
734 nat_del_dispatch(netmsg_t nat_del_msg)
735 {
736 	struct ip_fw3_nat_context *nat_ctx;
737 	struct netmsg_nat_del *msg;
738 	struct cfg_nat *nat;
739 	struct nat_state *s, *tmp;
740 	struct cfg_alias *alias, *tmp2;
741 
742 	msg = (struct netmsg_nat_del *)nat_del_msg;
743 
744 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
745 	nat = nat_ctx->nats[msg->id - 1];
746 	if (nat != NULL) {
747 		/* the icmp states will only stored in cpu 0 */
748 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_in, tmp) {
749 			RB_REMOVE(state_tree, &nat->rb_icmp_in, s);
750 			if (s != NULL) {
751 				kfree(s, M_IP_FW3_NAT);
752 			}
753 		}
754 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_out, tmp) {
755 			RB_REMOVE(state_tree, &nat->rb_icmp_out, s);
756 			if (s != NULL) {
757 				kfree(s, M_IP_FW3_NAT);
758 			}
759 		}
760 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_in, tmp) {
761 			RB_REMOVE(state_tree, &nat->rb_tcp_in, s);
762 			if (s != NULL) {
763 				kfree(s, M_IP_FW3_NAT);
764 			}
765 		}
766 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_out, tmp) {
767 			RB_REMOVE(state_tree, &nat->rb_tcp_out, s);
768 			if (s != NULL) {
769 				kfree(s, M_IP_FW3_NAT);
770 			}
771 		}
772 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_in, tmp) {
773 			RB_REMOVE(state_tree, &nat->rb_udp_in, s);
774 			if (s != NULL) {
775 				kfree(s, M_IP_FW3_NAT);
776 			}
777 		}
778 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_out, tmp) {
779 			RB_REMOVE(state_tree, &nat->rb_udp_out, s);
780 			if (s != NULL) {
781 				kfree(s, M_IP_FW3_NAT);
782 			}
783 		}
784 		LIST_FOREACH_MUTABLE(alias, &nat->alias, next, tmp2) {
785 			kfree(alias, M_IP_FW3_NAT);
786 		}
787 		kfree(nat, M_IP_FW3_NAT);
788 		nat_ctx->nats[msg->id - 1] = NULL;
789 	}
790 	netisr_forwardmsg_all(&nat_del_msg->base, mycpuid + 1);
791 }
792 int
793 ip_fw3_ctl_nat_del(struct sockopt *sopt)
794 {
795 	struct netmsg_nat_del nat_del_msg, *msg;
796 
797 	msg = &nat_del_msg;
798 	msg->id = *((int *)sopt->sopt_val);
799 	netmsg_init(&msg->base, NULL, &curthread->td_msgport,
800 			0, nat_del_dispatch);
801 
802 	netisr_domsg(&msg->base, 0);
803 	return 0;
804 }
805 int
806 ip_fw3_ctl_nat_flush(struct sockopt *sopt)
807 {
808 	struct netmsg_nat_del nat_del_msg, *msg;
809 	int i;
810 	msg = &nat_del_msg;
811 	for (i = 0; i < NAT_ID_MAX; i++) {
812 		msg->id = i + 1;
813 		netmsg_init(&msg->base, NULL, &curthread->td_msgport,
814 				0, nat_del_dispatch);
815 
816 		netisr_domsg(&msg->base, 0);
817 	}
818 	return 0;
819 }
820 int
821 ip_fw3_ctl_nat_sockopt(struct sockopt *sopt)
822 {
823 	int error = 0;
824 	switch (sopt->sopt_name) {
825 	case IP_FW_NAT_ADD:
826 		error = ip_fw3_ctl_nat_add(sopt);
827 		break;
828 	case IP_FW_NAT_DEL:
829 		error = ip_fw3_ctl_nat_del(sopt);
830 		break;
831 	case IP_FW_NAT_FLUSH:
832 		error = ip_fw3_ctl_nat_flush(sopt);
833 		break;
834 	case IP_FW_NAT_GET:
835 		error = ip_fw3_ctl_nat_get_cfg(sopt);
836 		break;
837 	case IP_FW_NAT_GET_RECORD:
838 		error = ip_fw3_ctl_nat_get_record(sopt);
839 		break;
840 	default:
841 		kprintf("ipfw3 nat invalid socket option %d\n",
842 				sopt->sopt_name);
843 	}
844 	return error;
845 }
846 
847 void
848 nat_init_ctx_dispatch(netmsg_t msg)
849 {
850 	struct ip_fw3_nat_context *tmp;
851 	tmp = kmalloc(sizeof(struct ip_fw3_nat_context),
852 				M_IP_FW3_NAT, M_WAITOK | M_ZERO);
853 
854 	ip_fw3_nat_ctx[mycpuid] = tmp;
855 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
856 }
857 
858 void
859 nat_fnit_ctx_dispatch(netmsg_t msg)
860 {
861 	kfree(ip_fw3_nat_ctx[mycpuid], M_IP_FW3_NAT);
862 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
863 }
864 
865 static void
866 ip_fw3_nat_cleanup_func_dispatch(netmsg_t nmsg)
867 {
868 	struct nat_state *s, *tmp;
869 	struct ip_fw3_nat_context *nat_ctx;
870 	struct cfg_nat *nat;
871 	int i;
872 
873 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
874 	for (i = 0; i < NAT_ID_MAX; i++) {
875 		nat = nat_ctx->nats[i];
876 		if (nat == NULL)
877 			continue;
878 		/* check the nat_states, remove the expired state */
879 		/* the icmp states will only stored in cpu 0 */
880 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_in, tmp) {
881 			if (time_uptime - s->timestamp > sysctl_var_icmp_timeout) {
882 				RB_REMOVE(state_tree, &nat->rb_icmp_in, s);
883 				kfree(s, M_IP_FW3_NAT);
884 			}
885 		}
886 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_out, tmp) {
887 			if (time_uptime - s->timestamp > sysctl_var_icmp_timeout) {
888 				RB_REMOVE(state_tree, &nat->rb_icmp_out, s);
889 				kfree(s, M_IP_FW3_NAT);
890 			}
891 		}
892 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_in, tmp) {
893 			if (time_uptime - s->timestamp > sysctl_var_tcp_timeout) {
894 				RB_REMOVE(state_tree, &nat->rb_tcp_in, s);
895 				kfree(s, M_IP_FW3_NAT);
896 			}
897 		}
898 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_out, tmp) {
899 			if (time_uptime - s->timestamp > sysctl_var_tcp_timeout) {
900 				RB_REMOVE(state_tree, &nat->rb_tcp_out, s);
901 				kfree(s, M_IP_FW3_NAT);
902 			}
903 		}
904 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_in, tmp) {
905 			if (time_uptime - s->timestamp > sysctl_var_udp_timeout) {
906 				RB_REMOVE(state_tree, &nat->rb_udp_in, s);
907 				kfree(s, M_IP_FW3_NAT);
908 			}
909 		}
910 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_out, tmp) {
911 			if (time_uptime - s->timestamp > sysctl_var_udp_timeout) {
912 				RB_REMOVE(state_tree, &nat->rb_udp_out, s);
913 				kfree(s, M_IP_FW3_NAT);
914 			}
915 		}
916 	}
917 	netisr_forwardmsg_all(&nmsg->base, mycpuid + 1);
918 }
919 
920 static void
921 ip_fw3_nat_cleanup_func(void *dummy __unused)
922 {
923 	struct netmsg_base msg;
924 	netmsg_init(&msg, NULL, &curthread->td_msgport, 0,
925 			ip_fw3_nat_cleanup_func_dispatch);
926 	netisr_domsg(&msg, 0);
927 
928 	callout_reset(&ip_fw3_nat_cleanup_callout,
929 			sysctl_var_cleanup_interval * hz,
930 			ip_fw3_nat_cleanup_func, NULL);
931 }
932 
933 static
934 int ip_fw3_nat_init(void)
935 {
936 	struct netmsg_base msg;
937 	register_ipfw_module(MODULE_NAT_ID, MODULE_NAT_NAME);
938 	register_ipfw_filter_funcs(MODULE_NAT_ID, O_NAT_NAT,
939 			(filter_func)check_nat);
940 	ipfw_ctl_nat_ptr = ip_fw3_ctl_nat_sockopt;
941 	netmsg_init(&msg, NULL, &curthread->td_msgport,
942 			0, nat_init_ctx_dispatch);
943 	netisr_domsg(&msg, 0);
944 
945 	callout_init_mp(&ip_fw3_nat_cleanup_callout);
946 	callout_reset(&ip_fw3_nat_cleanup_callout,
947 			sysctl_var_cleanup_interval * hz,
948 			ip_fw3_nat_cleanup_func,
949 			NULL);
950 	return 0;
951 }
952 
953 static int
954 ip_fw3_nat_fini(void)
955 {
956 	struct netmsg_base msg;
957 	struct netmsg_nat_del nat_del_msg, *msg1;
958 	int i;
959 
960 	callout_stop(&ip_fw3_nat_cleanup_callout);
961 
962 	msg1 = &nat_del_msg;
963 	for (i = 0; i < NAT_ID_MAX; i++) {
964 		msg1->id = i + 1;
965 		netmsg_init(&msg1->base, NULL, &curthread->td_msgport,
966 				0, nat_del_dispatch);
967 
968 		netisr_domsg(&msg1->base, 0);
969 	}
970 
971 	netmsg_init(&msg, NULL, &curthread->td_msgport,
972 			0, nat_fnit_ctx_dispatch);
973 	netisr_domsg(&msg, 0);
974 
975 	return unregister_ipfw_module(MODULE_NAT_ID);
976 }
977 
978 static int
979 ip_fw3_nat_modevent(module_t mod, int type, void *data)
980 {
981 	switch (type) {
982 	case MOD_LOAD:
983 		return ip_fw3_nat_init();
984 	case MOD_UNLOAD:
985 		return ip_fw3_nat_fini();
986 	default:
987 		break;
988 	}
989 	return 0;
990 }
991 
/*
 * Module glue: "ipfw3_nat" is driven by ip_fw3_nat_modevent() and is
 * declared to depend on ipfw3_basic.
 */
moduledata_t ip_fw3_nat_mod = {
	"ipfw3_nat",
	ip_fw3_nat_modevent,
	NULL
};

DECLARE_MODULE(ipfw3_nat, ip_fw3_nat_mod,
		SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(ipfw3_nat, ipfw3_basic, 1, 1, 1);
MODULE_VERSION(ipfw3_nat, 1);
1002