xref: /dragonfly/sys/net/ipfw3_nat/ip_fw3_nat.c (revision 655933d6)
1 /*
2  * Copyright (c) 2014 - 2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Bill Yuan <bycn82@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "opt_ipfw.h"
36 #include "opt_inet.h"
37 #ifndef INET
38 #error IPFIREWALL3 requires INET.
39 #endif /* INET */
40 
41 #include <sys/param.h>
42 #include <sys/kernel.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/systimer.h>
48 #include <sys/in_cksum.h>
49 #include <sys/systm.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/syslog.h>
53 #include <sys/ucred.h>
54 #include <sys/lock.h>
55 
56 #include <net/ethernet.h>
57 #include <net/netmsg2.h>
58 #include <net/netisr2.h>
59 #include <net/route.h>
60 #include <net/if.h>
61 
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/ip_icmp.h>
65 #include <netinet/tcp.h>
66 #include <netinet/tcp_timer.h>
67 #include <netinet/tcp_var.h>
68 #include <netinet/tcpip.h>
69 #include <netinet/udp.h>
70 #include <netinet/udp_var.h>
71 #include <netinet/in_systm.h>
72 #include <netinet/in_var.h>
73 #include <netinet/in_pcb.h>
74 #include <netinet/ip_var.h>
75 #include <netinet/ip_divert.h>
76 #include <net/ipfw3/ip_fw.h>
77 
78 #include "ip_fw3_nat.h"
79 
80 MALLOC_DEFINE(M_IPFW3_NAT, "IP_FW3_NAT", "ipfw3_nat module");
81 
/*
 * Highspeed Lockless Kernel NAT
 *
 * Kernel NAT
 * The network address translation (NAT) will replace the `src` of the packet
 * with an `alias` (alias_addr & alias_port). According to the configuration,
 * the alias will be randomly picked from the configured range.
 *
 * Highspeed
 * The first outgoing packet should trigger the creation of the `nat_state`,
 * and the `nat_state` will be kept in an RB-Tree for the subsequent outgoing
 * packets.
 * The first returning packet will trigger the creation of the `nat_state2`,
 * which will be stored in a multidimensional array of pointers (to
 * `nat_state2`).
 *
 * Lockless
 * The `nat_state` for an outgoing packet will be stored in the nat_context of
 * the current CPU. But due to the nature of the NAT, the returning packet may
 * be handled by another CPU. Hence, the `nat_state2` for the returning packet
 * will be prepared and stored into the nat_context of the right CPU.
 */
103 
/* Per-CPU NAT contexts; code in this file indexes the array with mycpuid. */
struct ip_fw3_nat_context	*ip_fw3_nat_ctx[MAXCPU];
/* Callout that periodically runs ip_fw3_nat_cleanup_func(). */
static struct callout 		ip_fw3_nat_cleanup_callout;
/* Defined elsewhere; not referenced by the code visible in this file. */
extern struct ipfw3_context 	*fw3_ctx[MAXCPU];
/* Hook defined elsewhere; set to ip_fw3_ctl_nat_sockopt() at module load. */
extern ip_fw_ctl_t 		*ip_fw3_ctl_nat_ptr;
108 
/*
 * Tunables exported under net.inet.ip.fw3_nat.
 * cleanup_interval is multiplied by hz when the cleanup callout is armed;
 * the *_timeout values are compared against time_uptime deltas when states
 * are expired.
 */
static int 			sysctl_var_cleanup_interval = 1;
static int 			sysctl_var_icmp_timeout = 10;
static int 			sysctl_var_tcp_timeout = 60;
static int 			sysctl_var_udp_timeout = 30;

SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw3_nat, CTLFLAG_RW, 0, "ipfw3 NAT");
/* NOTE(review): the description string reads "default life time" although
 * the variable is the cleanup interval - confirm and fix the text. */
SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, cleanup_interval, CTLFLAG_RW,
		&sysctl_var_cleanup_interval, 0, "default life time");
SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, icmp_timeout, CTLFLAG_RW,
		&sysctl_var_icmp_timeout, 0, "default icmp state life time");
SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, tcp_timeout, CTLFLAG_RW,
		&sysctl_var_tcp_timeout, 0, "default tcp state life time");
SYSCTL_INT(_net_inet_ip_fw3_nat, OID_AUTO, udp_timeout, CTLFLAG_RW,
		&sysctl_var_udp_timeout, 0, "default udp state life time");
123 
/*
 * Red-black tree of outgoing `nat_state` entries, ordered by
 * ip_fw3_nat_state_cmp() (source/destination address and port).
 */
RB_PROTOTYPE(state_tree, nat_state, entries, ip_fw3_nat_state_cmp);
RB_GENERATE(state_tree, nat_state, entries, ip_fw3_nat_state_cmp);
126 
/*
 * Incrementally update a 16-bit one's-complement checksum after one 16-bit
 * word covered by it changed from `old_info` to `new_info`.
 *
 * cksum:    current checksum field value
 * old_info: the 16-bit word being replaced
 * new_info: the 16-bit word replacing it
 * is_udp:   non-zero for UDP, where a checksum of 0 is left untouched and a
 *           computed value of 0 is returned as 0xFFFF
 *
 * Returns the adjusted checksum.
 */
static __inline uint16_t
fix_cksum(uint16_t cksum, uint16_t old_info, uint16_t new_info, uint8_t is_udp)
{
	uint32_t acc;

	/* a zero UDP checksum stays zero */
	if (is_udp && cksum == 0)
		return (0x0000);

	/*
	 * The subtraction may wrap below zero; the single fold followed by
	 * the 16-bit mask turns that wrap into the end-around borrow.
	 */
	acc = (uint32_t)cksum + old_info - new_info;
	acc = ((acc >> 16) + (acc & 0xFFFF)) & 0xFFFF;

	if (is_udp && acc == 0)
		return (0xFFFF);
	return (uint16_t)acc;
}
141 
142 void
143 check_nat(int *cmd_ctl, int *cmd_val, struct ip_fw_args **args,
144 		struct ip_fw **f, ipfw_insn *cmd, uint16_t ip_len)
145 {
146 	if ((*args)->eh != NULL) {
147 		*cmd_ctl = IP_FW_CTL_NO;
148 		*cmd_val = IP_FW_NOT_MATCH;
149 		return;
150 	}
151 
152 	struct ip_fw3_nat_context *nat_ctx;
153 	struct cfg_nat *nat;
154 	int nat_id;
155 
156 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
157 	(*args)->rule = *f;
158 	nat = ((ipfw_insn_nat *)cmd)->nat;
159 	if (nat == NULL) {
160 		nat_id = cmd->arg1;
161 		nat = nat_ctx->nats[nat_id - 1];
162 		if (nat == NULL) {
163 			*cmd_val = IP_FW_DENY;
164 			*cmd_ctl = IP_FW_CTL_DONE;
165 			return;
166 		}
167 		((ipfw_insn_nat *)cmd)->nat = nat;
168 	}
169 	*cmd_val = ip_fw3_nat(*args, nat, (*args)->m);
170 	*cmd_ctl = IP_FW_CTL_NAT;
171 }
172 
173 int
174 ip_fw3_nat(struct ip_fw_args *args, struct cfg_nat *nat, struct mbuf *m)
175 {
176 	struct state_tree *tree_out = NULL;
177 	struct nat_state *s = NULL, *dup, *k, key;
178 	struct nat_state2 *s2 = NULL;
179 	struct ip *ip = mtod(m, struct ip *);
180 	struct in_addr *old_addr = NULL, new_addr;
181 	uint16_t *old_port = NULL, new_port;
182 	uint16_t *csum = NULL, dlen = 0;
183 	uint8_t udp = 0;
184 	boolean_t pseudo = FALSE, need_return_state = FALSE;
185 	struct cfg_alias *alias;
186 	int i = 0, rand_n = 0;
187 
188 	k = &key;
189 	memset(k, 0, LEN_NAT_STATE);
190 	if (args->oif == NULL) {
191 		old_addr = &ip->ip_dst;
192 		k->dst_addr = ntohl(args->f_id.dst_ip);
193 		LIST_FOREACH(alias, &nat->alias, next) {
194 			if (alias->ip.s_addr == ntohl(args->f_id.dst_ip)) {
195 				break;
196 			}
197 		}
198 		if (alias == NULL) {
199 			goto oops;
200 		}
201 		switch (ip->ip_p) {
202 		case IPPROTO_TCP:
203 			old_port = &L3HDR(struct tcphdr, ip)->th_dport;
204 			s2 = alias->tcp_in[*old_port - ALIAS_BEGIN];
205 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
206 			break;
207 		case IPPROTO_UDP:
208 			old_port = &L3HDR(struct udphdr, ip)->uh_dport;
209 			s2 = alias->udp_in[*old_port - ALIAS_BEGIN];
210 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
211 			udp = 1;
212 			break;
213 		case IPPROTO_ICMP:
214 			old_port = &L3HDR(struct icmp, ip)->icmp_id;
215 			s2 = alias->icmp_in[*old_port];
216 			csum = &L3HDR(struct icmp, ip)->icmp_cksum;
217 			break;
218 		default:
219 			panic("ipfw3: unsupported proto %u", ip->ip_p);
220 		}
221 		if (s2 == NULL) {
222 			goto oops;
223 		}
224 	} else {
225 		old_addr = &ip->ip_src;
226 		k->src_addr = args->f_id.src_ip;
227 		k->dst_addr = args->f_id.dst_ip;
228 		switch (ip->ip_p) {
229 		case IPPROTO_TCP:
230 			k->src_port = args->f_id.src_port;
231 			k->dst_port = args->f_id.dst_port;
232 			m->m_pkthdr.csum_flags = CSUM_TCP;
233 			tree_out = &nat->rb_tcp_out;
234 			old_port = &L3HDR(struct tcphdr, ip)->th_sport;
235 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
236 			break;
237 		case IPPROTO_UDP:
238 			k->src_port = args->f_id.src_port;
239 			k->dst_port = args->f_id.dst_port;
240 			m->m_pkthdr.csum_flags = CSUM_UDP;
241 			tree_out = &nat->rb_udp_out;
242 			old_port = &L3HDR(struct udphdr, ip)->uh_sport;
243 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
244 			udp = 1;
245 			break;
246 		case IPPROTO_ICMP:
247 			k->src_port = L3HDR(struct icmp, ip)->icmp_id;
248 			k->dst_port = k->src_port;
249 			tree_out = &nat->rb_icmp_out;
250 			old_port = &L3HDR(struct icmp, ip)->icmp_id;
251 			csum = &L3HDR(struct icmp, ip)->icmp_cksum;
252 			break;
253 		default:
254 			panic("ipfw3: unsupported proto %u", ip->ip_p);
255 		}
256 		s = RB_FIND(state_tree, tree_out, k);
257 		if (s == NULL) {
258 			/* pick an alias ip randomly when there are multiple */
259 			if (nat->count > 1) {
260 				rand_n = krandom() % nat->count;
261 			}
262 			LIST_FOREACH(alias, &nat->alias, next) {
263 				if (i++ == rand_n) {
264 					break;
265 				}
266 			}
267 			switch  (ip->ip_p) {
268 			case IPPROTO_TCP:
269 				m->m_pkthdr.csum_flags = CSUM_TCP;
270 				s = kmalloc(LEN_NAT_STATE, M_IPFW3_NAT,
271 						M_INTWAIT | M_NULLOK | M_ZERO);
272 
273 				s->src_addr = args->f_id.src_ip;
274 				s->src_port = args->f_id.src_port;
275 
276 				s->dst_addr = args->f_id.dst_ip;
277 				s->dst_port = args->f_id.dst_port;
278 
279 				s->alias_addr = alias->ip.s_addr;
280 				pick_alias_port(s, tree_out);
281 				dup = RB_INSERT(state_tree, tree_out, s);
282 				need_return_state = TRUE;
283 				break;
284 			case IPPROTO_UDP:
285 				m->m_pkthdr.csum_flags = CSUM_UDP;
286 				s = kmalloc(LEN_NAT_STATE, M_IPFW3_NAT,
287 						M_INTWAIT | M_NULLOK | M_ZERO);
288 
289 				s->src_addr = args->f_id.src_ip;
290 				s->src_port = args->f_id.src_port;
291 
292 				s->dst_addr = args->f_id.dst_ip;
293 				s->dst_port = args->f_id.dst_port;
294 
295 				s->alias_addr = alias->ip.s_addr;
296 				pick_alias_port(s, tree_out);
297 				dup = RB_INSERT(state_tree, tree_out, s);
298 				need_return_state = TRUE;
299 				break;
300 			case IPPROTO_ICMP:
301 				s = kmalloc(LEN_NAT_STATE, M_IPFW3_NAT,
302 						M_INTWAIT | M_NULLOK | M_ZERO);
303 				s->src_addr = args->f_id.src_ip;
304 				s->dst_addr = args->f_id.dst_ip;
305 
306 				s->src_port = *old_port;
307 				s->dst_port = *old_port;
308 
309 				s->alias_addr = alias->ip.s_addr;
310 				s->alias_port = htons(s->src_addr *
311 						s->dst_addr % ALIAS_RANGE);
312 				dup = RB_INSERT(state_tree, tree_out, s);
313 
314 				s2 = kmalloc(LEN_NAT_STATE2, M_IPFW3_NAT,
315 						M_INTWAIT | M_NULLOK | M_ZERO);
316 
317 				s2->src_addr = args->f_id.dst_ip;
318 				s2->dst_addr = alias->ip.s_addr;
319 
320 				s2->src_port = s->alias_port;
321 				s2->dst_port = s->alias_port;
322 
323 				s2->alias_addr = htonl(args->f_id.src_ip);
324 				s2->alias_port = *old_port;
325 
326 				alias->icmp_in[s->alias_port] = s2;
327 				break;
328 			default :
329 				goto oops;
330 			}
331 		}
332 	}
333 	if (args->oif == NULL) {
334 		if (ip->ip_p == IPPROTO_ICMP) {
335 			new_addr.s_addr = s2->alias_addr;
336 			new_port = s2->alias_port;
337 		} else {
338 			new_addr.s_addr = s2->src_addr;
339 			new_port = s2->src_port;
340 		}
341 		s2->timestamp = time_uptime;
342 	} else {
343 		new_addr.s_addr = s->alias_addr;
344 		new_port = s->alias_port;
345 		s->timestamp = time_uptime;
346 	}
347 
348 	/* replace src/dst and fix the checksum */
349 	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
350 		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
351 			dlen = ip->ip_len - (ip->ip_hl << 2);
352 		}
353 		pseudo = TRUE;
354 	}
355 	if (!pseudo) {
356 		const uint16_t *oaddr, *naddr;
357 		oaddr = (const uint16_t *)&old_addr->s_addr;
358 		naddr = (const uint16_t *)&new_addr.s_addr;
359 		ip->ip_sum = fix_cksum(ip->ip_sum, oaddr[0], naddr[0], 0);
360 		ip->ip_sum = fix_cksum(ip->ip_sum, oaddr[1], naddr[1], 0);
361 		if (ip->ip_p != IPPROTO_ICMP) {
362 			*csum = fix_cksum(*csum, oaddr[0], naddr[0], udp);
363 			*csum = fix_cksum(*csum, oaddr[1], naddr[1], udp);
364 		}
365 	}
366 	old_addr->s_addr = new_addr.s_addr;
367 	if (!pseudo) {
368 		*csum = fix_cksum(*csum, *old_port, new_port, udp);
369 	}
370 	*old_port = new_port;
371 
372 	if (pseudo) {
373 		*csum = in_pseudo(ip->ip_src.s_addr,
374 				ip->ip_dst.s_addr, htons(dlen + ip->ip_p));
375 	}
376 
377 	/* prepare the state for return traffic */
378 	if (need_return_state) {
379 		ip->ip_len = htons(ip->ip_len);
380 		ip->ip_off = htons(ip->ip_off);
381 
382 		m->m_flags &= ~M_HASH;
383 		ip_hashfn(&m, 0);
384 
385 		ip->ip_len = ntohs(ip->ip_len);
386 		ip->ip_off = ntohs(ip->ip_off);
387 
388 		int nextcpu = netisr_hashcpu(m->m_pkthdr.hash);
389 		if (nextcpu != mycpuid) {
390 			struct netmsg_nat_state_add *msg;
391 			msg = kmalloc(LEN_NMSG_NAT_STATE_ADD,
392 					M_LWKTMSG, M_NOWAIT | M_ZERO);
393 			netmsg_init(&msg->base, NULL, &curthread->td_msgport,
394 					0, nat_state_add_dispatch);
395 			s2 = kmalloc(LEN_NAT_STATE2, M_IPFW3_NAT,
396 					M_INTWAIT | M_NULLOK | M_ZERO);
397 
398 			s2->src_addr = args->f_id.dst_ip;
399 			s2->src_port = args->f_id.dst_port;
400 
401 			s2->dst_addr = alias->ip.s_addr;
402 			s2->dst_port = s->alias_port;
403 
404 			s2->src_addr = htonl(args->f_id.src_ip);
405 			s2->src_port = htons(args->f_id.src_port);
406 
407 			s2->timestamp = s->timestamp;
408 			msg->alias_addr.s_addr = alias->ip.s_addr;
409 			msg->alias_port = s->alias_port;
410 			msg->state = s2;
411 			msg->nat_id = nat->id;
412 			msg->proto = ip->ip_p;
413 			netisr_sendmsg(&msg->base, nextcpu);
414 		} else {
415 			s2 = kmalloc(LEN_NAT_STATE2, M_IPFW3_NAT,
416 					M_INTWAIT | M_NULLOK | M_ZERO);
417 
418 			s2->src_addr = args->f_id.dst_ip;
419 			s2->dst_addr = alias->ip.s_addr;
420 
421 			s2->src_port = s->alias_port;
422 			s2->dst_port = s->alias_port;
423 
424 			s2->src_addr = htonl(args->f_id.src_ip);
425 			s2->src_port = htons(args->f_id.src_port);
426 
427 			s2->timestamp = s->timestamp;
428 			if (ip->ip_p == IPPROTO_TCP) {
429 				alias->tcp_in[s->alias_port - ALIAS_BEGIN] = s2;
430 			} else {
431 				alias->udp_in[s->alias_port - ALIAS_BEGIN] = s2;
432 			}
433 		}
434 	}
435 	return IP_FW_NAT;
436 oops:
437 	IPFW3_DEBUG1("oops\n");
438 	return IP_FW_DENY;
439 }
440 
441 void
442 pick_alias_port(struct nat_state *s, struct state_tree *tree)
443 {
444 	do {
445 		s->alias_port = htons(krandom() % ALIAS_RANGE + ALIAS_BEGIN);
446 	} while (RB_FIND(state_tree, tree, s) != NULL);
447 }
448 
449 int
450 ip_fw3_nat_state_cmp(struct nat_state *s1, struct nat_state *s2)
451 {
452 	if (s1->src_addr > s2->src_addr)
453 		return 1;
454 	if (s1->src_addr < s2->src_addr)
455 		return -1;
456 
457 	if (s1->dst_addr > s2->dst_addr)
458 		return 1;
459 	if (s1->dst_addr < s2->dst_addr)
460 		return -1;
461 
462 	if (s1->src_port > s2->src_port)
463 		return 1;
464 	if (s1->src_port < s2->src_port)
465 		return -1;
466 
467 	if (s1->dst_port > s2->dst_port)
468 		return 1;
469 	if (s1->dst_port < s2->dst_port)
470 		return -1;
471 
472 	return 0;
473 }
474 
475 int
476 ip_fw3_ctl_nat_get_cfg(struct sockopt *sopt)
477 {
478 	struct ip_fw3_nat_context *nat_ctx;
479 	struct ioc_nat *ioc;
480 	struct cfg_nat *nat;
481 	struct cfg_alias *alias;
482 	struct in_addr *ip;
483 	size_t valsize;
484 	int i, len;
485 
486 	len = 0;
487 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
488 	valsize = sopt->sopt_valsize;
489 	ioc = (struct ioc_nat *)sopt->sopt_val;
490 
491 	for (i = 0; i < NAT_ID_MAX; i++) {
492 		nat = nat_ctx->nats[i];
493 		if (nat != NULL) {
494 			len += LEN_IOC_NAT;
495 			if (len >= valsize) {
496 				goto nospace;
497 			}
498 			ioc->id = nat->id;
499 			ioc->count = nat->count;
500 			ip = &ioc->ip;
501 			LIST_FOREACH(alias, &nat->alias, next) {
502 				len += LEN_IN_ADDR;
503 				if (len > valsize) {
504 					goto nospace;
505 				}
506 				bcopy(&alias->ip, ip, LEN_IN_ADDR);
507 				ip++;
508 			}
509 		}
510 	}
511 	sopt->sopt_valsize = len;
512 	return 0;
513 nospace:
514 	bzero(sopt->sopt_val, sopt->sopt_valsize);
515 	sopt->sopt_valsize = 0;
516 	return 0;
517 }
518 
519 int
520 ip_fw3_ctl_nat_get_record(struct sockopt *sopt)
521 {
522 	struct ip_fw3_nat_context *nat_ctx;
523 	struct cfg_nat *the;
524 	size_t sopt_size, total_len = 0;
525 	struct ioc_nat_state *ioc;
526 	int ioc_nat_id, i, n, cpu;
527 	struct nat_state 	*s;
528 	struct nat_state2 	*s2;
529 	struct cfg_alias	*a1;
530 
531 	ioc_nat_id = *((int *)(sopt->sopt_val));
532 	sopt_size = sopt->sopt_valsize;
533 	ioc = (struct ioc_nat_state *)sopt->sopt_val;
534 	/* icmp states only in CPU 0 */
535 	cpu = 0;
536 	nat_ctx = ip_fw3_nat_ctx[cpu];
537 	for (n = 0; n < NAT_ID_MAX; n++) {
538 		if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
539 			if (nat_ctx->nats[n] == NULL)
540 				break;
541 			the = nat_ctx->nats[n];
542 			RB_FOREACH(s, state_tree, &the->rb_icmp_out) {
543 				total_len += LEN_IOC_NAT_STATE;
544 				if (total_len > sopt_size)
545 					goto nospace;
546 				ioc->src_addr.s_addr = ntohl(s->src_addr);
547 				ioc->dst_addr.s_addr = s->dst_addr;
548 				ioc->alias_addr.s_addr = s->alias_addr;
549 				ioc->src_port = s->src_port;
550 				ioc->dst_port = s->dst_port;
551 				ioc->alias_port = s->alias_port;
552 				ioc->nat_id = n + 1;
553 				ioc->cpu_id = cpu;
554 				ioc->proto = IPPROTO_ICMP;
555 				ioc->direction = 1;
556 				ioc->life = s->timestamp +
557 					sysctl_var_icmp_timeout - time_uptime;
558 				ioc++;
559 			}
560 
561 			LIST_FOREACH(a1, &the->alias, next) {
562 			for (i = 0; i < ALIAS_RANGE; i++) {
563 				s2 = a1->icmp_in[i];
564 				if (s2 == NULL) {
565 					continue;
566 				}
567 
568 				total_len += LEN_IOC_NAT_STATE;
569 				if (total_len > sopt_size)
570 					goto nospace;
571 
572 				ioc->src_addr.s_addr = ntohl(s2->src_addr);
573 				ioc->dst_addr.s_addr = s2->dst_addr;
574 				ioc->alias_addr.s_addr = s2->alias_addr;
575 				ioc->src_port = s2->src_port;
576 				ioc->dst_port = s2->dst_port;
577 				ioc->alias_port = s2->alias_port;
578 				ioc->nat_id = n + 1;
579 				ioc->cpu_id = cpu;
580 				ioc->proto = IPPROTO_ICMP;
581 				ioc->direction = 0;
582 				ioc->life = s2->timestamp +
583 					sysctl_var_icmp_timeout - time_uptime;
584 				ioc++;
585 			}
586 			}
587 		}
588 	}
589 
590 	/* tcp states */
591 	for (cpu = 0; cpu < ncpus; cpu++) {
592 		nat_ctx = ip_fw3_nat_ctx[cpu];
593 		for (n = 0; n < NAT_ID_MAX; n++) {
594 			if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
595 				if (nat_ctx->nats[n] == NULL)
596 					break;
597 				the = nat_ctx->nats[n];
598 				RB_FOREACH(s, state_tree, &the->rb_tcp_out) {
599 					total_len += LEN_IOC_NAT_STATE;
600 					if (total_len > sopt_size)
601 						goto nospace;
602 					ioc->src_addr.s_addr = ntohl(s->src_addr);
603 					ioc->dst_addr.s_addr = ntohl(s->dst_addr);
604 					ioc->alias_addr.s_addr = s->alias_addr;
605 					ioc->src_port = ntohs(s->src_port);
606 					ioc->dst_port = ntohs(s->dst_port);
607 					ioc->alias_port = s->alias_port;
608 					ioc->nat_id = n + 1;
609 					ioc->cpu_id = cpu;
610 					ioc->proto = IPPROTO_TCP;
611 					ioc->direction = 1;
612 					ioc->life = s->timestamp +
613 						sysctl_var_tcp_timeout - time_uptime;
614 					ioc++;
615 				}
616 				LIST_FOREACH(a1, &the->alias, next) {
617 					for (i = 0; i < ALIAS_RANGE; i++) {
618 						s2 = a1->tcp_in[i];
619 						if (s2 == NULL) {
620 							continue;
621 						}
622 
623 						total_len += LEN_IOC_NAT_STATE;
624 						if (total_len > sopt_size)
625 							goto nospace;
626 
627 						ioc->src_addr.s_addr = ntohl(s2->src_addr);
628 						ioc->dst_addr.s_addr = s2->dst_addr;
629 						ioc->alias_addr.s_addr = s2->alias_addr;
630 						ioc->src_port = s2->src_port;
631 						ioc->dst_port = s2->dst_port;
632 						ioc->alias_port = s2->alias_port;
633 						ioc->nat_id = n + 1;
634 						ioc->cpu_id = cpu;
635 						ioc->proto = IPPROTO_TCP;
636 						ioc->direction = 0;
637 						ioc->life = s2->timestamp +
638 							sysctl_var_icmp_timeout - time_uptime;
639 						ioc++;
640 					}
641 				}
642 			}
643 		}
644 	}
645 
646 	/* udp states */
647 	for (cpu = 0; cpu < ncpus; cpu++) {
648 		nat_ctx = ip_fw3_nat_ctx[cpu];
649 		for (n = 0; n < NAT_ID_MAX; n++) {
650 			if (ioc_nat_id == 0 || ioc_nat_id == n + 1) {
651 				if (nat_ctx->nats[n] == NULL)
652 					break;
653 				the = nat_ctx->nats[n];
654 				RB_FOREACH(s, state_tree, &the->rb_udp_out) {
655 					total_len += LEN_IOC_NAT_STATE;
656 					if (total_len > sopt_size)
657 						goto nospace;
658 					ioc->src_addr.s_addr = ntohl(s->src_addr);
659 					ioc->dst_addr.s_addr = s->dst_addr;
660 					ioc->alias_addr.s_addr = s->alias_addr;
661 					ioc->src_port = s->src_port;
662 					ioc->dst_port = s->dst_port;
663 					ioc->alias_port = s->alias_port;
664 					ioc->nat_id = n + 1;
665 					ioc->cpu_id = cpu;
666 					ioc->proto = IPPROTO_UDP;
667 					ioc->direction = 1;
668 					ioc->life = s->timestamp +
669 						sysctl_var_udp_timeout - time_uptime;
670 					ioc++;
671 				}
672 				LIST_FOREACH(a1, &the->alias, next) {
673 					for (i = 0; i < ALIAS_RANGE; i++) {
674 						s2 = a1->udp_in[i];
675 						if (s2 == NULL) {
676 							continue;
677 						}
678 
679 						total_len += LEN_IOC_NAT_STATE;
680 						if (total_len > sopt_size)
681 							goto nospace;
682 
683 						ioc->src_addr.s_addr = ntohl(s2->src_addr);
684 						ioc->dst_addr.s_addr = s2->dst_addr;
685 						ioc->alias_addr.s_addr = s2->alias_addr;
686 						ioc->src_port = s2->src_port;
687 						ioc->dst_port = s2->dst_port;
688 						ioc->alias_port = s2->alias_port;
689 						ioc->nat_id = n + 1;
690 						ioc->cpu_id = cpu;
691 						ioc->proto = IPPROTO_UDP;
692 						ioc->direction = 0;
693 						ioc->life = s2->timestamp +
694 							sysctl_var_icmp_timeout - time_uptime;
695 						ioc++;
696 					}
697 				}
698 			}
699 		}
700 	}
701 	sopt->sopt_valsize = total_len;
702 	return 0;
703 nospace:
704 	return 0;
705 }
706 
707 void
708 nat_state_add_dispatch(netmsg_t add_msg)
709 {
710 	struct ip_fw3_nat_context *nat_ctx;
711 	struct netmsg_nat_state_add *msg;
712 	struct cfg_nat *nat;
713 	struct nat_state2 *s2;
714 	struct cfg_alias *alias;
715 
716 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
717 	msg = (struct netmsg_nat_state_add *)add_msg;
718 	nat = nat_ctx->nats[msg->nat_id - 1];
719 
720 	LIST_FOREACH(alias, &nat->alias, next) {
721 		if (alias->ip.s_addr == msg->alias_addr.s_addr) {
722 			break;
723 		}
724 	}
725 	s2 = msg->state;
726 	if (msg->proto == IPPROTO_TCP) {
727 		alias->tcp_in[msg->alias_port - ALIAS_BEGIN] = s2;
728 	} else {
729 		alias->udp_in[msg->alias_port - ALIAS_BEGIN] = s2;
730 	}
731 }
732 
733 /*
734  * Init the RB trees only when the NAT is configured.
735  */
736 void
737 nat_add_dispatch(netmsg_t nat_add_msg)
738 {
739 	struct ip_fw3_nat_context *nat_ctx;
740 	struct netmsg_nat_add *msg;
741 	struct ioc_nat *ioc;
742 	struct cfg_nat *nat;
743 	struct cfg_alias *alias;
744 	struct in_addr *ip;
745 	int n;
746 
747 	msg = (struct netmsg_nat_add *)nat_add_msg;
748 	ioc = &msg->ioc_nat;
749 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
750 
751 	if (nat_ctx->nats[ioc->id - 1] == NULL) {
752 		/* op = set, and nat not exists */
753 		nat = kmalloc(LEN_CFG_NAT, M_IPFW3_NAT, M_WAITOK | M_ZERO);
754 		LIST_INIT(&nat->alias);
755 		RB_INIT(&nat->rb_tcp_out);
756 		RB_INIT(&nat->rb_udp_out);
757 		if (mycpuid == 0) {
758 			RB_INIT(&nat->rb_icmp_out);
759 		}
760 		nat->id = ioc->id;
761 		nat->count = ioc->count;
762 		ip = &ioc->ip;
763 		for (n = 0; n < ioc->count; n++) {
764 			alias = kmalloc(LEN_CFG_ALIAS,
765 					M_IPFW3_NAT, M_WAITOK | M_ZERO);
766 			memcpy(&alias->ip, ip, LEN_IN_ADDR);
767 			LIST_INSERT_HEAD((&nat->alias), alias, next);
768 			ip++;
769 		}
770 		nat_ctx->nats[ioc->id - 1] = nat;
771 	}
772 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
773 }
774 
775 int
776 ip_fw3_ctl_nat_add(struct sockopt *sopt)
777 {
778 	struct netmsg_nat_add nat_add_msg, *msg;
779 	struct ioc_nat *ioc;
780 	msg = &nat_add_msg;
781 
782 	ioc = (struct ioc_nat *)(sopt->sopt_val);
783 	sooptcopyin(sopt, &msg->ioc_nat, sopt->sopt_valsize,
784 			sizeof(struct ioc_nat));
785 	netmsg_init(&msg->base, NULL, &curthread->td_msgport, 0,
786 			nat_add_dispatch);
787 	netisr_domsg(&msg->base, 0);
788 	return 0;
789 }
790 
791 void
792 nat_del_dispatch(netmsg_t nat_del_msg)
793 {
794 	struct ip_fw3_nat_context *nat_ctx;
795 	struct netmsg_nat_del *msg;
796 	struct cfg_nat *nat;
797 	struct nat_state *s, *tmp;
798 	struct cfg_alias *alias, *tmp3;
799 
800 	msg = (struct netmsg_nat_del *)nat_del_msg;
801 
802 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
803 	nat = nat_ctx->nats[msg->id - 1];
804 	if (nat != NULL) {
805 		/* the icmp states will only stored in cpu 0 */
806 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_out, tmp) {
807 			RB_REMOVE(state_tree, &nat->rb_icmp_out, s);
808 			if (s != NULL) {
809 				kfree(s, M_IPFW3_NAT);
810 			}
811 		}
812 		/*
813 		LIST_FOREACH_MUTABLE(s2, &nat->alias->icmp_in, next, tmp2) {
814 			LIST_REMOVE(s2, next);
815 			if (s != NULL) {
816 				kfree(s, M_IPFW3_NAT);
817 			}
818 		}
819 		*/
820 
821 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_out, tmp) {
822 			RB_REMOVE(state_tree, &nat->rb_tcp_out, s);
823 			if (s != NULL) {
824 				kfree(s, M_IPFW3_NAT);
825 			}
826 		}
827 		/*
828 		LIST_FOREACH_MUTABLE(s2, &nat->alias->tcp_in, next, tmp2) {
829 			LIST_REMOVE(s2, next);
830 			if (s != NULL) {
831 				kfree(s, M_IPFW3_NAT);
832 			}
833 		}
834 		*/
835 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_out, tmp) {
836 			RB_REMOVE(state_tree, &nat->rb_udp_out, s);
837 			if (s != NULL) {
838 				kfree(s, M_IPFW3_NAT);
839 			}
840 		}
841 		/*
842 		LIST_FOREACH_MUTABLE(s2, &nat->alias->udp_in, next, tmp2) {
843 			LIST_REMOVE(s2, next);
844 			if (s != NULL) {
845 				kfree(s, M_IPFW3_NAT);
846 			}
847 		}
848 		*/
849 		LIST_FOREACH_MUTABLE(alias, &nat->alias, next, tmp3) {
850 			kfree(alias, M_IPFW3_NAT);
851 		}
852 		kfree(nat, M_IPFW3_NAT);
853 		nat_ctx->nats[msg->id - 1] = NULL;
854 	}
855 	netisr_forwardmsg_all(&nat_del_msg->base, mycpuid + 1);
856 }
857 int
858 ip_fw3_ctl_nat_del(struct sockopt *sopt)
859 {
860 	struct netmsg_nat_del nat_del_msg, *msg;
861 
862 	msg = &nat_del_msg;
863 	msg->id = *((int *)sopt->sopt_val);
864 	netmsg_init(&msg->base, NULL, &curthread->td_msgport,
865 			0, nat_del_dispatch);
866 
867 	netisr_domsg(&msg->base, 0);
868 	return 0;
869 }
870 int
871 ip_fw3_ctl_nat_flush(struct sockopt *sopt)
872 {
873 	struct netmsg_nat_del nat_del_msg, *msg;
874 	int i;
875 	msg = &nat_del_msg;
876 	for (i = 0; i < NAT_ID_MAX; i++) {
877 		msg->id = i + 1;
878 		netmsg_init(&msg->base, NULL, &curthread->td_msgport,
879 				0, nat_del_dispatch);
880 
881 		netisr_domsg(&msg->base, 0);
882 	}
883 	return 0;
884 }
885 
886 int
887 ip_fw3_ctl_nat_sockopt(struct sockopt *sopt)
888 {
889 	int error = 0;
890 	switch (sopt->sopt_name) {
891 	case IP_FW_NAT_ADD:
892 		error = ip_fw3_ctl_nat_add(sopt);
893 		break;
894 	case IP_FW_NAT_DEL:
895 		error = ip_fw3_ctl_nat_del(sopt);
896 		break;
897 	case IP_FW_NAT_FLUSH:
898 		error = ip_fw3_ctl_nat_flush(sopt);
899 		break;
900 	case IP_FW_NAT_GET:
901 		error = ip_fw3_ctl_nat_get_cfg(sopt);
902 		break;
903 	case IP_FW_NAT_GET_RECORD:
904 		error = ip_fw3_ctl_nat_get_record(sopt);
905 		break;
906 	default:
907 		kprintf("ipfw3 nat invalid socket option %d\n",
908 				sopt->sopt_name);
909 	}
910 	return error;
911 }
912 
913 void
914 nat_init_ctx_dispatch(netmsg_t msg)
915 {
916 	struct ip_fw3_nat_context *tmp;
917 	tmp = kmalloc(sizeof(struct ip_fw3_nat_context),
918 				M_IPFW3_NAT, M_WAITOK | M_ZERO);
919 
920 	ip_fw3_nat_ctx[mycpuid] = tmp;
921 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
922 }
923 
924 void
925 nat_fnit_ctx_dispatch(netmsg_t msg)
926 {
927 	kfree(ip_fw3_nat_ctx[mycpuid], M_IPFW3_NAT);
928 	netisr_forwardmsg_all(&msg->base, mycpuid + 1);
929 }
930 
931 static void
932 nat_cleanup_func_dispatch(netmsg_t nmsg)
933 {
934 	struct nat_state *s, *tmp;
935 	struct ip_fw3_nat_context *nat_ctx;
936 	struct cfg_nat *nat;
937 	struct cfg_alias *a1, *tmp2;
938 	struct nat_state2 *s2;
939 	int i, j;
940 
941 	nat_ctx = ip_fw3_nat_ctx[mycpuid];
942 	for (j = 0; j < NAT_ID_MAX; j++) {
943 		nat = nat_ctx->nats[j];
944 		if (nat == NULL)
945 			continue;
946 		/* check the nat_states, remove the expired state */
947 		/* the icmp states will only stored in cpu 0 */
948 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_icmp_out, tmp) {
949 			if (time_uptime - s->timestamp > sysctl_var_icmp_timeout) {
950 				RB_REMOVE(state_tree, &nat->rb_icmp_out, s);
951 				kfree(s, M_IPFW3_NAT);
952 			}
953 		}
954 		LIST_FOREACH_MUTABLE(a1, &nat->alias, next, tmp2) {
955 			for (i = 0; i < ALIAS_RANGE; i++) {
956 				s2 = a1->icmp_in[i];
957 				if (s2 != NULL) {
958 					if (time_uptime - s2->timestamp > sysctl_var_icmp_timeout) {
959 						a1->icmp_in[i] = NULL;
960 						kfree(s2, M_IPFW3_NAT);
961 					}
962 				}
963 
964 			}
965 		}
966 
967 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_tcp_out, tmp) {
968 			if (time_uptime - s->timestamp > sysctl_var_tcp_timeout) {
969 				RB_REMOVE(state_tree, &nat->rb_tcp_out, s);
970 				kfree(s, M_IPFW3_NAT);
971 			}
972 		}
973 		LIST_FOREACH_MUTABLE(a1, &nat->alias, next, tmp2) {
974 			for (i = 0; i < ALIAS_RANGE; i++) {
975 				s2 = a1->tcp_in[i];
976 				if (s2 != NULL) {
977 					if (time_uptime - s2->timestamp > sysctl_var_icmp_timeout) {
978 						a1->tcp_in[i] = NULL;
979 						kfree(s2, M_IPFW3_NAT);
980 					}
981 				}
982 
983 			}
984 		}
985 		RB_FOREACH_SAFE(s, state_tree, &nat->rb_udp_out, tmp) {
986 			if (time_uptime - s->timestamp > sysctl_var_udp_timeout) {
987 				RB_REMOVE(state_tree, &nat->rb_udp_out, s);
988 				kfree(s, M_IPFW3_NAT);
989 			}
990 		}
991 		LIST_FOREACH_MUTABLE(a1, &nat->alias, next, tmp2) {
992 			for (i = 0; i < ALIAS_RANGE; i++) {
993 				s2 = a1->udp_in[i];
994 				if (s2 != NULL) {
995 					if (time_uptime - s2->timestamp > sysctl_var_icmp_timeout) {
996 						a1->udp_in[i] = NULL;
997 						kfree(s2, M_IPFW3_NAT);
998 					}
999 				}
1000 
1001 			}
1002 		}
1003 	}
1004 	netisr_forwardmsg_all(&nmsg->base, mycpuid + 1);
1005 }
1006 
1007 static void
1008 ip_fw3_nat_cleanup_func(void *dummy __unused)
1009 {
1010 	struct netmsg_base msg;
1011 	netmsg_init(&msg, NULL, &curthread->td_msgport, 0,
1012 			nat_cleanup_func_dispatch);
1013 	netisr_domsg(&msg, 0);
1014 
1015 	callout_reset(&ip_fw3_nat_cleanup_callout,
1016 			sysctl_var_cleanup_interval * hz,
1017 			ip_fw3_nat_cleanup_func, NULL);
1018 }
1019 
1020 static
1021 int ip_fw3_nat_init(void)
1022 {
1023 	struct netmsg_base msg;
1024 	ip_fw3_register_module(MODULE_NAT_ID, MODULE_NAT_NAME);
1025 	ip_fw3_register_filter_funcs(MODULE_NAT_ID, O_NAT_NAT,
1026 			(filter_func)check_nat);
1027 	ip_fw3_ctl_nat_ptr = ip_fw3_ctl_nat_sockopt;
1028 	netmsg_init(&msg, NULL, &curthread->td_msgport,
1029 			0, nat_init_ctx_dispatch);
1030 	netisr_domsg(&msg, 0);
1031 
1032 	callout_init_mp(&ip_fw3_nat_cleanup_callout);
1033 	callout_reset(&ip_fw3_nat_cleanup_callout,
1034 			sysctl_var_cleanup_interval * hz,
1035 			ip_fw3_nat_cleanup_func,
1036 			NULL);
1037 	return 0;
1038 }
1039 
1040 static int
1041 ip_fw3_nat_fini(void)
1042 {
1043 	struct netmsg_base msg;
1044 	struct netmsg_nat_del nat_del_msg, *msg1;
1045 	int i;
1046 
1047 	callout_stop(&ip_fw3_nat_cleanup_callout);
1048 
1049 	msg1 = &nat_del_msg;
1050 	for (i = 0; i < NAT_ID_MAX; i++) {
1051 		msg1->id = i + 1;
1052 		netmsg_init(&msg1->base, NULL, &curthread->td_msgport,
1053 				0, nat_del_dispatch);
1054 
1055 		netisr_domsg(&msg1->base, 0);
1056 	}
1057 
1058 	netmsg_init(&msg, NULL, &curthread->td_msgport,
1059 			0, nat_fnit_ctx_dispatch);
1060 	netisr_domsg(&msg, 0);
1061 
1062 	return ip_fw3_unregister_module(MODULE_NAT_ID);
1063 }
1064 
1065 static int
1066 ip_fw3_nat_modevent(module_t mod, int type, void *data)
1067 {
1068 	switch (type) {
1069 	case MOD_LOAD:
1070 		return ip_fw3_nat_init();
1071 	case MOD_UNLOAD:
1072 		return ip_fw3_nat_fini();
1073 	default:
1074 		break;
1075 	}
1076 	return 0;
1077 }
1078 
/* Module description: name, event handler, no extra argument. */
moduledata_t ip_fw3_nat_mod = {
	"ipfw3_nat",
	ip_fw3_nat_modevent,
	NULL
};

/* Register the module; it depends on ipfw3_basic (version 1). */
DECLARE_MODULE(ipfw3_nat, ip_fw3_nat_mod,
		SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(ipfw3_nat, ipfw3_basic, 1, 1, 1);
MODULE_VERSION(ipfw3_nat, 1);
1089