xref: /openbsd/sys/net/pf_lb.c (revision e8d81675)
1 /*	$OpenBSD: pf_lb.c,v 1.74 2023/05/10 22:42:51 sashan Exp $ */
2 
3 /*
4  * Copyright (c) 2001 Daniel Hartmeier
5  * Copyright (c) 2002 - 2008 Henning Brauer
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  *    - Redistributions of source code must retain the above copyright
13  *      notice, this list of conditions and the following disclaimer.
14  *    - Redistributions in binary form must reproduce the above
15  *      copyright notice, this list of conditions and the following
16  *      disclaimer in the documentation and/or other materials provided
17  *      with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  *
32  * Effort sponsored in part by the Defense Advanced Research Projects
33  * Agency (DARPA) and Air Force Research Laboratory, Air Force
34  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
35  *
36  */
37 
38 #include "bpfilter.h"
39 #include "pflog.h"
40 #include "pfsync.h"
41 #include "pflow.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/mbuf.h>
46 #include <sys/filio.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/kernel.h>
50 #include <sys/time.h>
51 #include <sys/pool.h>
52 #include <sys/rwlock.h>
53 #include <sys/syslog.h>
54 #include <sys/stdint.h>
55 
56 #include <crypto/siphash.h>
57 
58 #include <net/if.h>
59 #include <net/bpf.h>
60 #include <net/route.h>
61 
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/in_pcb.h>
65 #include <netinet/ip_var.h>
66 #include <netinet/ip_icmp.h>
67 #include <netinet/icmp_var.h>
68 #include <netinet/tcp.h>
69 #include <netinet/tcp_seq.h>
70 #include <netinet/tcp_timer.h>
71 #include <netinet/udp.h>
72 #include <netinet/udp_var.h>
73 #include <netinet/if_ether.h>
74 
75 #ifdef INET6
76 #include <netinet/ip6.h>
77 #include <netinet/icmp6.h>
78 #endif /* INET6 */
79 
80 #include <net/pfvar.h>
81 #include <net/pfvar_priv.h>
82 
83 #if NPFLOG > 0
84 #include <net/if_pflog.h>
85 #endif	/* NPFLOG > 0 */
86 
87 #if NPFLOW > 0
88 #include <net/if_pflow.h>
89 #endif	/* NPFLOW > 0 */
90 
91 #if NPFSYNC > 0
92 #include <net/if_pfsync.h>
93 #endif /* NPFSYNC > 0 */
94 
95 u_int64_t		 pf_hash(struct pf_addr *, struct pf_addr *,
96 			    struct pf_poolhashkey *, sa_family_t);
97 int			 pf_get_sport(struct pf_pdesc *, struct pf_rule *,
98 			    struct pf_addr *, u_int16_t *, u_int16_t,
99 			    u_int16_t, struct pf_src_node **);
100 int			 pf_map_addr_states_increase(sa_family_t,
101 				struct pf_pool *, struct pf_addr *);
102 int			 pf_get_transaddr_af(struct pf_rule *,
103 			    struct pf_pdesc *, struct pf_src_node **);
104 int			 pf_map_addr_sticky(sa_family_t, struct pf_rule *,
105 			    struct pf_addr *, struct pf_addr *,
106 			    struct pf_src_node **, struct pf_pool *,
107 			    enum pf_sn_types);
108 
109 u_int64_t
pf_hash(struct pf_addr * inaddr,struct pf_addr * hash,struct pf_poolhashkey * key,sa_family_t af)110 pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
111     struct pf_poolhashkey *key, sa_family_t af)
112 {
113 	uint64_t res = 0;
114 #ifdef INET6
115 	union {
116 		uint64_t hash64;
117 		uint32_t hash32[2];
118 	} h;
119 #endif	/* INET6 */
120 
121 	switch (af) {
122 	case AF_INET:
123 		res = SipHash24((SIPHASH_KEY *)key,
124 		    &inaddr->addr32[0], sizeof(inaddr->addr32[0]));
125 		hash->addr32[0] = res;
126 		break;
127 #ifdef INET6
128 	case AF_INET6:
129 		res = SipHash24((SIPHASH_KEY *)key, &inaddr->addr32[0],
130 		    4 * sizeof(inaddr->addr32[0]));
131 		h.hash64 = res;
132 		hash->addr32[0] = h.hash32[0];
133 		hash->addr32[1] = h.hash32[1];
134 		/*
135 		 * siphash isn't big enough, but flipping it around is
136 		 * good enough here.
137 		 */
138 		hash->addr32[2] = ~h.hash32[1];
139 		hash->addr32[3] = ~h.hash32[0];
140 		break;
141 #endif /* INET6 */
142 	default:
143 		unhandled_af(af);
144 	}
145 	return (res);
146 }
147 
148 int
pf_get_sport(struct pf_pdesc * pd,struct pf_rule * r,struct pf_addr * naddr,u_int16_t * nport,u_int16_t low,u_int16_t high,struct pf_src_node ** sn)149 pf_get_sport(struct pf_pdesc *pd, struct pf_rule *r,
150     struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high,
151     struct pf_src_node **sn)
152 {
153 	struct pf_state_key_cmp	key;
154 	struct pf_addr		init_addr;
155 	u_int16_t		cut;
156 	int			dir = (pd->dir == PF_IN) ? PF_OUT : PF_IN;
157 	int			sidx = pd->sidx;
158 	int			didx = pd->didx;
159 
160 	memset(&init_addr, 0, sizeof(init_addr));
161 	if (pf_map_addr(pd->naf, r, &pd->nsaddr, naddr, &init_addr, sn, &r->nat,
162 	    PF_SN_NAT))
163 		return (1);
164 
165 	if (pd->proto == IPPROTO_ICMP) {
166 		if (pd->ndport == htons(ICMP_ECHO)) {
167 			low = 1;
168 			high = 65535;
169 		} else
170 			return (0);	/* Don't try to modify non-echo ICMP */
171 	}
172 #ifdef INET6
173 	if (pd->proto == IPPROTO_ICMPV6) {
174 		if (pd->ndport == htons(ICMP6_ECHO_REQUEST)) {
175 			low = 1;
176 			high = 65535;
177 		} else
178 			return (0);	/* Don't try to modify non-echo ICMP */
179 	}
180 #endif /* INET6 */
181 
182 	do {
183 		key.af = pd->naf;
184 		key.proto = pd->proto;
185 		key.rdomain = pd->rdomain;
186 		pf_addrcpy(&key.addr[didx], &pd->ndaddr, key.af);
187 		pf_addrcpy(&key.addr[sidx], naddr, key.af);
188 		key.port[didx] = pd->ndport;
189 
190 		/*
191 		 * port search; start random, step;
192 		 * similar 2 portloop in in_pcbbind
193 		 */
194 		if (!(pd->proto == IPPROTO_TCP || pd->proto == IPPROTO_UDP ||
195 		    pd->proto == IPPROTO_ICMP || pd->proto == IPPROTO_ICMPV6)) {
196 			/* XXX bug: icmp states dont use the id on both
197 			 * XXX sides (traceroute -I through nat) */
198 			key.port[sidx] = pd->nsport;
199 			key.hash = pf_pkt_hash(key.af, key.proto, &key.addr[0],
200 			    &key.addr[1], key.port[0], key.port[1]);
201 			if (pf_find_state_all(&key, dir, NULL) == NULL) {
202 				*nport = pd->nsport;
203 				return (0);
204 			}
205 		} else if (low == 0 && high == 0) {
206 			key.port[sidx] = pd->nsport;
207 			key.hash = pf_pkt_hash(key.af, key.proto, &key.addr[0],
208 			    &key.addr[1], key.port[0], key.port[1]);
209 			if (pf_find_state_all(&key, dir, NULL) == NULL) {
210 				*nport = pd->nsport;
211 				return (0);
212 			}
213 		} else if (low == high) {
214 			key.port[sidx] = htons(low);
215 			key.hash = pf_pkt_hash(key.af, key.proto, &key.addr[0],
216 			    &key.addr[1], key.port[0], key.port[1]);
217 			if (pf_find_state_all(&key, dir, NULL) == NULL) {
218 				*nport = htons(low);
219 				return (0);
220 			}
221 		} else {
222 			u_int32_t tmp;
223 
224 			if (low > high) {
225 				tmp = low;
226 				low = high;
227 				high = tmp;
228 			}
229 			/* low < high */
230 			cut = arc4random_uniform(1 + high - low) + low;
231 			/* low <= cut <= high */
232 			for (tmp = cut; tmp <= high && tmp <= 0xffff; ++tmp) {
233 				key.port[sidx] = htons(tmp);
234 				key.hash = pf_pkt_hash(key.af, key.proto,
235 				    &key.addr[0], &key.addr[1], key.port[0],
236 				    key.port[1]);
237 				if (pf_find_state_all(&key, dir, NULL) ==
238 				    NULL && !in_baddynamic(tmp, pd->proto)) {
239 					*nport = htons(tmp);
240 					return (0);
241 				}
242 			}
243 			tmp = cut;
244 			for (tmp -= 1; tmp >= low && tmp <= 0xffff; --tmp) {
245 				key.port[sidx] = htons(tmp);
246 				key.hash = pf_pkt_hash(key.af, key.proto,
247 				    &key.addr[0], &key.addr[1], key.port[0],
248 				    key.port[1]);
249 				if (pf_find_state_all(&key, dir, NULL) ==
250 				    NULL && !in_baddynamic(tmp, pd->proto)) {
251 					*nport = htons(tmp);
252 					return (0);
253 				}
254 			}
255 		}
256 
257 		switch (r->nat.opts & PF_POOL_TYPEMASK) {
258 		case PF_POOL_RANDOM:
259 		case PF_POOL_ROUNDROBIN:
260 		case PF_POOL_LEASTSTATES:
261 			/*
262 			 * pick a different source address since we're out
263 			 * of free port choices for the current one.
264 			 */
265 			if (pf_map_addr(pd->naf, r, &pd->nsaddr, naddr,
266 			    &init_addr, sn, &r->nat, PF_SN_NAT))
267 				return (1);
268 			break;
269 		case PF_POOL_NONE:
270 		case PF_POOL_SRCHASH:
271 		case PF_POOL_BITMASK:
272 		default:
273 			return (1);
274 		}
275 	} while (! PF_AEQ(&init_addr, naddr, pd->naf) );
276 	return (1);					/* none available */
277 }
278 
279 int
pf_map_addr_sticky(sa_family_t af,struct pf_rule * r,struct pf_addr * saddr,struct pf_addr * naddr,struct pf_src_node ** sns,struct pf_pool * rpool,enum pf_sn_types type)280 pf_map_addr_sticky(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
281     struct pf_addr *naddr, struct pf_src_node **sns, struct pf_pool *rpool,
282     enum pf_sn_types type)
283 {
284 	struct pf_addr		*raddr, *rmask, *cached;
285 	struct pf_state		*s;
286 	struct pf_src_node	 k;
287 	int			 valid;
288 
289 	k.af = af;
290 	k.type = type;
291 	pf_addrcpy(&k.addr, saddr, af);
292 	k.rule.ptr = r;
293 	pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
294 	sns[type] = RB_FIND(pf_src_tree, &tree_src_tracking, &k);
295 	if (sns[type] == NULL)
296 		return (-1);
297 
298 	/* check if the cached entry is still valid */
299 	cached = &(sns[type])->raddr;
300 	valid = 0;
301 	if (PF_AZERO(cached, af)) {
302 		valid = 1;
303 	} else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
304 		if (pfr_kentry_byaddr(rpool->addr.p.dyn->pfid_kt, cached,
305 		    af, 0))
306 			valid = 1;
307 	} else if (rpool->addr.type == PF_ADDR_TABLE) {
308 		if (pfr_kentry_byaddr(rpool->addr.p.tbl, cached, af, 0))
309 			valid = 1;
310 	} else if (rpool->addr.type != PF_ADDR_NOROUTE) {
311 		raddr = &rpool->addr.v.a.addr;
312 		rmask = &rpool->addr.v.a.mask;
313 		valid = pf_match_addr(0, raddr, rmask, cached, af);
314 	}
315 	if (!valid) {
316 		if (pf_status.debug >= LOG_DEBUG) {
317 			log(LOG_DEBUG, "pf: pf_map_addr: "
318 			    "stale src tracking (%u) ", type);
319 			pf_print_host(&k.addr, 0, af);
320 			addlog(" to ");
321 			pf_print_host(cached, 0, af);
322 			addlog("\n");
323 		}
324 		if (sns[type]->states != 0) {
325 			/* XXX expensive */
326 			RBT_FOREACH(s, pf_state_tree_id, &tree_id)
327 				pf_state_rm_src_node(s, sns[type]);
328 		}
329 		sns[type]->expire = 1;
330 		pf_remove_src_node(sns[type]);
331 		sns[type] = NULL;
332 		return (-1);
333 	}
334 
335 
336 	if (!PF_AZERO(cached, af)) {
337 		pf_addrcpy(naddr, cached, af);
338 		if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES &&
339 		    pf_map_addr_states_increase(af, rpool, cached) == -1)
340 			return (-1);
341 	}
342 	if (pf_status.debug >= LOG_DEBUG) {
343 		log(LOG_DEBUG, "pf: pf_map_addr: "
344 		    "src tracking (%u) maps ", type);
345 		pf_print_host(&k.addr, 0, af);
346 		addlog(" to ");
347 		pf_print_host(naddr, 0, af);
348 		addlog("\n");
349 	}
350 
351 	if (sns[type]->kif != NULL)
352 		rpool->kif = sns[type]->kif;
353 
354 	return (0);
355 }
356 
357 uint32_t
pf_rand_addr(uint32_t mask)358 pf_rand_addr(uint32_t mask)
359 {
360 	uint32_t addr;
361 
362 	mask = ~ntohl(mask);
363 	addr = arc4random_uniform(mask + 1);
364 
365 	return (htonl(addr));
366 }
367 
368 int
pf_map_addr(sa_family_t af,struct pf_rule * r,struct pf_addr * saddr,struct pf_addr * naddr,struct pf_addr * init_addr,struct pf_src_node ** sns,struct pf_pool * rpool,enum pf_sn_types type)369 pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
370     struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sns,
371     struct pf_pool *rpool, enum pf_sn_types type)
372 {
373 	struct pf_addr		 hash;
374 	struct pf_addr		 faddr;
375 	struct pf_addr		*raddr = &rpool->addr.v.a.addr;
376 	struct pf_addr		*rmask = &rpool->addr.v.a.mask;
377 	struct pfr_ktable	*kt;
378 	struct pfi_kif		*kif;
379 	u_int64_t		 states;
380 	u_int16_t		 weight;
381 	u_int64_t		 load;
382 	u_int64_t		 cload;
383 	u_int64_t		 hashidx;
384 	int			 cnt;
385 
386 	if (sns[type] == NULL && rpool->opts & PF_POOL_STICKYADDR &&
387 	    (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE &&
388 	    pf_map_addr_sticky(af, r, saddr, naddr, sns, rpool, type) == 0)
389 		return (0);
390 
391 	if (rpool->addr.type == PF_ADDR_NOROUTE)
392 		return (1);
393 	if (rpool->addr.type == PF_ADDR_DYNIFTL) {
394 		switch (af) {
395 		case AF_INET:
396 			if (rpool->addr.p.dyn->pfid_acnt4 < 1 &&
397 			    !PF_POOL_DYNTYPE(rpool->opts))
398 				return (1);
399 			raddr = &rpool->addr.p.dyn->pfid_addr4;
400 			rmask = &rpool->addr.p.dyn->pfid_mask4;
401 			break;
402 #ifdef INET6
403 		case AF_INET6:
404 			if (rpool->addr.p.dyn->pfid_acnt6 < 1 &&
405 			    !PF_POOL_DYNTYPE(rpool->opts))
406 				return (1);
407 			raddr = &rpool->addr.p.dyn->pfid_addr6;
408 			rmask = &rpool->addr.p.dyn->pfid_mask6;
409 			break;
410 #endif /* INET6 */
411 		default:
412 			unhandled_af(af);
413 		}
414 	} else if (rpool->addr.type == PF_ADDR_TABLE) {
415 		if (!PF_POOL_DYNTYPE(rpool->opts))
416 			return (1); /* unsupported */
417 	} else {
418 		raddr = &rpool->addr.v.a.addr;
419 		rmask = &rpool->addr.v.a.mask;
420 	}
421 
422 	switch (rpool->opts & PF_POOL_TYPEMASK) {
423 	case PF_POOL_NONE:
424 		pf_addrcpy(naddr, raddr, af);
425 		break;
426 	case PF_POOL_BITMASK:
427 		pf_poolmask(naddr, raddr, rmask, saddr, af);
428 		break;
429 	case PF_POOL_RANDOM:
430 		if (rpool->addr.type == PF_ADDR_TABLE ||
431 		    rpool->addr.type == PF_ADDR_DYNIFTL) {
432 			if (rpool->addr.type == PF_ADDR_TABLE)
433 				kt = rpool->addr.p.tbl;
434 			else
435 				kt = rpool->addr.p.dyn->pfid_kt;
436 			kt = pfr_ktable_select_active(kt);
437 			if (kt == NULL)
438 				return (1);
439 
440 			cnt = kt->pfrkt_cnt;
441 			if (cnt == 0)
442 				rpool->tblidx = 0;
443 			else
444 				rpool->tblidx = (int)arc4random_uniform(cnt);
445 			memset(&rpool->counter, 0, sizeof(rpool->counter));
446 			if (pfr_pool_get(rpool, &raddr, &rmask, af))
447 				return (1);
448 			pf_addrcpy(naddr, &rpool->counter, af);
449 		} else if (init_addr != NULL && PF_AZERO(init_addr, af)) {
450 			switch (af) {
451 			case AF_INET:
452 				rpool->counter.addr32[0] = pf_rand_addr(
453 				    rmask->addr32[0]);
454 				break;
455 #ifdef INET6
456 			case AF_INET6:
457 				if (rmask->addr32[3] != 0xffffffff)
458 					rpool->counter.addr32[3] = pf_rand_addr(
459 					    rmask->addr32[3]);
460 				else
461 					break;
462 				if (rmask->addr32[2] != 0xffffffff)
463 					rpool->counter.addr32[2] = pf_rand_addr(
464 					    rmask->addr32[2]);
465 				else
466 					break;
467 				if (rmask->addr32[1] != 0xffffffff)
468 					rpool->counter.addr32[1] = pf_rand_addr(
469 					    rmask->addr32[1]);
470 				else
471 					break;
472 				if (rmask->addr32[0] != 0xffffffff)
473 					rpool->counter.addr32[0] = pf_rand_addr(
474 					    rmask->addr32[0]);
475 				break;
476 #endif /* INET6 */
477 			default:
478 				unhandled_af(af);
479 			}
480 			pf_poolmask(naddr, raddr, rmask, &rpool->counter, af);
481 			pf_addrcpy(init_addr, naddr, af);
482 
483 		} else {
484 			pf_addr_inc(&rpool->counter, af);
485 			pf_poolmask(naddr, raddr, rmask, &rpool->counter, af);
486 		}
487 		break;
488 	case PF_POOL_SRCHASH:
489 		hashidx = pf_hash(saddr, &hash, &rpool->key, af);
490 
491 		if (rpool->addr.type == PF_ADDR_TABLE ||
492 		    rpool->addr.type == PF_ADDR_DYNIFTL) {
493 			if (rpool->addr.type == PF_ADDR_TABLE)
494 				kt = rpool->addr.p.tbl;
495 			else
496 				kt = rpool->addr.p.dyn->pfid_kt;
497 			kt = pfr_ktable_select_active(kt);
498 			if (kt == NULL)
499 				return (1);
500 
501 			cnt = kt->pfrkt_cnt;
502 			if (cnt == 0)
503 				rpool->tblidx = 0;
504 			else
505 				rpool->tblidx = (int)(hashidx % cnt);
506 			memset(&rpool->counter, 0, sizeof(rpool->counter));
507 			if (pfr_pool_get(rpool, &raddr, &rmask, af))
508 				return (1);
509 			pf_addrcpy(naddr, &rpool->counter, af);
510 		} else {
511 			pf_poolmask(naddr, raddr, rmask, &hash, af);
512 		}
513 		break;
514 	case PF_POOL_ROUNDROBIN:
515 		if (rpool->addr.type == PF_ADDR_TABLE ||
516 		    rpool->addr.type == PF_ADDR_DYNIFTL) {
517 			if (pfr_pool_get(rpool, &raddr, &rmask, af)) {
518 				/*
519 				 * reset counter in case its value
520 				 * has been removed from the pool.
521 				 */
522 				memset(&rpool->counter, 0,
523 				    sizeof(rpool->counter));
524 				if (pfr_pool_get(rpool, &raddr, &rmask, af))
525 					return (1);
526 			}
527 		} else if (PF_AZERO(&rpool->counter, af)) {
528 			/*
529 			 * fall back to POOL_NONE if there is a single host
530 			 * address in pool.
531 			 */
532 			if (af == AF_INET &&
533 			    rmask->addr32[0] == INADDR_BROADCAST) {
534 				pf_addrcpy(naddr, raddr, af);
535 				break;
536 			}
537 #ifdef INET6
538 			if (af == AF_INET6 &&
539 			    IN6_ARE_ADDR_EQUAL(&rmask->v6, &in6mask128)) {
540 				pf_addrcpy(naddr, raddr, af);
541 				break;
542 			}
543 #endif
544 		} else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
545 			return (1);
546 
547 		/* iterate over table if it contains entries which are weighted */
548 		if ((rpool->addr.type == PF_ADDR_TABLE &&
549 		    rpool->addr.p.tbl->pfrkt_refcntcost > 0) ||
550 		    (rpool->addr.type == PF_ADDR_DYNIFTL &&
551 		    rpool->addr.p.dyn->pfid_kt->pfrkt_refcntcost > 0)) {
552 			do {
553 				if (rpool->addr.type == PF_ADDR_TABLE ||
554 				    rpool->addr.type == PF_ADDR_DYNIFTL) {
555 					if (pfr_pool_get(rpool,
556 					    &raddr, &rmask, af))
557 						return (1);
558 				} else {
559 					log(LOG_ERR, "pf: pf_map_addr: "
560 					    "weighted RR failure");
561 					return (1);
562 				}
563 				if (rpool->weight >= rpool->curweight)
564 					break;
565 				pf_addr_inc(&rpool->counter, af);
566 			} while (1);
567 
568 			weight = rpool->weight;
569 		}
570 
571 		pf_poolmask(naddr, raddr, rmask, &rpool->counter, af);
572 		if (init_addr != NULL && PF_AZERO(init_addr, af))
573 			pf_addrcpy(init_addr, &rpool->counter, af);
574 		pf_addr_inc(&rpool->counter, af);
575 		break;
576 	case PF_POOL_LEASTSTATES:
577 		/* retrieve an address first */
578 		if (rpool->addr.type == PF_ADDR_TABLE ||
579 		    rpool->addr.type == PF_ADDR_DYNIFTL) {
580 			if (pfr_pool_get(rpool, &raddr, &rmask, af)) {
581 				/* see PF_POOL_ROUNDROBIN */
582 				memset(&rpool->counter, 0,
583 				    sizeof(rpool->counter));
584 				if (pfr_pool_get(rpool, &raddr, &rmask, af))
585 					return (1);
586 			}
587 		} else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
588 			return (1);
589 
590 		states = rpool->states;
591 		weight = rpool->weight;
592 		kif = rpool->kif;
593 
594 		if ((rpool->addr.type == PF_ADDR_TABLE &&
595 		    rpool->addr.p.tbl->pfrkt_refcntcost > 0) ||
596 		    (rpool->addr.type == PF_ADDR_DYNIFTL &&
597 		    rpool->addr.p.dyn->pfid_kt->pfrkt_refcntcost > 0))
598 			load = ((UINT16_MAX * rpool->states) / rpool->weight);
599 		else
600 			load = states;
601 
602 		pf_addrcpy(&faddr, &rpool->counter, af);
603 
604 		pf_addrcpy(naddr, &rpool->counter, af);
605 		if (init_addr != NULL && PF_AZERO(init_addr, af))
606 			pf_addrcpy(init_addr, naddr, af);
607 
608 		/*
609 		 * iterate *once* over whole table and find destination with
610 		 * least connection
611 		 */
612 		do  {
613 			pf_addr_inc(&rpool->counter, af);
614 			if (rpool->addr.type == PF_ADDR_TABLE ||
615 			    rpool->addr.type == PF_ADDR_DYNIFTL) {
616 				if (pfr_pool_get(rpool, &raddr, &rmask, af))
617 					return (1);
618 			} else if (pf_match_addr(0, raddr, rmask,
619 			    &rpool->counter, af))
620 				return (1);
621 
622 			if ((rpool->addr.type == PF_ADDR_TABLE &&
623 			    rpool->addr.p.tbl->pfrkt_refcntcost > 0) ||
624 			    (rpool->addr.type == PF_ADDR_DYNIFTL &&
625 			    rpool->addr.p.dyn->pfid_kt->pfrkt_refcntcost > 0))
626 				cload = ((UINT16_MAX * rpool->states)
627 					/ rpool->weight);
628 			else
629 				cload = rpool->states;
630 
631 			/* find lc minimum */
632 			if (cload < load) {
633 				states = rpool->states;
634 				weight = rpool->weight;
635 				kif = rpool->kif;
636 				load = cload;
637 
638 				pf_addrcpy(naddr, &rpool->counter, af);
639 				if (init_addr != NULL &&
640 				    PF_AZERO(init_addr, af))
641 				    pf_addrcpy(init_addr, naddr, af);
642 			}
643 		} while (pf_match_addr(1, &faddr, rmask, &rpool->counter, af) &&
644 		    (states > 0));
645 
646 		if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
647 			return (1);
648 		/* revert the kif which was set by pfr_pool_get() */
649 		rpool->kif = kif;
650 		break;
651 	}
652 
653 	if (rpool->opts & PF_POOL_STICKYADDR) {
654 		if (sns[type] != NULL) {
655 			pf_remove_src_node(sns[type]);
656 			sns[type] = NULL;
657 		}
658 		if (pf_insert_src_node(&sns[type], r, type, af, saddr, naddr,
659 		    rpool->kif))
660 			return (1);
661 	}
662 
663 	if (pf_status.debug >= LOG_INFO &&
664 	    (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
665 		log(LOG_INFO, "pf: pf_map_addr: selected address ");
666 		pf_print_host(naddr, 0, af);
667 		if ((rpool->opts & PF_POOL_TYPEMASK) ==
668 		    PF_POOL_LEASTSTATES)
669 			addlog(" with state count %llu", states);
670 		if ((rpool->addr.type == PF_ADDR_TABLE &&
671 		    rpool->addr.p.tbl->pfrkt_refcntcost > 0) ||
672 		    (rpool->addr.type == PF_ADDR_DYNIFTL &&
673 		    rpool->addr.p.dyn->pfid_kt->pfrkt_refcntcost > 0))
674 			addlog(" with weight %u", weight);
675 		addlog("\n");
676 	}
677 
678 	return (0);
679 }
680 
681 int
pf_map_addr_states_increase(sa_family_t af,struct pf_pool * rpool,struct pf_addr * naddr)682 pf_map_addr_states_increase(sa_family_t af, struct pf_pool *rpool,
683     struct pf_addr *naddr)
684 {
685 	if (rpool->addr.type == PF_ADDR_TABLE) {
686 		if (pfr_states_increase(rpool->addr.p.tbl,
687 		    naddr, af) == -1) {
688 			if (pf_status.debug >= LOG_DEBUG) {
689 				log(LOG_DEBUG,
690 				    "pf: pf_map_addr_states_increase: "
691 				    "selected address ");
692 				pf_print_host(naddr, 0, af);
693 				addlog(". Failed to increase count!\n");
694 			}
695 			return (-1);
696 		}
697 	} else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
698 		if (pfr_states_increase(rpool->addr.p.dyn->pfid_kt,
699 		    naddr, af) == -1) {
700 			if (pf_status.debug >= LOG_DEBUG) {
701 				log(LOG_DEBUG,
702 				    "pf: pf_map_addr_states_increase: "
703 				    "selected address ");
704 				pf_print_host(naddr, 0, af);
705 				addlog(". Failed to increase count!\n");
706 			}
707 			return (-1);
708 		}
709 	}
710 	return (0);
711 }
712 
713 int
pf_get_transaddr(struct pf_rule * r,struct pf_pdesc * pd,struct pf_src_node ** sns,struct pf_rule ** nr)714 pf_get_transaddr(struct pf_rule *r, struct pf_pdesc *pd,
715     struct pf_src_node **sns, struct pf_rule **nr)
716 {
717 	struct pf_addr	naddr;
718 	u_int16_t	nport;
719 
720 #ifdef INET6
721 	if (pd->af != pd->naf)
722 		return (pf_get_transaddr_af(r, pd, sns));
723 #endif /* INET6 */
724 
725 	if (r->nat.addr.type != PF_ADDR_NONE) {
726 		/* XXX is this right? what if rtable is changed at the same
727 		 * XXX time? where do I need to figure out the sport? */
728 		nport = 0;
729 		if (pf_get_sport(pd, r, &naddr, &nport,
730 		    r->nat.proxy_port[0], r->nat.proxy_port[1], sns)) {
731 			DPFPRINTF(LOG_NOTICE,
732 			    "pf: NAT proxy port allocation (%u-%u) failed",
733 			    r->nat.proxy_port[0],
734 			    r->nat.proxy_port[1]);
735 			return (-1);
736 		}
737 		*nr = r;
738 		pf_addrcpy(&pd->nsaddr, &naddr, pd->af);
739 		pd->nsport = nport;
740 	}
741 	if (r->rdr.addr.type != PF_ADDR_NONE) {
742 		if (pf_map_addr(pd->af, r, &pd->nsaddr, &naddr, NULL, sns,
743 		    &r->rdr, PF_SN_RDR))
744 			return (-1);
745 		if ((r->rdr.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
746 			pf_poolmask(&naddr, &naddr,  &r->rdr.addr.v.a.mask,
747 			    &pd->ndaddr, pd->af);
748 
749 		nport = 0;
750 		if (r->rdr.proxy_port[1]) {
751 			u_int32_t	tmp_nport;
752 			u_int16_t	div;
753 
754 			div = r->rdr.proxy_port[1] - r->rdr.proxy_port[0] + 1;
755 			div = (div == 0) ? 1 : div;
756 
757 			tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % div) +
758 			    r->rdr.proxy_port[0];
759 
760 			/* wrap around if necessary */
761 			if (tmp_nport > 65535)
762 				tmp_nport -= 65535;
763 			nport = htons((u_int16_t)tmp_nport);
764 		} else if (r->rdr.proxy_port[0])
765 			nport = htons(r->rdr.proxy_port[0]);
766 		*nr = r;
767 		pf_addrcpy(&pd->ndaddr, &naddr, pd->af);
768 		if (nport)
769 			pd->ndport = nport;
770 	}
771 
772 	return (0);
773 }
774 
#ifdef INET6
/*
 * Address family translation (af-to) for the packet described by `pd'.
 *
 * The new source address and port always come from the rule's nat pool
 * (the function panics if there is none).  ICMP and ICMPv6 echo
 * request/reply types held in the port fields are converted to their
 * counterparts in the new family.  The new destination is derived either
 * from the rdr pool or, without one, from the original destination
 * address using a prefix length taken from the rule.
 *
 * On success pd->nsaddr, pd->nsport, pd->ndaddr (and possibly pd->ndport)
 * describe the translated packet and 0 is returned; -1 is returned when
 * no address or port could be obtained.
 */
int
pf_get_transaddr_af(struct pf_rule *r, struct pf_pdesc *pd,
    struct pf_src_node **sns)
{
	struct pf_addr	 ndaddr, nsaddr, naddr;
	u_int16_t	 nport, icmptype;
	int		 prefixlen = 96;

	if (pf_status.debug >= LOG_INFO) {
		log(LOG_INFO, "pf: af-to %s %s, ",
		    pd->naf == AF_INET ? "inet" : "inet6",
		    r->rdr.addr.type == PF_ADDR_NONE ? "nat" : "rdr");
		pf_print_host(&pd->nsaddr, pd->nsport, pd->af);
		addlog(" -> ");
		pf_print_host(&pd->ndaddr, pd->ndport, pd->af);
		addlog("\n");
	}

	if (r->nat.addr.type == PF_ADDR_NONE)
		panic("pf_get_transaddr_af: no nat pool for source address");

	/* get source address and port */
	nport = 0;
	if (pf_get_sport(pd, r, &nsaddr, &nport,
	    r->nat.proxy_port[0], r->nat.proxy_port[1], sns)) {
		DPFPRINTF(LOG_NOTICE,
		    "pf: af-to NAT proxy port allocation (%u-%u) failed",
		    r->nat.proxy_port[0],
		    r->nat.proxy_port[1]);
		return (-1);
	}
	pd->nsport = nport;

	/*
	 * Convert echo request/reply types between ICMP and ICMPv6.  The
	 * type is read from the destination port field for inbound packets
	 * and from the source port field otherwise.
	 */
	if ((pd->proto == IPPROTO_ICMPV6 && pd->naf == AF_INET) ||
	    (pd->proto == IPPROTO_ICMP && pd->naf == AF_INET6)) {
		if (pd->dir == PF_IN)
			icmptype = ntohs(pd->ndport);
		else
			icmptype = ntohs(pd->nsport);

		if (pd->proto == IPPROTO_ICMPV6) {
			switch (icmptype) {
			case ICMP6_ECHO_REQUEST:
				icmptype = ICMP_ECHO;
				break;
			case ICMP6_ECHO_REPLY:
				icmptype = ICMP_ECHOREPLY;
				break;
			default:
				break;
			}
		} else {
			switch (icmptype) {
			case ICMP_ECHO:
				icmptype = ICMP6_ECHO_REQUEST;
				break;
			case ICMP_ECHOREPLY:
				icmptype = ICMP6_ECHO_REPLY;
				break;
			default:
				break;
			}
		}

		if (pd->dir == PF_IN)
			pd->ndport = htons(icmptype);
		else
			pd->nsport = htons(icmptype);
	}

	/* get the destination address and port */
	if (r->rdr.addr.type != PF_ADDR_NONE) {
		if (pf_map_addr(pd->naf, r, &nsaddr, &naddr, NULL, sns,
		    &r->rdr, PF_SN_RDR))
			return (-1);
		if (r->rdr.proxy_port[0])
			pd->ndport = htons(r->rdr.proxy_port[0]);

		if (pd->naf == AF_INET) {
			/* The prefix is the IPv4 rdr address */
			prefixlen = in_mask2len((struct in_addr *)
			    &r->rdr.addr.v.a.mask);
			inet_nat46(pd->naf, &pd->ndaddr,
			    &ndaddr, &naddr, prefixlen);
		} else {
			/* The prefix is the IPv6 rdr address */
			prefixlen = in6_mask2len((struct in6_addr *)
			    &r->rdr.addr.v.a.mask, NULL);
			inet_nat64(pd->naf, &pd->ndaddr,
			    &ndaddr, &naddr, prefixlen);
		}
	} else if (pd->naf == AF_INET) {
		/* The prefix is the IPv6 dst address */
		prefixlen = in6_mask2len((struct in6_addr *)
		    &r->dst.addr.v.a.mask, NULL);
		if (prefixlen < 32)
			prefixlen = 96;
		inet_nat64(pd->naf, &pd->ndaddr,
		    &ndaddr, &pd->ndaddr, prefixlen);
	} else {
		/*
		 * The prefix is the IPv6 nat address
		 * (that was stored in pd->nsaddr)
		 */
		prefixlen = in6_mask2len((struct in6_addr *)
		    &r->nat.addr.v.a.mask, NULL);
		if (prefixlen > 96)
			prefixlen = 96;
		inet_nat64(pd->naf, &pd->ndaddr,
		    &ndaddr, &nsaddr, prefixlen);
	}

	/* publish the translated addresses */
	pf_addrcpy(&pd->nsaddr, &nsaddr, pd->naf);
	pf_addrcpy(&pd->ndaddr, &ndaddr, pd->naf);

	if (pf_status.debug >= LOG_INFO) {
		log(LOG_INFO, "pf: af-to %s %s done, prefixlen %d, ",
		    pd->naf == AF_INET ? "inet" : "inet6",
		    r->rdr.addr.type == PF_ADDR_NONE ? "nat" : "rdr",
		    prefixlen);
		pf_print_host(&pd->nsaddr, pd->nsport, pd->naf);
		addlog(" -> ");
		pf_print_host(&pd->ndaddr, pd->ndport, pd->naf);
		addlog("\n");
	}

	return (0);
}
#endif /* INET6 */
906 
907 int
pf_postprocess_addr(struct pf_state * cur)908 pf_postprocess_addr(struct pf_state *cur)
909 {
910 	struct pf_rule		*nr;
911 	struct pf_state_key	*sks;
912 	struct pf_pool		 rpool;
913 	struct pf_addr		 lookup_addr;
914 	int			 slbcount = -1;
915 
916 	nr = cur->natrule.ptr;
917 
918 	if (nr == NULL)
919 		return (0);
920 
921 	/* decrease counter */
922 
923 	sks = cur->key[PF_SK_STACK];
924 
925 	/* check for outgoing or ingoing balancing */
926 	if (nr->rt == PF_ROUTETO)
927 		lookup_addr = cur->rt_addr;
928 	else if (sks != NULL)
929 		lookup_addr = sks->addr[1];
930 	else {
931 		if (pf_status.debug >= LOG_DEBUG) {
932 			log(LOG_DEBUG, "pf: %s: unable to obtain address",
933 			    __func__);
934 		}
935 		return (1);
936 	}
937 
938 	/* check for appropriate pool */
939 	if (nr->rdr.addr.type != PF_ADDR_NONE)
940 		rpool = nr->rdr;
941 	else if (nr->nat.addr.type != PF_ADDR_NONE)
942 		rpool = nr->nat;
943 	else if (nr->route.addr.type != PF_ADDR_NONE)
944 		rpool = nr->route;
945 	else
946 		return (0);
947 
948 	if (((rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_LEASTSTATES))
949 		return (0);
950 
951 	if (rpool.addr.type == PF_ADDR_TABLE) {
952 		if ((slbcount = pfr_states_decrease(
953 		    rpool.addr.p.tbl,
954 		    &lookup_addr, sks->af)) == -1) {
955 			if (pf_status.debug >= LOG_DEBUG) {
956 				log(LOG_DEBUG, "pf: %s: selected address ",
957 				    __func__);
958 				pf_print_host(&lookup_addr,
959 				    sks->port[0], sks->af);
960 				addlog(". Failed to "
961 				    "decrease count!\n");
962 			}
963 			return (1);
964 		}
965 	} else if (rpool.addr.type == PF_ADDR_DYNIFTL) {
966 		if ((slbcount = pfr_states_decrease(
967 		    rpool.addr.p.dyn->pfid_kt,
968 		    &lookup_addr, sks->af)) == -1) {
969 			if (pf_status.debug >= LOG_DEBUG) {
970 				log(LOG_DEBUG, "pf: %s: selected address ",
971 				    __func__);
972 				pf_print_host(&lookup_addr,
973 				    sks->port[0], sks->af);
974 				addlog(". Failed to "
975 				    "decrease count!\n");
976 			}
977 			return (1);
978 		}
979 	}
980 	if (slbcount > -1) {
981 		if (pf_status.debug >= LOG_INFO) {
982 			log(LOG_INFO, "pf: %s: selected address ", __func__);
983 			pf_print_host(&lookup_addr, sks->port[0],
984 			    sks->af);
985 			addlog(" decreased state count to %u\n",
986 			    slbcount);
987 		}
988 	}
989 	return (0);
990 }
991