/*-
 * Copyright (c) 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by the 3am Software Foundry ("3am").  It was developed by Matt Thomas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/ip_flow.c,v 1.9.2.2 2001/11/04 17:35:31 luigi Exp $
 * $DragonFly: src/sys/netinet/ip_flow.c,v 1.27 2008/10/28 07:09:26 sephe Exp $
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/thread2.h>
#include <sys/in_cksum.h>

#include <machine/smp.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/netisr.h>
#include <net/netmsg2.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_flow.h>

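/*
 * Flow-table tuning.  ipf_timer is decremented once per IP slow timeout
 * (PR_SLOWHZ ticks per second), so IPFLOW_TIMER keeps an idle flow
 * cached for roughly five seconds after its last use.
 */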
#define	IPFLOW_TIMER		(5 * PR_SLOWHZ)
#define IPFLOW_HASHBITS		6	/* should not be a multiple of 8 */
#define	IPFLOW_HASHSIZE		(1 << IPFLOW_HASHBITS)
#define	IPFLOW_MAX		256

#define IPFLOW_RTENTRY_ISDOWN(rt) \
	(((rt)->rt_flags & RTF_UP) == 0 || \
	 ((rt)->rt_ifp->if_flags & IFF_UP) == 0)

struct netmsg_ipfaddr {
	struct netmsg	ipf_nmsg;
	struct in_addr	ipf_addr;
};

struct ipflow {
	LIST_ENTRY(ipflow) ipf_hash;	/* next ipflow in hash bucket */
	LIST_ENTRY(ipflow) ipf_list;	/* next ipflow in list */

	struct in_addr ipf_dst;		/* destination address */
	struct in_addr ipf_src;		/* source address */
	uint8_t ipf_tos;		/* type-of-service */

	uint8_t ipf_flags;		/* see IPFLOW_FLAG_ */
	uint8_t ipf_pad[2];		/* explicit pad */
	int ipf_refcnt;			/* reference count */

	struct route ipf_ro;		/* associated route entry */
	u_long ipf_uses;		/* number of uses in this period */

	int ipf_timer;			/* remaining lifetime of this entry */
	u_long ipf_dropped;		/* ENOBUFS returned by if_output */
	u_long ipf_errors;		/* other errors returned by if_output */
	u_long ipf_last_uses;		/* number of uses in last period */
};
LIST_HEAD(ipflowhead, ipflow);

#define IPFLOW_FLAG_ONLIST	0x1

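/*
 * All flow state is kept per-cpu and is only ever manipulated from its
 * owning cpu (timeouts and address events are dispatched to each cpu as
 * messages), so the aliases below resolve to the current cpu's copy and
 * no locking is required.
 */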
#define ipflow_inuse		ipflow_inuse_pcpu[mycpuid]
#define ipflowtable		ipflowtable_pcpu[mycpuid]
#define ipflowlist		ipflowlist_pcpu[mycpuid]

static struct ipflowhead	ipflowtable_pcpu[MAXCPU][IPFLOW_HASHSIZE];
static struct ipflowhead	ipflowlist_pcpu[MAXCPU];
static int			ipflow_inuse_pcpu[MAXCPU];
static struct netmsg		ipflow_timo_netmsgs[MAXCPU];
static int			ipflow_active = 0;

#define IPFLOW_REFCNT_INIT	1
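
/*
 * Each ipflow is created holding a single reference owned by the flow
 * table itself.  Additional references are taken only transiently, e.g.
 * across the potentially blocking if_output() in ipflow_fastforward(),
 * so a refcnt above IPFLOW_REFCNT_INIT marks a flow as actively in use.
 */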

/* ipflow is alive and active */
#define IPFLOW_IS_ACTIVE(ipf)	((ipf)->ipf_refcnt > IPFLOW_REFCNT_INIT)
/* ipflow is alive but not active */
#define IPFLOW_NOT_ACTIVE(ipf)	((ipf)->ipf_refcnt == IPFLOW_REFCNT_INIT)

#define IPFLOW_REF(ipf) \
do { \
	KKASSERT((ipf)->ipf_refcnt > 0); \
	(ipf)->ipf_refcnt++; \
} while (0)

#define IPFLOW_FREE(ipf) \
do { \
	KKASSERT((ipf)->ipf_refcnt > 0); \
	(ipf)->ipf_refcnt--; \
	if ((ipf)->ipf_refcnt == 0) \
		ipflow_free((ipf)); \
} while (0)

#define IPFLOW_INSERT(bucket, ipf) \
do { \
	KKASSERT(((ipf)->ipf_flags & IPFLOW_FLAG_ONLIST) == 0); \
	(ipf)->ipf_flags |= IPFLOW_FLAG_ONLIST; \
	LIST_INSERT_HEAD((bucket), (ipf), ipf_hash); \
	LIST_INSERT_HEAD(&ipflowlist, (ipf), ipf_list); \
} while (0)

#define IPFLOW_REMOVE(ipf) \
do { \
	KKASSERT((ipf)->ipf_flags & IPFLOW_FLAG_ONLIST); \
	(ipf)->ipf_flags &= ~IPFLOW_FLAG_ONLIST; \
	LIST_REMOVE((ipf), ipf_hash); \
	LIST_REMOVE((ipf), ipf_list); \
} while (0)

SYSCTL_NODE(_net_inet_ip, OID_AUTO, ipflow, CTLFLAG_RW, 0, "ip flow");
SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW,
	   &ipflow_active, 0, "Enable flow-based IP forwarding");

static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow");

static void	ipflow_free(struct ipflow *);

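/*
 * Hash a flow by folding the destination address, source address and TOS
 * into IPFLOW_HASHBITS-bit chunks.  Since IPFLOW_HASHBITS is deliberately
 * not a multiple of 8 (see above), successive chunks straddle octet
 * boundaries, mixing octets of addresses that differ only in a few bits.
 */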
static unsigned
ipflow_hash(struct in_addr dst, struct in_addr src, unsigned tos)
{
	unsigned hash = tos;
	int idx;

	for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS)
		hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx);
	return hash & (IPFLOW_HASHSIZE-1);
}

static struct ipflow *
ipflow_lookup(const struct ip *ip)
{
	unsigned hash;
	struct ipflow *ipf;

	hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos);
	LIST_FOREACH(ipf, &ipflowtable[hash], ipf_hash) {
		if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr &&
		    ip->ip_src.s_addr == ipf->ipf_src.s_addr &&
		    ip->ip_tos == ipf->ipf_tos)
			break;
	}
	return ipf;
}

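/*
 * Try to forward a packet using a cached flow, bypassing the normal
 * ip_input()/ip_forward() path.  Returns 1 if the packet was consumed
 * (handed to if_output(), whether or not that succeeded), 0 if no
 * usable flow exists and the caller must take the slow path.
 */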
int
ipflow_fastforward(struct mbuf *m)
{
	struct ip *ip;
	struct ipflow *ipf;
	struct rtentry *rt;
	struct sockaddr *dst;
	struct ifnet *ifp;
	int error, iplen;

	/*
	 * Are we forwarding packets?
	 */
	if (!ipforwarding || !ipflow_active)
		return 0;

	/*
	 * Was packet received as a link-level multicast or broadcast?
	 * If so, don't try to fast forward.
	 */
	if (m->m_flags & (M_BCAST | M_MCAST))
		return 0;

	/* length checks already done in ip_mport() */
	KASSERT(m->m_len >= sizeof(struct ip), ("IP header not in one mbuf"));
	ip = mtod(m, struct ip *);

	/*
	 * IP header with no options and valid version
	 */
	if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2))
		return 0;

	iplen = ntohs(ip->ip_len);
	/* length checks already done in ip_mport() */
	KASSERT(iplen >= sizeof(struct ip),
		("total length less than header length"));
	KASSERT(m->m_pkthdr.len >= iplen, ("mbuf too short"));

	/*
	 * Find a flow.
	 */
	ipf = ipflow_lookup(ip);
	if (ipf == NULL)
		return 0;

	/*
	 * Verify the IP header checksum.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
		if (!(m->m_pkthdr.csum_flags & CSUM_IP_VALID))
			return 0;
	} else {
		/* Must compute it ourselves. */
		if (in_cksum_hdr(ip) != 0)
			return 0;
	}

	/*
	 * Route and interface still up?
	 */
	rt = ipf->ipf_ro.ro_rt;
	if (IPFLOW_RTENTRY_ISDOWN(rt))
		return 0;
	ifp = rt->rt_ifp;

	/*
	 * Packet size OK?  TTL?
	 */
	if (m->m_pkthdr.len > ifp->if_mtu || ip->ip_ttl <= IPTTLDEC)
		return 0;

	/*
	 * Clear any in-bound checksum flags for this packet.
	 */
	m->m_pkthdr.csum_flags = 0;

	/*
	 * Everything checks out and so we can forward this packet.
	 * Modify the TTL and incrementally change the checksum.
	 *
	 * This method of adding the checksum works on either endian CPU.
	 * If htons() is inlined, all the arithmetic is folded; otherwise
	 * the htons()s are combined by CSE due to the __const__ attribute.
	 *
	 * Don't bother using HW checksumming here -- the incremental
	 * update is pretty fast.
	 */
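	/*
	 * The TTL occupies the high-order byte of a checksummed 16-bit
	 * word, so decrementing it by IPTTLDEC lowers that word by
	 * htons(IPTTLDEC << 8).  The one's-complement checksum must rise
	 * by the same amount, with an end-around carry: where adding
	 * htons(IPTTLDEC << 8) to ip_sum would wrap past 16 bits, adding
	 * the carry is equivalent to subtracting ~htons(IPTTLDEC << 8).
	 */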
	ip->ip_ttl -= IPTTLDEC;
	if (ip->ip_sum >= (uint16_t)~htons(IPTTLDEC << 8))
		ip->ip_sum -= ~htons(IPTTLDEC << 8);
	else
		ip->ip_sum += htons(IPTTLDEC << 8);

	/*
	 * Trim the packet in case it's too long.
	 */
	if (m->m_pkthdr.len > iplen) {
		if (m->m_len == m->m_pkthdr.len) {
			m->m_len = iplen;
			m->m_pkthdr.len = iplen;
		} else {
			m_adj(m, iplen - m->m_pkthdr.len);
		}
	}

	/*
	 * Send the packet on its way; ENOBUFS and other if_output()
	 * errors are tallied separately below.
	 */
	ipf->ipf_uses++;
	ipf->ipf_timer = IPFLOW_TIMER;

	if (rt->rt_flags & RTF_GATEWAY)
		dst = rt->rt_gateway;
	else
		dst = &ipf->ipf_ro.ro_dst;

	/*
	 * Reference count this ipflow, before the possible blocking
	 * ifnet.if_output(), so this ipflow will not be changed or
	 * reaped behind our back.
	 */
	IPFLOW_REF(ipf);

	error = ifp->if_output(ifp, m, dst, rt);
	if (error) {
		if (error == ENOBUFS)
			ipf->ipf_dropped++;
		else
			ipf->ipf_errors++;
	}

	IPFLOW_FREE(ipf);
	return 1;
}

static void
ipflow_addstats(struct ipflow *ipf)
{
	ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses;
	ipstat.ips_cantforward += ipf->ipf_errors + ipf->ipf_dropped;
	ipstat.ips_total += ipf->ipf_uses;
	ipstat.ips_forward += ipf->ipf_uses;
	ipstat.ips_fastforward += ipf->ipf_uses;
}

static void
ipflow_free(struct ipflow *ipf)
{
	KKASSERT(ipf->ipf_refcnt == 0);
	KKASSERT((ipf->ipf_flags & IPFLOW_FLAG_ONLIST) == 0);

	KKASSERT(ipflow_inuse > 0);
	ipflow_inuse--;

	ipflow_addstats(ipf);
	RTFREE(ipf->ipf_ro.ro_rt);
	kfree(ipf, M_IPFLOW);
}

static void
ipflow_reset(struct ipflow *ipf)
{
	ipflow_addstats(ipf);
	RTFREE(ipf->ipf_ro.ro_rt);
	ipf->ipf_uses = ipf->ipf_last_uses = 0;
	ipf->ipf_errors = ipf->ipf_dropped = 0;
}

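/*
 * Reclaim one flow for reuse.  Prefer a victim whose cached route has
 * gone down; otherwise evict the inactive flow with the least remaining
 * lifetime, breaking ties by the fewest uses over the current and
 * previous periods.
 */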
static struct ipflow *
ipflow_reap(void)
{
	struct ipflow *ipf, *maybe_ipf = NULL;

	LIST_FOREACH(ipf, &ipflowlist, ipf_list) {
		/*
		 * Skip actively used ipflow
		 */
		if (IPFLOW_IS_ACTIVE(ipf))
			continue;

		/*
		 * If this no longer points to a valid route
		 * reclaim it.
		 */
		if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0)
			goto done;

		/*
		 * Choose the one that's been least recently used
		 * or has had the least uses in the last 1.5
		 * intervals.
		 */
		if (maybe_ipf == NULL ||
		    ipf->ipf_timer < maybe_ipf->ipf_timer ||
		    (ipf->ipf_timer == maybe_ipf->ipf_timer &&
		     ipf->ipf_last_uses + ipf->ipf_uses <
		     maybe_ipf->ipf_last_uses + maybe_ipf->ipf_uses))
			maybe_ipf = ipf;
	}
	if (maybe_ipf == NULL)
		return NULL;

	ipf = maybe_ipf;
done:
	/*
	 * Remove the entry from the flow table and reset its state.
	 */
	IPFLOW_REMOVE(ipf);
	ipflow_reset(ipf);
	return ipf;
}

static void
ipflow_timo_dispatch(struct netmsg *nmsg)
{
	struct ipflow *ipf, *next_ipf;

	crit_enter();
	lwkt_replymsg(&nmsg->nm_lmsg, 0);	/* reply ASAP */
	crit_exit();

	LIST_FOREACH_MUTABLE(ipf, &ipflowlist, ipf_list, next_ipf) {
		if (--ipf->ipf_timer == 0) {
			IPFLOW_REMOVE(ipf);
			IPFLOW_FREE(ipf);
		} else {
			ipf->ipf_last_uses = ipf->ipf_uses;
			ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses;
			ipstat.ips_total += ipf->ipf_uses;
			ipstat.ips_forward += ipf->ipf_uses;
			ipstat.ips_fastforward += ipf->ipf_uses;
			ipf->ipf_uses = 0;
		}
	}
}

static void
ipflow_timo_ipi(void *arg __unused)
{
	struct lwkt_msg *msg = &ipflow_timo_netmsgs[mycpuid].nm_lmsg;

	crit_enter();
	if (msg->ms_flags & MSGF_DONE)
		lwkt_sendmsg(cpu_portfn(mycpuid), msg);
	crit_exit();
}

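/*
 * Called from the IP slow timeout.  Build a mask of the cpus that own
 * at least one flow and IPI each of them; every cpu then ages its own
 * per-cpu table in ipflow_timo_dispatch(), so the table is only ever
 * touched by its owning cpu.
 */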
void
ipflow_slowtimo(void)
{
#ifdef SMP
	uint32_t mask = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		if (ipflow_inuse_pcpu[i])
			mask |= 1 << i;
	}
	mask &= smp_active_mask;
	if (mask != 0)
		lwkt_send_ipiq_mask(mask, ipflow_timo_ipi, NULL);
#else
	if (ipflow_inuse)
		ipflow_timo_ipi(NULL);
#endif
}

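/*
 * Cache the route just used to forward a packet, so that subsequent
 * packets of the same flow can be handled by ipflow_fastforward().
 * Called once the slow forwarding path has resolved a route for a
 * packet; runs on the cpu that owns the flow table being updated.
 */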
void
ipflow_create(const struct route *ro, struct mbuf *m)
{
	const struct ip *const ip = mtod(m, struct ip *);
	struct ipflow *ipf;
	unsigned hash;

	/*
	 * Don't create cache entries for ICMP messages.
	 */
	if (!ipflow_active || ip->ip_p == IPPROTO_ICMP)
		return;

	/*
	 * See if an existing flow struct exists.  If so remove it from its
	 * list and free the old route.  If not, try to malloc a new one
	 * (if we aren't at our limit).
	 */
	ipf = ipflow_lookup(ip);
	if (ipf == NULL) {
		if (ipflow_inuse == IPFLOW_MAX) {
			ipf = ipflow_reap();
			if (ipf == NULL)
				return;
		} else {
			ipf = kmalloc(sizeof(*ipf), M_IPFLOW,
				      M_NOWAIT | M_ZERO);
			if (ipf == NULL)
				return;
			ipf->ipf_refcnt = IPFLOW_REFCNT_INIT;

			ipflow_inuse++;
		}
	} else {
		if (IPFLOW_NOT_ACTIVE(ipf)) {
			IPFLOW_REMOVE(ipf);
			ipflow_reset(ipf);
		} else {
			/* This ipflow is being used; don't change it */
			KKASSERT(IPFLOW_IS_ACTIVE(ipf));
			return;
		}
	}
	/* This ipflow should not be actively used */
	KKASSERT(IPFLOW_NOT_ACTIVE(ipf));

	/*
	 * Fill in the updated information.
	 */
	ipf->ipf_ro = *ro;
	ro->ro_rt->rt_refcnt++;
	ipf->ipf_dst = ip->ip_dst;
	ipf->ipf_src = ip->ip_src;
	ipf->ipf_tos = ip->ip_tos;
	ipf->ipf_timer = IPFLOW_TIMER;

	/*
	 * Insert into the appropriate bucket of the flow table.
	 */
	hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos);
	IPFLOW_INSERT(&ipflowtable[hash], ipf);
}

void
ipflow_flush_oncpu(void)
{
	struct ipflow *ipf;

	while ((ipf = LIST_FIRST(&ipflowlist)) != NULL) {
		IPFLOW_REMOVE(ipf);
		IPFLOW_FREE(ipf);
	}
}

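/*
 * Per-cpu handler for interface address events: flush any flows whose
 * source or destination matches the updated address, then forward the
 * message to the next cpu in the chain.
 */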
static void
ipflow_ifaddr_handler(struct netmsg *nmsg)
{
	struct netmsg_ipfaddr *amsg = (struct netmsg_ipfaddr *)nmsg;
	struct ipflow *ipf, *next_ipf;

	LIST_FOREACH_MUTABLE(ipf, &ipflowlist, ipf_list, next_ipf) {
		if (ipf->ipf_dst.s_addr == amsg->ipf_addr.s_addr ||
		    ipf->ipf_src.s_addr == amsg->ipf_addr.s_addr) {
			IPFLOW_REMOVE(ipf);
			IPFLOW_FREE(ipf);
		}
	}
	ifnet_forwardmsg(&nmsg->nm_lmsg, mycpuid + 1);
}

static void
ipflow_ifaddr(void *arg __unused, struct ifnet *ifp __unused,
	      enum ifaddr_event event, struct ifaddr *ifa)
{
	struct netmsg_ipfaddr amsg;

	if (ifa->ifa_addr->sa_family != AF_INET)
		return;

	/* Only add/change events need to be handled */
	switch (event) {
	case IFADDR_EVENT_ADD:
	case IFADDR_EVENT_CHANGE:
		break;

	case IFADDR_EVENT_DELETE:
		return;
	}

	netmsg_init(&amsg.ipf_nmsg, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipflow_ifaddr_handler);
	amsg.ipf_addr = ifatoia(ifa)->ia_addr.sin_addr;

	ifnet_domsg(&amsg.ipf_nmsg.nm_lmsg, 0);
}

static void
ipflow_init(void)
{
	char oid_name[32];
	int i;

	for (i = 0; i < ncpus; ++i) {
		netmsg_init(&ipflow_timo_netmsgs[i], NULL, &netisr_adone_rport,
			    MSGF_MPSAFE, ipflow_timo_dispatch);

		ksnprintf(oid_name, sizeof(oid_name), "inuse%d", i);

		SYSCTL_ADD_INT(NULL,
		SYSCTL_STATIC_CHILDREN(_net_inet_ip_ipflow),
		OID_AUTO, oid_name, CTLFLAG_RD, &ipflow_inuse_pcpu[i], 0,
		"# of IP flows currently in use");
	}
	EVENTHANDLER_REGISTER(ifaddr_event, ipflow_ifaddr, NULL,
			      EVENTHANDLER_PRI_ANY);
}
SYSINIT(ipflow, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipflow_init, 0);
589