/*	$NetBSD: ip_flow.c,v 1.75 2016/07/27 04:23:42 knakahara Exp $	*/

/*-
 * Copyright (c) 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by the 3am Software Foundry ("3am").  It was developed by Matt Thomas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_flow.c,v 1.75 2016/07/27 04:23:42 knakahara Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>
#include <sys/atomic.h>

#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/pfil.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_private.h>

/*
 * Similar code is very well commented in netinet6/ip6_flow.c
 */

#define	IPFLOW_HASHBITS		6	/* should not be a multiple of 8 */

static struct pool ipflow_pool;

LIST_HEAD(ipflowhead, ipflow);

#define	IPFLOW_TIMER		(5 * PR_SLOWHZ)
#define	IPFLOW_DEFAULT_HASHSIZE	(1 << IPFLOW_HASHBITS)
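/*
 * Editor's note: IPFLOW_TIMER is the flow lifetime expressed in protocol
 * slow-timeout ticks (PR_SLOWHZ ticks per second), i.e. a flow is
 * reclaimed roughly five seconds after the last packet that used it;
 * ipflow_fastforward() and ipflow_create() re-arm the timer on each use.
 */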

/*
 * ip_flow.c internal lock.
 * Using softnet_lock here would cause a recursive lock.
 *
 * This is a tentative workaround.
 * We should make it scalable somehow in the future.
 */
static kmutex_t ipflow_lock;
static struct ipflowhead *ipflowtable = NULL;
static struct ipflowhead ipflowlist;
static int ipflow_inuse;

#define	IPFLOW_INSERT(bucket, ipf) \
do { \
        LIST_INSERT_HEAD((bucket), (ipf), ipf_hash); \
        LIST_INSERT_HEAD(&ipflowlist, (ipf), ipf_list); \
} while (/*CONSTCOND*/ 0)

#define	IPFLOW_REMOVE(ipf) \
do { \
        LIST_REMOVE((ipf), ipf_hash); \
        LIST_REMOVE((ipf), ipf_list); \
} while (/*CONSTCOND*/ 0)
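/*
 * Each ipflow entry is linked onto two lists at once: the hash bucket
 * chain (ipf_hash) used for lookups and the global ipflowlist (ipf_list)
 * used for timeout scanning and reaping.  IPFLOW_INSERT/IPFLOW_REMOVE
 * keep the two memberships in sync.
 */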

#ifndef IPFLOW_MAX
#define	IPFLOW_MAX		256
#endif
static int ip_maxflows = IPFLOW_MAX;
static int ip_hashsize = IPFLOW_DEFAULT_HASHSIZE;

static struct ipflow *ipflow_reap(bool);
static void ipflow_sysctl_init(struct sysctllog **);

static void ipflow_slowtimo_work(struct work *, void *);
static struct workqueue	*ipflow_slowtimo_wq;
static struct work	ipflow_slowtimo_wk;

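/*
 * Hash a packet's (source, destination, TOS) triple into a bucket index.
 * The addresses are folded in IPFLOW_HASHBITS-wide strides; keeping the
 * stride from being a multiple of 8 (see IPFLOW_HASHBITS above) presumably
 * keeps the same address octets from repeatedly folding onto each other.
 */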
static size_t
ipflow_hash(const struct ip *ip)
{
        size_t hash = ip->ip_tos;
        size_t idx;

        for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) {
                hash += (ip->ip_dst.s_addr >> (32 - idx)) +
                    (ip->ip_src.s_addr >> idx);
        }

        return hash & (ip_hashsize-1);
}

static struct ipflow *
ipflow_lookup(const struct ip *ip)
{
        size_t hash;
        struct ipflow *ipf;

        KASSERT(mutex_owned(&ipflow_lock));

        hash = ipflow_hash(ip);

        LIST_FOREACH(ipf, &ipflowtable[hash], ipf_hash) {
                if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr
                    && ip->ip_src.s_addr == ipf->ipf_src.s_addr
                    && ip->ip_tos == ipf->ipf_tos)
                        break;
        }
        return ipf;
}

void
ipflow_poolinit(void)
{

        pool_init(&ipflow_pool, sizeof(struct ipflow), 0, 0, 0, "ipflowpl",
            NULL, IPL_NET);
}

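/*
 * (Re)allocate the hash table with table_size buckets and reset the
 * global flow list.  Returns non-zero if the allocation fails, in which
 * case the old table is left in place.  Callers that may already hold
 * live flows are expected to free them first (see ipflow_invalidate_all()),
 * since reinitializing the lists would otherwise orphan those entries.
 */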
static int
ipflow_reinit(int table_size)
{
        struct ipflowhead *new_table;
        size_t i;

        KASSERT(mutex_owned(&ipflow_lock));

        new_table = (struct ipflowhead *)malloc(sizeof(struct ipflowhead) *
            table_size, M_RTABLE, M_NOWAIT);

        if (new_table == NULL)
                return 1;

        if (ipflowtable != NULL)
                free(ipflowtable, M_RTABLE);

        ipflowtable = new_table;
        ip_hashsize = table_size;

        LIST_INIT(&ipflowlist);
        for (i = 0; i < ip_hashsize; i++)
                LIST_INIT(&ipflowtable[i]);

        return 0;
}

void
ipflow_init(void)
{
        int error;

        error = workqueue_create(&ipflow_slowtimo_wq, "ipflow_slowtimo",
            ipflow_slowtimo_work, NULL, PRI_SOFTNET, IPL_SOFTNET, WQ_MPSAFE);
        if (error != 0)
                panic("%s: workqueue_create failed (%d)\n", __func__, error);

        mutex_init(&ipflow_lock, MUTEX_DEFAULT, IPL_NONE);

        mutex_enter(&ipflow_lock);
        (void)ipflow_reinit(ip_hashsize);
        mutex_exit(&ipflow_lock);
        ipflow_sysctl_init(NULL);
}

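/*
 * Attempt to forward the packet using a cached flow.  Returns 1 if the
 * packet has been consumed (forwarded, or dropped while trying), and 0
 * if no flow matched or a sanity check failed, in which case the caller
 * continues with the normal (slow) IP input/forwarding path.
 */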
int
ipflow_fastforward(struct mbuf *m)
{
        struct ip *ip;
        struct ip ip_store;
        struct ipflow *ipf;
        struct rtentry *rt;
        const struct sockaddr *dst;
        int error;
        int iplen;
        struct ifnet *ifp;
        int s;
        int ret = 0;

        mutex_enter(&ipflow_lock);
        /*
         * Are we forwarding packets?  Big enough for an IP packet?
         */
        if (!ipforwarding || ipflow_inuse == 0 || m->m_len < sizeof(struct ip))
                goto out;

        /*
         * Was packet received as a link-level multicast or broadcast?
         * If so, don't try to fast forward.
         */
        if ((m->m_flags & (M_BCAST|M_MCAST)) != 0)
                goto out;

        /*
         * IP header with no option and valid version and length
         */
        if (IP_HDR_ALIGNED_P(mtod(m, const void *)))
                ip = mtod(m, struct ip *);
        else {
                memcpy(&ip_store, mtod(m, const void *), sizeof(ip_store));
                ip = &ip_store;
        }
        iplen = ntohs(ip->ip_len);
        if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) ||
            iplen < sizeof(struct ip) || iplen > m->m_pkthdr.len)
                goto out;
        /*
         * Find a flow.
         */
        if ((ipf = ipflow_lookup(ip)) == NULL)
                goto out;

        ifp = m_get_rcvif(m, &s);
        /*
         * Verify the IP header checksum.
         */
        switch (m->m_pkthdr.csum_flags &
                ((ifp->if_csum_flags_rx & M_CSUM_IPv4) |
                 M_CSUM_IPv4_BAD)) {
        case M_CSUM_IPv4|M_CSUM_IPv4_BAD:
                m_put_rcvif(ifp, &s);
                goto out;

        case M_CSUM_IPv4:
                /* Checksum was okay. */
                break;

        default:
                /* Must compute it ourselves. */
                if (in_cksum(m, sizeof(struct ip)) != 0) {
                        m_put_rcvif(ifp, &s);
                        goto out;
                }
                break;
        }
        m_put_rcvif(ifp, &s);

        /*
         * Route and interface still up?
         */
        if ((rt = rtcache_validate(&ipf->ipf_ro)) == NULL ||
            (rt->rt_ifp->if_flags & IFF_UP) == 0 ||
            (rt->rt_flags & (RTF_BLACKHOLE | RTF_BROADCAST)) != 0)
                goto out;

        /*
         * Packet size OK?  TTL?
         */
        if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC)
                goto out;

        /*
         * Clear any in-bound checksum flags for this packet.
         */
        m->m_pkthdr.csum_flags = 0;

        /*
         * Everything checks out and so we can forward this packet.
         * Modify the TTL and incrementally change the checksum.
         *
         * This method of adding the checksum works on either endian CPU.
         * If htons() is inlined, all the arithmetic is folded; otherwise
         * the htons()s are combined by CSE due to the const attribute.
         *
         * Don't bother using HW checksumming here -- the incremental
         * update is pretty fast.
         */
        ip->ip_ttl -= IPTTLDEC;
        if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8))
                ip->ip_sum -= ~htons(IPTTLDEC << 8);
        else
                ip->ip_sum += htons(IPTTLDEC << 8);
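        /*
         * Editor's note on the arithmetic above: decrementing the TTL by
         * IPTTLDEC lowers the 16-bit header word that holds TTL/protocol
         * by htons(IPTTLDEC << 8), so the stored checksum must grow by the
         * same amount in one's-complement arithmetic.  The if/else performs
         * exactly that addition with the end-around carry folded in: when
         * ip_sum + htons(IPTTLDEC << 8) would overflow 16 bits (i.e.
         * ip_sum >= ~htons(IPTTLDEC << 8)), subtracting the complement is
         * equivalent to adding and wrapping the carry.  This is essentially
         * the incremental checksum update described in RFC 1141/1624.
         */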

        /*
         * Done modifying the header; copy it back, if necessary.
         *
         * XXX Use m_copyback_cow(9) here? --dyoung
         */
        if (IP_HDR_ALIGNED_P(mtod(m, void *)) == 0)
                memcpy(mtod(m, void *), &ip_store, sizeof(ip_store));

        /*
         * Trim the packet in case it's too long.
         */
        if (m->m_pkthdr.len > iplen) {
                if (m->m_len == m->m_pkthdr.len) {
                        m->m_len = iplen;
                        m->m_pkthdr.len = iplen;
                } else
                        m_adj(m, iplen - m->m_pkthdr.len);
        }

        /*
         * Send the packet on its way.  All we can get back is ENOBUFS.
         */
        ipf->ipf_uses++;
        PRT_SLOW_ARM(ipf->ipf_timer, IPFLOW_TIMER);

        if (rt->rt_flags & RTF_GATEWAY)
                dst = rt->rt_gateway;
        else
                dst = rtcache_getdst(&ipf->ipf_ro);

        if ((error = if_output_lock(rt->rt_ifp, rt->rt_ifp, m, dst, rt)) != 0) {
                if (error == ENOBUFS)
                        ipf->ipf_dropped++;
                else
                        ipf->ipf_errors++;
        }
        ret = 1;
 out:
        mutex_exit(&ipflow_lock);
        return ret;
}

static void
ipflow_addstats(struct ipflow *ipf)
{
        struct rtentry *rt;
        uint64_t *ips;

        if ((rt = rtcache_validate(&ipf->ipf_ro)) != NULL)
                rt->rt_use += ipf->ipf_uses;

        ips = IP_STAT_GETREF();
        ips[IP_STAT_CANTFORWARD] += ipf->ipf_errors + ipf->ipf_dropped;
        ips[IP_STAT_TOTAL] += ipf->ipf_uses;
        ips[IP_STAT_FORWARD] += ipf->ipf_uses;
        ips[IP_STAT_FASTFORWARD] += ipf->ipf_uses;
        IP_STAT_PUTREF();
}

static void
ipflow_free(struct ipflow *ipf)
{

        KASSERT(mutex_owned(&ipflow_lock));

        /*
         * Remove the flow from the hash table (at elevated IPL).
         * Once it's off the list, we can deal with it at normal
         * network IPL.
         */
        IPFLOW_REMOVE(ipf);

        ipflow_addstats(ipf);
        rtcache_free(&ipf->ipf_ro);
        ipflow_inuse--;
        pool_put(&ipflow_pool, ipf);
}

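/*
 * Reclaim flow entries.  With just_one == true the single best victim
 * (an entry whose cached route is no longer valid, otherwise the least
 * recently used / least busy entry) is unlinked and returned to the
 * caller for reuse; otherwise entries are reaped and freed until the
 * total drops back under ip_maxflows.
 */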
static struct ipflow *
ipflow_reap(bool just_one)
{

        KASSERT(mutex_owned(&ipflow_lock));

        while (just_one || ipflow_inuse > ip_maxflows) {
                struct ipflow *ipf, *maybe_ipf = NULL;

                ipf = LIST_FIRST(&ipflowlist);
                while (ipf != NULL) {
                        /*
                         * If this no longer points to a valid route
                         * reclaim it.
                         */
                        if (rtcache_validate(&ipf->ipf_ro) == NULL)
                                goto done;
                        /*
                         * choose the one that's been least recently
                         * used or has had the least uses in the
                         * last 1.5 intervals.
                         */
                        if (maybe_ipf == NULL ||
                            ipf->ipf_timer < maybe_ipf->ipf_timer ||
                            (ipf->ipf_timer == maybe_ipf->ipf_timer &&
                             ipf->ipf_last_uses + ipf->ipf_uses <
                                 maybe_ipf->ipf_last_uses +
                                 maybe_ipf->ipf_uses))
                                maybe_ipf = ipf;
                        ipf = LIST_NEXT(ipf, ipf_list);
                }
                ipf = maybe_ipf;
            done:
                /*
                 * Remove the entry from the flow table.
                 */
                IPFLOW_REMOVE(ipf);

                ipflow_addstats(ipf);
                rtcache_free(&ipf->ipf_ro);
                if (just_one)
                        return ipf;
                pool_put(&ipflow_pool, ipf);
                ipflow_inuse--;
        }
        return NULL;
}

static unsigned int ipflow_work_enqueued = 0;

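/*
 * Per-slow-timeout housekeeping, run from a workqueue: free flows whose
 * timer has expired or whose cached route is no longer valid, and fold
 * the per-flow use counters into the route and global IP statistics.
 */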
static void
ipflow_slowtimo_work(struct work *wk, void *arg)
{
        struct rtentry *rt;
        struct ipflow *ipf, *next_ipf;
        uint64_t *ips;

        /* We can allow enqueuing another work at this point */
        atomic_swap_uint(&ipflow_work_enqueued, 0);

        mutex_enter(softnet_lock);
        mutex_enter(&ipflow_lock);
        KERNEL_LOCK(1, NULL);
        for (ipf = LIST_FIRST(&ipflowlist); ipf != NULL; ipf = next_ipf) {
                next_ipf = LIST_NEXT(ipf, ipf_list);
                if (PRT_SLOW_ISEXPIRED(ipf->ipf_timer) ||
                    (rt = rtcache_validate(&ipf->ipf_ro)) == NULL) {
                        ipflow_free(ipf);
                } else {
                        ipf->ipf_last_uses = ipf->ipf_uses;
                        rt->rt_use += ipf->ipf_uses;
                        ips = IP_STAT_GETREF();
                        ips[IP_STAT_TOTAL] += ipf->ipf_uses;
                        ips[IP_STAT_FORWARD] += ipf->ipf_uses;
                        ips[IP_STAT_FASTFORWARD] += ipf->ipf_uses;
                        IP_STAT_PUTREF();
                        ipf->ipf_uses = 0;
                }
        }
        KERNEL_UNLOCK_ONE(NULL);
        mutex_exit(&ipflow_lock);
        mutex_exit(softnet_lock);
}

void
ipflow_slowtimo(void)
{

        /* Avoid enqueuing another work when one is already enqueued */
        if (atomic_swap_uint(&ipflow_work_enqueued, 1) == 1)
                return;

        workqueue_enqueue(ipflow_slowtimo_wq, &ipflow_slowtimo_wk, NULL);
}

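/*
 * Record (or refresh) a flow for the given route and packet so that
 * subsequent packets with the same source, destination and TOS can be
 * handled by ipflow_fastforward().
 */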
void
ipflow_create(const struct route *ro, struct mbuf *m)
{
        const struct ip *const ip = mtod(m, const struct ip *);
        struct ipflow *ipf;
        size_t hash;

        mutex_enter(&ipflow_lock);

        /*
         * Don't create cache entries for ICMP messages.
         */
        if (ip_maxflows == 0 || ip->ip_p == IPPROTO_ICMP) {
                mutex_exit(&ipflow_lock);
                return;
        }

        KERNEL_LOCK(1, NULL);

        /*
         * See if an existing flow struct exists.  If so remove it from its
         * list and free the old route.  If not, try to malloc a new one
         * (if we aren't at our limit).
         */
        ipf = ipflow_lookup(ip);
        if (ipf == NULL) {
                if (ipflow_inuse >= ip_maxflows) {
                        ipf = ipflow_reap(true);
                } else {
                        ipf = pool_get(&ipflow_pool, PR_NOWAIT);
                        if (ipf == NULL)
                                goto out;
                        ipflow_inuse++;
                }
                memset(ipf, 0, sizeof(*ipf));
        } else {
                IPFLOW_REMOVE(ipf);

                ipflow_addstats(ipf);
                rtcache_free(&ipf->ipf_ro);
                ipf->ipf_uses = ipf->ipf_last_uses = 0;
                ipf->ipf_errors = ipf->ipf_dropped = 0;
        }

        /*
         * Fill in the updated information.
         */
        rtcache_copy(&ipf->ipf_ro, ro);
        ipf->ipf_dst = ip->ip_dst;
        ipf->ipf_src = ip->ip_src;
        ipf->ipf_tos = ip->ip_tos;
        PRT_SLOW_ARM(ipf->ipf_timer, IPFLOW_TIMER);

        /*
         * Insert into the appropriate bucket of the flow table.
         */
        hash = ipflow_hash(ip);
        IPFLOW_INSERT(&ipflowtable[hash], ipf);

 out:
        KERNEL_UNLOCK_ONE(NULL);
        mutex_exit(&ipflow_lock);
}

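/*
 * Throw away every cached flow.  If new_size is non-zero the hash table
 * is also resized to new_size buckets via ipflow_reinit(); a size of 0
 * just flushes the flows and keeps the current table.
 */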
int
ipflow_invalidate_all(int new_size)
{
        struct ipflow *ipf, *next_ipf;
        int error;

        error = 0;

        mutex_enter(&ipflow_lock);

        for (ipf = LIST_FIRST(&ipflowlist); ipf != NULL; ipf = next_ipf) {
                next_ipf = LIST_NEXT(ipf, ipf_list);
                ipflow_free(ipf);
        }

        if (new_size)
                error = ipflow_reinit(new_size);

        mutex_exit(&ipflow_lock);

        return error;
}

/*
 * sysctl helper routine for net.inet.ip.maxflows.  Besides storing the
 * new limit, immediately reap any flows in excess of it.
 */
static int
sysctl_net_inet_ip_maxflows(SYSCTLFN_ARGS)
{
        int error;

        error = sysctl_lookup(SYSCTLFN_CALL(rnode));
        if (error || newp == NULL)
                return (error);

        mutex_enter(softnet_lock);
        mutex_enter(&ipflow_lock);
        KERNEL_LOCK(1, NULL);

        ipflow_reap(false);

        KERNEL_UNLOCK_ONE(NULL);
        mutex_exit(&ipflow_lock);
        mutex_exit(softnet_lock);

        return (0);
}

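/*
 * sysctl helper routine for net.inet.ip.hashsize.  The new size must be
 * a non-zero power of two; on success all existing flows are flushed and
 * the hash table is reallocated at the new size.
 */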
static int
sysctl_net_inet_ip_hashsize(SYSCTLFN_ARGS)
{
        int error, tmp;
        struct sysctlnode node;

        node = *rnode;
        tmp = ip_hashsize;
        node.sysctl_data = &tmp;
        error = sysctl_lookup(SYSCTLFN_CALL(&node));
        if (error || newp == NULL)
                return (error);

        if ((tmp & (tmp - 1)) == 0 && tmp != 0) {
                /*
                 * Can only fail due to malloc()
                 */
                mutex_enter(softnet_lock);
                KERNEL_LOCK(1, NULL);

                error = ipflow_invalidate_all(tmp);

                KERNEL_UNLOCK_ONE(NULL);
                mutex_exit(softnet_lock);

        } else {
                /*
                 * EINVAL if not a power of 2
                 */
                error = EINVAL;
        }

        return error;
}

static void
ipflow_sysctl_init(struct sysctllog **clog)
{
        sysctl_createv(clog, 0, NULL, NULL,
            CTLFLAG_PERMANENT,
            CTLTYPE_NODE, "inet",
            SYSCTL_DESCR("PF_INET related settings"),
            NULL, 0, NULL, 0,
            CTL_NET, PF_INET, CTL_EOL);
        sysctl_createv(clog, 0, NULL, NULL,
            CTLFLAG_PERMANENT,
            CTLTYPE_NODE, "ip",
            SYSCTL_DESCR("IPv4 related settings"),
            NULL, 0, NULL, 0,
            CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL);

        sysctl_createv(clog, 0, NULL, NULL,
            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
            CTLTYPE_INT, "maxflows",
            SYSCTL_DESCR("Number of flows for fast forwarding"),
            sysctl_net_inet_ip_maxflows, 0, &ip_maxflows, 0,
            CTL_NET, PF_INET, IPPROTO_IP,
            IPCTL_MAXFLOWS, CTL_EOL);
        sysctl_createv(clog, 0, NULL, NULL,
            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
            CTLTYPE_INT, "hashsize",
            SYSCTL_DESCR("Size of hash table for fast forwarding (IPv4)"),
            sysctl_net_inet_ip_hashsize, 0, &ip_hashsize, 0,
            CTL_NET, PF_INET, IPPROTO_IP,
            CTL_CREATE, CTL_EOL);
}
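
/*
 * Editor's note: given the node names registered above, these knobs should
 * be reachable from userland as net.inet.ip.maxflows and
 * net.inet.ip.hashsize, e.g. (assuming a standard sysctl(8)):
 *
 *	sysctl -w net.inet.ip.maxflows=1024
 *	sysctl -w net.inet.ip.hashsize=128	(must be a power of two)
 */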