/*-
 * Copyright (c) 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by the 3am Software Foundry ("3am").  It was developed by Matt Thomas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/ip_flow.c,v 1.9.2.2 2001/11/04 17:35:31 luigi Exp $
 * $DragonFly: src/sys/netinet/ip_flow.c,v 1.27 2008/10/28 07:09:26 sephe Exp $
 */

/*
 * Flow-based IP fast forwarding ("ipflow").
 *
 * A small per-CPU cache maps a (dst, src, tos) triple to a held route so
 * that subsequent forwarded packets of the same flow can bypass the full
 * ip_forward() path.  All tables are strictly per-CPU (see the mycpuid
 * macros below), so no locking is used; cross-CPU work is done with lwkt
 * messages and IPIs instead.
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/thread2.h>
#include <sys/in_cksum.h>

#include <machine/smp.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/netisr.h>
#include <net/netmsg2.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_flow.h>

#define IPFLOW_TIMER		(5 * PR_SLOWHZ)
#define IPFLOW_HASHBITS		6	/* should not be a multiple of 8 */
#define IPFLOW_HASHSIZE		(1 << IPFLOW_HASHBITS)
#define IPFLOW_MAX		256	/* max cached flows per CPU */

/* True if either the route or its outgoing interface has gone down. */
#define IPFLOW_RTENTRY_ISDOWN(rt) \
	(((rt)->rt_flags & RTF_UP) == 0 || \
	 ((rt)->rt_ifp->if_flags & IFF_UP) == 0)

/* Message used to broadcast an interface-address change to all CPUs. */
struct netmsg_ipfaddr {
	struct netmsg_base base;
	struct in_addr	ipf_addr;
};

struct ipflow {
	LIST_ENTRY(ipflow) ipf_hash;	/* next ipflow in hash bucket */
	LIST_ENTRY(ipflow) ipf_list;	/* next ipflow in list */

	struct in_addr ipf_dst;		/* destination address */
	struct in_addr ipf_src;		/* source address */
	uint8_t ipf_tos;		/* type-of-service */

	uint8_t ipf_flags;		/* see IPFLOW_FLAG_ */
	uint8_t ipf_pad[2];		/* explicit pad */
	int ipf_refcnt;			/* reference count */

	struct route ipf_ro;		/* associated route entry */
	u_long ipf_uses;		/* number of uses in this period */

	int ipf_timer;			/* remaining lifetime of this entry */
	u_long ipf_dropped;		/* ENOBUFS returned by if_output */
	u_long ipf_errors;		/* other errors returned by if_output */
	u_long ipf_last_uses;		/* number of uses in last period */
};
LIST_HEAD(ipflowhead, ipflow);

#define IPFLOW_FLAG_ONLIST	0x1	/* entry is in hash table + list */

/*
 * Convenience aliases for the current CPU's slice of the per-CPU state.
 * These are only safe to use from a context pinned to one CPU.
 */
#define ipflow_inuse		ipflow_inuse_pcpu[mycpuid]
#define ipflowtable		ipflowtable_pcpu[mycpuid]
#define ipflowlist		ipflowlist_pcpu[mycpuid]

static struct ipflowhead ipflowtable_pcpu[MAXCPU][IPFLOW_HASHSIZE];
static struct ipflowhead ipflowlist_pcpu[MAXCPU];
static int ipflow_inuse_pcpu[MAXCPU];
static struct netmsg_base ipflow_timo_netmsgs[MAXCPU];
static int ipflow_active = 0;	/* sysctl: flow-based forwarding enabled? */

/*
 * A cached entry holds one base reference; additional references are
 * taken only while a packet is in flight through if_output().
 */
#define IPFLOW_REFCNT_INIT	1

/* ipflow is alive and active */
#define IPFLOW_IS_ACTIVE(ipf)	((ipf)->ipf_refcnt > IPFLOW_REFCNT_INIT)
/* ipflow is alive but not active */
#define IPFLOW_NOT_ACTIVE(ipf)	((ipf)->ipf_refcnt == IPFLOW_REFCNT_INIT)

#define IPFLOW_REF(ipf) \
do { \
	KKASSERT((ipf)->ipf_refcnt > 0); \
	(ipf)->ipf_refcnt++; \
} while (0)

/* Drop a reference; the last reference actually frees the entry. */
#define IPFLOW_FREE(ipf) \
do { \
	KKASSERT((ipf)->ipf_refcnt > 0); \
	(ipf)->ipf_refcnt--; \
	if ((ipf)->ipf_refcnt == 0) \
		ipflow_free((ipf)); \
} while (0)

/* Link an entry into both its hash bucket and the per-CPU LRU-ish list. */
#define IPFLOW_INSERT(bucket, ipf) \
do { \
	KKASSERT(((ipf)->ipf_flags & IPFLOW_FLAG_ONLIST) == 0); \
	(ipf)->ipf_flags |= IPFLOW_FLAG_ONLIST; \
	LIST_INSERT_HEAD((bucket), (ipf), ipf_hash); \
	LIST_INSERT_HEAD(&ipflowlist, (ipf), ipf_list); \
} while (0)

#define IPFLOW_REMOVE(ipf) \
do { \
	KKASSERT((ipf)->ipf_flags & IPFLOW_FLAG_ONLIST); \
	(ipf)->ipf_flags &= ~IPFLOW_FLAG_ONLIST; \
	LIST_REMOVE((ipf), ipf_hash); \
	LIST_REMOVE((ipf), ipf_list); \
} while (0)

SYSCTL_NODE(_net_inet_ip, OID_AUTO, ipflow, CTLFLAG_RW, 0, "ip flow");
SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW,
	   &ipflow_active, 0, "Enable flow-based IP forwarding");

static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow");

static void	ipflow_free(struct ipflow *);

/*
 * Hash the (dst, src, tos) flow key into a table bucket index.
 *
 * The addresses are folded in IPFLOW_HASHBITS bits at a time;
 * IPFLOW_HASHBITS deliberately does not divide 8 so octet boundaries
 * do not line up from round to round.
 *
 * NOTE(review): on the first iteration (idx == 0) this evaluates
 * "dst.s_addr >> 32" -- a shift by the full width of the type, which is
 * undefined behavior in C.  This is the historical NetBSD code and the
 * resulting distribution is relied on as-is; do not "fix" it without
 * considering that existing flow tables hash with whatever the compiler
 * produces on the supported platforms.
 */
static unsigned
ipflow_hash(struct in_addr dst, struct in_addr src, unsigned tos)
{
	unsigned hash = tos;
	int idx;

	for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS)
		hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx);
	return hash & (IPFLOW_HASHSIZE-1);
}

/*
 * Look up the cached flow for this packet on the current CPU.
 * Returns the matching entry, or NULL if the flow is not cached.
 */
static struct ipflow *
ipflow_lookup(const struct ip *ip)
{
	unsigned hash;
	struct ipflow *ipf;

	hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos);
	LIST_FOREACH(ipf, &ipflowtable[hash], ipf_hash) {
		if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr &&
		    ip->ip_src.s_addr == ipf->ipf_src.s_addr &&
		    ip->ip_tos == ipf->ipf_tos)
			break;
	}
	return ipf;
}

/*
 * Attempt to forward the packet using the flow cache, bypassing the
 * full ip_forward() path.
 *
 * Returns 1 if the packet was consumed (forwarded or dropped by
 * if_output), 0 if the caller must take the normal slow path.  Only
 * option-less IPv4 packets with a valid checksum, sufficient TTL and a
 * live cached route are eligible.
 */
int
ipflow_fastforward(struct mbuf *m)
{
	struct ip *ip;
	struct ipflow *ipf;
	struct rtentry *rt;
	struct sockaddr *dst;
	struct ifnet *ifp;
	int error, iplen;

	/*
	 * Are we forwarding packets?
	 */
	if (!ipforwarding || !ipflow_active)
		return 0;

	/*
	 * Was packet received as a link-level multicast or broadcast?
	 * If so, don't try to fast forward..
	 */
	if (m->m_flags & (M_BCAST | M_MCAST))
		return 0;

	/* length checks already done in ip_cpufn() */
	KASSERT(m->m_len >= sizeof(struct ip), ("IP header not in one mbuf"));
	ip = mtod(m, struct ip *);

	/*
	 * IP header with no option and valid version
	 */
	if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2))
		return 0;

	iplen = ntohs(ip->ip_len);
	/* length checks already done in ip_cpufn() */
	KASSERT(iplen >= sizeof(struct ip),
		("total length less then header length"));
	KASSERT(m->m_pkthdr.len >= iplen, ("mbuf too short"));

	/*
	 * Find a flow.
	 */
	ipf = ipflow_lookup(ip);
	if (ipf == NULL)
		return 0;

	/*
	 * Verify the IP header checksum.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
		if (!(m->m_pkthdr.csum_flags & CSUM_IP_VALID))
			return 0;
	} else {
		/* Must compute it ourselves. */
		if (in_cksum_hdr(ip) != 0)
			return 0;
	}

	/*
	 * Route and interface still up?
	 */
	rt = ipf->ipf_ro.ro_rt;
	if (IPFLOW_RTENTRY_ISDOWN(rt))
		return 0;
	ifp = rt->rt_ifp;

	/*
	 * Packet size OK?  TTL?
	 */
	if (m->m_pkthdr.len > ifp->if_mtu || ip->ip_ttl <= IPTTLDEC)
		return 0;

	/*
	 * Clear any in-bound checksum flags for this packet.
	 */
	m->m_pkthdr.csum_flags = 0;

	/*
	 * Everything checks out and so we can forward this packet.
	 * Modify the TTL and incrementally change the checksum.
	 *
	 * This method of adding the checksum works on either endian CPU.
	 * If htons() is inlined, all the arithmetic is folded; otherwise
	 * the htons()s are combined by CSE due to the __const__ attribute.
	 *
	 * Don't bother using HW checksumming here -- the incremental
	 * update is pretty fast.
	 */
	ip->ip_ttl -= IPTTLDEC;
	if (ip->ip_sum >= (uint16_t)~htons(IPTTLDEC << 8))
		ip->ip_sum -= ~htons(IPTTLDEC << 8);
	else
		ip->ip_sum += htons(IPTTLDEC << 8);

	/*
	 * Trim the packet in case it's too long..
	 */
	if (m->m_pkthdr.len > iplen) {
		if (m->m_len == m->m_pkthdr.len) {
			m->m_len = iplen;
			m->m_pkthdr.len = iplen;
		} else {
			m_adj(m, iplen - m->m_pkthdr.len);
		}
	}

	/*
	 * Send the packet on its way.  All we can get back is ENOBUFS
	 */
	ipf->ipf_uses++;
	ipf->ipf_timer = IPFLOW_TIMER;

	if (rt->rt_flags & RTF_GATEWAY)
		dst = rt->rt_gateway;
	else
		dst = &ipf->ipf_ro.ro_dst;

	/*
	 * Reference count this ipflow, before the possible blocking
	 * ifnet.if_output(), so this ipflow will not be changed or
	 * reaped behind our back.
	 */
	IPFLOW_REF(ipf);

	error = ifp->if_output(ifp, m, dst, rt);
	if (error) {
		if (error == ENOBUFS)
			ipf->ipf_dropped++;
		else
			ipf->ipf_errors++;
	}

	IPFLOW_FREE(ipf);
	return 1;
}

/*
 * Fold this flow's accumulated counters into its route's use count and
 * the global IP statistics.  Called when an entry is freed or reset so
 * no usage is lost.
 */
static void
ipflow_addstats(struct ipflow *ipf)
{
	ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses;
	ipstat.ips_cantforward += ipf->ipf_errors + ipf->ipf_dropped;
	ipstat.ips_total += ipf->ipf_uses;
	ipstat.ips_forward += ipf->ipf_uses;
	ipstat.ips_fastforward += ipf->ipf_uses;
}

/*
 * Destroy an entry whose last reference has been dropped: flush its
 * statistics, release the held route and free the memory.  Must not be
 * called directly -- use IPFLOW_FREE(), which enforces the refcount.
 */
static void
ipflow_free(struct ipflow *ipf)
{
	KKASSERT(ipf->ipf_refcnt == 0);
	KKASSERT((ipf->ipf_flags & IPFLOW_FLAG_ONLIST) == 0);

	KKASSERT(ipflow_inuse > 0);
	ipflow_inuse--;

	ipflow_addstats(ipf);
	RTFREE(ipf->ipf_ro.ro_rt);
	kfree(ipf, M_IPFLOW);
}

/*
 * Prepare an entry for reuse: flush its statistics, release the held
 * route and zero the usage counters.  The caller re-fills the flow key
 * and installs a new route afterwards (see ipflow_create()).
 */
static void
ipflow_reset(struct ipflow *ipf)
{
	ipflow_addstats(ipf);
	RTFREE(ipf->ipf_ro.ro_rt);
	ipf->ipf_uses = ipf->ipf_last_uses = 0;
	ipf->ipf_errors = ipf->ipf_dropped = 0;
}

/*
 * Pick a victim entry to recycle when the per-CPU table is full.
 * Prefers any entry whose route has gone down; otherwise takes the
 * entry closest to expiry (least uses as tie-breaker).  The victim is
 * unlinked and reset, ready for the caller to reuse.  Returns NULL if
 * every entry is actively referenced.
 */
static struct ipflow *
ipflow_reap(void)
{
	struct ipflow *ipf, *maybe_ipf = NULL;

	LIST_FOREACH(ipf, &ipflowlist, ipf_list) {
		/*
		 * Skip actively used ipflow
		 */
		if (IPFLOW_IS_ACTIVE(ipf))
			continue;

		/*
		 * If this no longer points to a valid route
		 * reclaim it.
		 */
		if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0)
			goto done;

		/*
		 * choose the one that's been least recently used
		 * or has had the least uses in the last 1.5
		 * intervals.
		 */
		if (maybe_ipf == NULL ||
		    ipf->ipf_timer < maybe_ipf->ipf_timer ||
		    (ipf->ipf_timer == maybe_ipf->ipf_timer &&
		     ipf->ipf_last_uses + ipf->ipf_uses <
		     maybe_ipf->ipf_last_uses + maybe_ipf->ipf_uses))
			maybe_ipf = ipf;
	}
	if (maybe_ipf == NULL)
		return NULL;

	ipf = maybe_ipf;
done:
	/*
	 * Remove the entry from the flow table and reset its states
	 */
	IPFLOW_REMOVE(ipf);
	ipflow_reset(ipf);
	return ipf;
}

/*
 * Per-CPU slow-timeout handler (runs in the netisr thread via the
 * message sent from ipflow_timo_ipi()).  Ages every flow on this CPU:
 * expired entries are unlinked and released; live entries have their
 * usage rolled into the statistics for the next period.
 */
static void
ipflow_timo_dispatch(netmsg_t nmsg)
{
	struct ipflow *ipf, *next_ipf;

	crit_enter();
	lwkt_replymsg(&nmsg->lmsg, 0);	/* reply ASAP */
	crit_exit();

	LIST_FOREACH_MUTABLE(ipf, &ipflowlist, ipf_list, next_ipf) {
		if (--ipf->ipf_timer == 0) {
			IPFLOW_REMOVE(ipf);
			IPFLOW_FREE(ipf);
		} else {
			ipf->ipf_last_uses = ipf->ipf_uses;
			ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses;
			ipstat.ips_total += ipf->ipf_uses;
			ipstat.ips_forward += ipf->ipf_uses;
			ipstat.ips_fastforward += ipf->ipf_uses;
			ipf->ipf_uses = 0;
		}
	}
}

/*
 * IPI target: queue this CPU's timeout message to its netisr port,
 * unless the previous one is still pending (MSGF_DONE not set means it
 * is still in flight, so sending again would corrupt the message).
 */
static void
ipflow_timo_ipi(void *arg __unused)
{
	struct lwkt_msg *msg = &ipflow_timo_netmsgs[mycpuid].lmsg;

	crit_enter();
	if (msg->ms_flags & MSGF_DONE)
		lwkt_sendmsg(cpu_portfn(mycpuid), msg);
	crit_exit();
}

/*
 * Called from the protocol slow timeout (every PR_SLOWHZ tick source).
 * Kicks the ageing handler on every CPU that currently caches at least
 * one flow; CPUs with empty tables are skipped entirely.
 */
void
ipflow_slowtimo(void)
{
#ifdef SMP
	cpumask_t mask = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		if (ipflow_inuse_pcpu[i])
			mask |= CPUMASK(i);
	}
	mask &= smp_active_mask;
	if (mask != 0)
		lwkt_send_ipiq_mask(mask, ipflow_timo_ipi, NULL);
#else
	if (ipflow_inuse)
		ipflow_timo_ipi(NULL);
#endif
}

/*
 * Cache a flow for a packet we just forwarded the slow way, so that
 * subsequent packets of the same (dst, src, tos) flow can be fast
 * forwarded.  Reuses an inactive existing entry or reaps one when the
 * table is full; silently does nothing if the cache is disabled, the
 * packet is ICMP, allocation fails, or the existing entry is busy.
 * Takes its own reference on ro->ro_rt.
 */
void
ipflow_create(const struct route *ro, struct mbuf *m)
{
	const struct ip *const ip = mtod(m, struct ip *);
	struct ipflow *ipf;
	unsigned hash;

	/*
	 * Don't create cache entries for ICMP messages.
	 */
	if (!ipflow_active || ip->ip_p == IPPROTO_ICMP)
		return;

	/*
	 * See if an existing flow struct exists.  If so remove it from its
	 * list and free the old route.  If not, try to malloc a new one
	 * (if we aren't at our limit).
	 */
	ipf = ipflow_lookup(ip);
	if (ipf == NULL) {
		if (ipflow_inuse == IPFLOW_MAX) {
			ipf = ipflow_reap();
			if (ipf == NULL)
				return;
		} else {
			ipf = kmalloc(sizeof(*ipf), M_IPFLOW,
				      M_NOWAIT | M_ZERO);
			if (ipf == NULL)
				return;
			ipf->ipf_refcnt = IPFLOW_REFCNT_INIT;

			ipflow_inuse++;
		}
	} else {
		if (IPFLOW_NOT_ACTIVE(ipf)) {
			IPFLOW_REMOVE(ipf);
			ipflow_reset(ipf);
		} else {
			/* This ipflow is being used; don't change it */
			KKASSERT(IPFLOW_IS_ACTIVE(ipf));
			return;
		}
	}
	/* This ipflow should not be actively used */
	KKASSERT(IPFLOW_NOT_ACTIVE(ipf));

	/*
	 * Fill in the updated information.
	 */
	ipf->ipf_ro = *ro;
	ro->ro_rt->rt_refcnt++;
	ipf->ipf_dst = ip->ip_dst;
	ipf->ipf_src = ip->ip_src;
	ipf->ipf_tos = ip->ip_tos;
	ipf->ipf_timer = IPFLOW_TIMER;

	/*
	 * Insert into the appropriate bucket of the flow table.
	 */
	hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos);
	IPFLOW_INSERT(&ipflowtable[hash], ipf);
}

/*
 * Drop every cached flow on the current CPU (releases the base
 * reference of each entry; entries still referenced by an in-flight
 * packet are freed when that reference is dropped).
 */
void
ipflow_flush_oncpu(void)
{
	struct ipflow *ipf;

	while ((ipf = LIST_FIRST(&ipflowlist)) != NULL) {
		IPFLOW_REMOVE(ipf);
		IPFLOW_FREE(ipf);
	}
}

/*
 * Per-CPU handler for an interface-address change: invalidate any flow
 * whose source or destination matches the changed address, then pass
 * the message on to the next CPU in the chain.
 */
static void
ipflow_ifaddr_handler(netmsg_t nmsg)
{
	struct netmsg_ipfaddr *amsg = (struct netmsg_ipfaddr *)nmsg;
	struct ipflow *ipf, *next_ipf;

	LIST_FOREACH_MUTABLE(ipf, &ipflowlist, ipf_list, next_ipf) {
		if (ipf->ipf_dst.s_addr == amsg->ipf_addr.s_addr ||
		    ipf->ipf_src.s_addr == amsg->ipf_addr.s_addr) {
			IPFLOW_REMOVE(ipf);
			IPFLOW_FREE(ipf);
		}
	}
	ifnet_forwardmsg(&nmsg->lmsg, mycpuid + 1);
}

/*
 * ifaddr_event handler: on add/change of an AF_INET interface address,
 * circulate an invalidation message to all CPUs (starting at CPU 0 via
 * ifnet_domsg).  Deletes are ignored -- stale flows referencing a gone
 * address age out via the normal timer.
 */
static void
ipflow_ifaddr(void *arg __unused, struct ifnet *ifp __unused,
	      enum ifaddr_event event, struct ifaddr *ifa)
{
	struct netmsg_ipfaddr amsg;

	if (ifa->ifa_addr->sa_family != AF_INET)
		return;

	/* Only add/change events need to be handled */
	switch (event) {
	case IFADDR_EVENT_ADD:
	case IFADDR_EVENT_CHANGE:
		break;

	case IFADDR_EVENT_DELETE:
		return;
	}

	netmsg_init(&amsg.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipflow_ifaddr_handler);
	amsg.ipf_addr = ifatoia(ifa)->ia_addr.sin_addr;

	ifnet_domsg(&amsg.base.lmsg, 0);
}

/*
 * Boot-time initialization: set up each CPU's timeout message, export a
 * per-CPU "inuse" sysctl counter, and register for interface-address
 * events.
 */
static void
ipflow_init(void)
{
	char oid_name[32];
	int i;

	for (i = 0; i < ncpus; ++i) {
		netmsg_init(&ipflow_timo_netmsgs[i], NULL, &netisr_adone_rport,
			    0, ipflow_timo_dispatch);

		ksnprintf(oid_name, sizeof(oid_name), "inuse%d", i);

		SYSCTL_ADD_INT(NULL,
			       SYSCTL_STATIC_CHILDREN(_net_inet_ip_ipflow),
			       OID_AUTO, oid_name, CTLFLAG_RD,
			       &ipflow_inuse_pcpu[i], 0,
			       "# of ip flow being used");
	}
	EVENTHANDLER_REGISTER(ifaddr_event, ipflow_ifaddr, NULL,
			      EVENTHANDLER_PRI_ANY);
}
/* NOTE(review): first SYSINIT arg is the identifier "arp" -- looks like a
 * copy/paste leftover from in_proto/arp init; harmless but confusing. */
SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipflow_init, 0);