1 /*- 2 * Copyright (c) 1998 The NetBSD Foundation, Inc. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to The NetBSD Foundation 6 * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the NetBSD 19 * Foundation, Inc. and its contributors. 20 * 4. Neither the name of The NetBSD Foundation nor the names of its 21 * contributors may be used to endorse or promote products derived 22 * from this software without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 26 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 27 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 28 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 31 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 32 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 33 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 * POSSIBILITY OF SUCH DAMAGE. 35 * 36 * $FreeBSD: src/sys/netinet/ip_flow.c,v 1.9.2.2 2001/11/04 17:35:31 luigi Exp $ 37 */ 38 39 #include <sys/param.h> 40 #include <sys/kernel.h> 41 #include <sys/malloc.h> 42 #include <sys/mbuf.h> 43 #include <sys/protosw.h> 44 #include <sys/socket.h> 45 #include <sys/sysctl.h> 46 #include <sys/thread2.h> 47 #include <sys/in_cksum.h> 48 49 #include <machine/smp.h> 50 51 #include <net/if.h> 52 #include <net/if_var.h> 53 #include <net/route.h> 54 #include <net/netisr2.h> 55 #include <net/netmsg2.h> 56 57 #include <netinet/in.h> 58 #include <netinet/ip.h> 59 #include <netinet/in_var.h> 60 #include <netinet/ip_var.h> 61 #include <netinet/ip_flow.h> 62 63 #define IPFLOW_TIMEOUT_FREQ 2 /* 2/second */ 64 #define IPFLOW_TIMEOUT (hz / IPFLOW_TIMEOUT_FREQ) 65 66 #define IPFLOW_TIMER (5 * IPFLOW_TIMEOUT_FREQ) 67 #define IPFLOW_HASHBITS 6 /* should not be a multiple of 8 */ 68 #define IPFLOW_HASHSIZE (1 << IPFLOW_HASHBITS) 69 #define IPFLOW_MAX 256 70 71 #define IPFLOW_RTENTRY_ISDOWN(rt) \ 72 (((rt)->rt_flags & RTF_UP) == 0 || \ 73 ((rt)->rt_ifp->if_flags & IFF_UP) == 0) 74 75 struct netmsg_ipfaddr { 76 struct netmsg_base base; 77 struct in_addr ipf_addr; 78 }; 79 80 struct ipflow { 81 LIST_ENTRY(ipflow) ipf_hash; /* next ipflow in hash bucket */ 82 LIST_ENTRY(ipflow) ipf_list; /* next ipflow in list */ 83 84 struct in_addr ipf_dst; /* destination address */ 85 struct in_addr ipf_src; /* source address */ 86 uint8_t ipf_tos; /* type-of-service */ 87 88 uint8_t ipf_flags; /* see IPFLOW_FLAG_ */ 89 uint8_t ipf_pad[2]; /* explicit pad */ 90 int ipf_timer; /* remaining lifetime of this entry */ 91 92 struct route ipf_ro; /* associated route entry */ 93 u_long ipf_uses; /* number of uses in this period */ 94 95 u_long ipf_dropped; /* ENOBUFS returned by if_output */ 96 u_long ipf_errors; /* other errors returned by if_output */ 97 u_long ipf_last_uses; /* number of uses in last period */ 98 }; 99 LIST_HEAD(ipflowhead, ipflow); 100 101 #define IPFLOW_FLAG_ONLIST 0x1 102 103 struct ipflow_pcpu { 104 struct ipflowhead ipf_table[IPFLOW_HASHSIZE]; 105 struct ipflowhead ipf_list; 106 int ipf_inuse; 107 struct callout ipf_timeo; 108 struct netmsg_base ipf_timeo_netmsg; 109 }; 110 111 static struct ipflow_pcpu *ipflow_pcpu_data[MAXCPU]; 112 static int ipflow_active = 0; 113 114 #define IPFLOW_INSERT(pcpu, bucket, ipf) \ 115 do { \ 116 KKASSERT(((ipf)->ipf_flags & IPFLOW_FLAG_ONLIST) == 0); \ 117 (ipf)->ipf_flags |= IPFLOW_FLAG_ONLIST; \ 118 LIST_INSERT_HEAD((bucket), (ipf), ipf_hash); \ 119 LIST_INSERT_HEAD(&(pcpu)->ipf_list, (ipf), ipf_list); \ 120 } while (0) 121 122 #define IPFLOW_REMOVE(ipf) \ 123 do { \ 124 KKASSERT((ipf)->ipf_flags & IPFLOW_FLAG_ONLIST); \ 125 (ipf)->ipf_flags &= ~IPFLOW_FLAG_ONLIST; \ 126 LIST_REMOVE((ipf), ipf_hash); \ 127 LIST_REMOVE((ipf), ipf_list); \ 128 } while (0) 129 130 SYSCTL_NODE(_net_inet_ip, OID_AUTO, ipflow, CTLFLAG_RW, 0, "ip flow"); 131 SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW, 132 &ipflow_active, 0, "Enable flow-based IP forwarding"); 133 134 static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow"); 135 136 static void ipflow_free(struct ipflow_pcpu *, struct ipflow *); 137 static void ipflow_timeo(void *); 138 139 static unsigned 140 ipflow_hash(struct in_addr dst, struct in_addr src, unsigned tos) 141 { 142 unsigned hash = tos + src.s_addr; 143 int idx; 144 145 for (idx = IPFLOW_HASHBITS; idx < 32; idx += IPFLOW_HASHBITS) 146 hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx); 147 return hash & (IPFLOW_HASHSIZE-1); 148 } 149 150 static struct ipflow * 151 ipflow_lookup(struct ipflow_pcpu *pcpu, const struct ip *ip) 152 { 153 unsigned hash; 154 struct ipflow *ipf; 155 156 hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); 157 LIST_FOREACH(ipf, &pcpu->ipf_table[hash], ipf_hash) { 158 if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr && 159 ip->ip_src.s_addr == ipf->ipf_src.s_addr && 160 ip->ip_tos == ipf->ipf_tos) 161 break; 162 } 163 return ipf; 164 } 165 166 int 167 ipflow_fastforward(struct mbuf *m) 168 { 169 struct ip *ip; 170 struct ipflow *ipf; 171 struct rtentry *rt; 172 struct sockaddr *dst; 173 struct ifnet *ifp; 174 int error, iplen; 175 176 ASSERT_NETISR_NCPUS(mycpuid); 177 178 /* 179 * Are we forwarding packets? 180 */ 181 if (!ipforwarding || !ipflow_active) 182 return 0; 183 184 /* 185 * Was packet received as a link-level multicast or broadcast? 186 * If so, don't try to fast forward.. 187 */ 188 if (m->m_flags & (M_BCAST | M_MCAST)) 189 return 0; 190 191 /* length checks already done in ip_hashfn() */ 192 KASSERT(m->m_len >= sizeof(struct ip), ("IP header not in one mbuf")); 193 ip = mtod(m, struct ip *); 194 195 /* 196 * IP header with no option and valid version 197 */ 198 if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2)) 199 return 0; 200 201 iplen = ntohs(ip->ip_len); 202 /* length checks already done in ip_hashfn() */ 203 KASSERT(iplen >= sizeof(struct ip), 204 ("total length less than header length")); 205 KASSERT(m->m_pkthdr.len >= iplen, ("mbuf too short")); 206 207 /* 208 * Find a flow. 209 */ 210 ipf = ipflow_lookup(ipflow_pcpu_data[mycpuid], ip); 211 if (ipf == NULL) 212 return 0; 213 214 /* 215 * Verify the IP header checksum. 216 */ 217 if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { 218 if (!(m->m_pkthdr.csum_flags & CSUM_IP_VALID)) 219 return 0; 220 } else { 221 /* Must compute it ourselves. */ 222 if (in_cksum_hdr(ip) != 0) 223 return 0; 224 } 225 226 /* 227 * Route and interface still up? 228 */ 229 rt = ipf->ipf_ro.ro_rt; 230 if (IPFLOW_RTENTRY_ISDOWN(rt)) 231 return 0; 232 ifp = rt->rt_ifp; 233 234 /* 235 * Packet size OK? TTL? 236 */ 237 if (m->m_pkthdr.len > ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) 238 return 0; 239 240 /* 241 * Clear any in-bound checksum flags for this packet. 242 */ 243 m->m_pkthdr.csum_flags = 0; 244 245 /* 246 * Everything checks out and so we can forward this packet. 247 * Modify the TTL and incrementally change the checksum. 248 * 249 * This method of adding the checksum works on either endian CPU. 250 * If htons() is inlined, all the arithmetic is folded; otherwise 251 * the htons()s are combined by CSE due to the __const__ attribute. 252 * 253 * Don't bother using HW checksumming here -- the incremental 254 * update is pretty fast. 255 */ 256 ip->ip_ttl -= IPTTLDEC; 257 if (ip->ip_sum >= (uint16_t)~htons(IPTTLDEC << 8)) 258 ip->ip_sum -= ~htons(IPTTLDEC << 8); 259 else 260 ip->ip_sum += htons(IPTTLDEC << 8); 261 262 /* 263 * Trim the packet in case it's too long.. 264 */ 265 if (m->m_pkthdr.len > iplen) { 266 if (m->m_len == m->m_pkthdr.len) { 267 m->m_len = iplen; 268 m->m_pkthdr.len = iplen; 269 } else { 270 m_adj(m, iplen - m->m_pkthdr.len); 271 } 272 } 273 274 /* 275 * Send the packet on its way. All we can get back is ENOBUFS 276 */ 277 ipf->ipf_uses++; 278 ipf->ipf_timer = IPFLOW_TIMER; 279 280 if (rt->rt_flags & RTF_GATEWAY) 281 dst = rt->rt_gateway; 282 else 283 dst = &ipf->ipf_ro.ro_dst; 284 285 error = ifp->if_output(ifp, m, dst, rt); 286 if (error) { 287 if (error == ENOBUFS) 288 ipf->ipf_dropped++; 289 else 290 ipf->ipf_errors++; 291 } 292 return 1; 293 } 294 295 static void 296 ipflow_addstats(struct ipflow *ipf) 297 { 298 ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; 299 ipstat.ips_cantforward += ipf->ipf_errors + ipf->ipf_dropped; 300 ipstat.ips_total += ipf->ipf_uses; 301 ipstat.ips_forward += ipf->ipf_uses; 302 ipstat.ips_fastforward += ipf->ipf_uses; 303 } 304 305 static void 306 ipflow_free(struct ipflow_pcpu *pcpu, struct ipflow *ipf) 307 { 308 KKASSERT((ipf->ipf_flags & IPFLOW_FLAG_ONLIST) == 0); 309 310 KKASSERT(pcpu->ipf_inuse > 0); 311 pcpu->ipf_inuse--; 312 313 ipflow_addstats(ipf); 314 RTFREE(ipf->ipf_ro.ro_rt); 315 kfree(ipf, M_IPFLOW); 316 } 317 318 static void 319 ipflow_reset(struct ipflow *ipf) 320 { 321 ipflow_addstats(ipf); 322 RTFREE(ipf->ipf_ro.ro_rt); 323 ipf->ipf_uses = ipf->ipf_last_uses = 0; 324 ipf->ipf_errors = ipf->ipf_dropped = 0; 325 } 326 327 static struct ipflow * 328 ipflow_reap(struct ipflow_pcpu *pcpu) 329 { 330 struct ipflow *ipf, *maybe_ipf = NULL; 331 332 LIST_FOREACH(ipf, &pcpu->ipf_list, ipf_list) { 333 /* 334 * If this no longer points to a valid route 335 * reclaim it. 336 */ 337 if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0) 338 goto done; 339 340 /* 341 * choose the one that's been least recently used 342 * or has had the least uses in the last 1.5 343 * intervals. 344 */ 345 if (maybe_ipf == NULL || 346 ipf->ipf_timer < maybe_ipf->ipf_timer || 347 (ipf->ipf_timer == maybe_ipf->ipf_timer && 348 ipf->ipf_last_uses + ipf->ipf_uses < 349 maybe_ipf->ipf_last_uses + maybe_ipf->ipf_uses)) 350 maybe_ipf = ipf; 351 } 352 if (maybe_ipf == NULL) 353 return NULL; 354 355 ipf = maybe_ipf; 356 done: 357 /* 358 * Remove the entry from the flow table and reset its states 359 */ 360 IPFLOW_REMOVE(ipf); 361 ipflow_reset(ipf); 362 return ipf; 363 } 364 365 static void 366 ipflow_timeo_dispatch(netmsg_t nmsg) 367 { 368 struct ipflow *ipf, *next_ipf; 369 struct ipflow_pcpu *pcpu = ipflow_pcpu_data[mycpuid]; 370 371 ASSERT_NETISR_NCPUS(mycpuid); 372 373 crit_enter(); 374 netisr_replymsg(&nmsg->base, 0); /* reply ASAP */ 375 crit_exit(); 376 377 LIST_FOREACH_MUTABLE(ipf, &pcpu->ipf_list, ipf_list, next_ipf) { 378 if (--ipf->ipf_timer == 0) { 379 IPFLOW_REMOVE(ipf); 380 ipflow_free(pcpu, ipf); 381 } else { 382 ipf->ipf_last_uses = ipf->ipf_uses; 383 ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; 384 ipstat.ips_total += ipf->ipf_uses; 385 ipstat.ips_forward += ipf->ipf_uses; 386 ipstat.ips_fastforward += ipf->ipf_uses; 387 ipf->ipf_uses = 0; 388 } 389 } 390 callout_reset(&pcpu->ipf_timeo, IPFLOW_TIMEOUT, ipflow_timeo, pcpu); 391 } 392 393 static void 394 ipflow_timeo(void *xpcpu) 395 { 396 struct ipflow_pcpu *pcpu = xpcpu; 397 struct netmsg_base *nm = &pcpu->ipf_timeo_netmsg; 398 399 crit_enter(); 400 if (nm->lmsg.ms_flags & MSGF_DONE) 401 netisr_sendmsg_oncpu(nm); 402 crit_exit(); 403 } 404 405 void 406 ipflow_create(const struct route *ro, struct mbuf *m) 407 { 408 struct ipflow_pcpu *pcpu = ipflow_pcpu_data[mycpuid]; 409 const struct ip *const ip = mtod(m, struct ip *); 410 struct ipflow *ipf; 411 unsigned hash; 412 413 ASSERT_NETISR_NCPUS(mycpuid); 414 415 /* 416 * Don't create cache entries for ICMP messages. 417 */ 418 if (!ipflow_active || ip->ip_p == IPPROTO_ICMP) 419 return; 420 421 /* 422 * See if an existing flow struct exists. If so remove it from it's 423 * list and free the old route. If not, try to malloc a new one 424 * (if we aren't at our limit). 425 */ 426 ipf = ipflow_lookup(pcpu, ip); 427 if (ipf == NULL) { 428 if (pcpu->ipf_inuse == IPFLOW_MAX) { 429 ipf = ipflow_reap(pcpu); 430 if (ipf == NULL) 431 return; 432 } else { 433 ipf = kmalloc(sizeof(*ipf), M_IPFLOW, 434 M_INTWAIT | M_NULLOK | M_ZERO); 435 if (ipf == NULL) 436 return; 437 pcpu->ipf_inuse++; 438 } 439 } else { 440 IPFLOW_REMOVE(ipf); 441 ipflow_reset(ipf); 442 } 443 444 /* 445 * Fill in the updated information. 446 */ 447 ipf->ipf_ro = *ro; 448 ro->ro_rt->rt_refcnt++; 449 ipf->ipf_dst = ip->ip_dst; 450 ipf->ipf_src = ip->ip_src; 451 ipf->ipf_tos = ip->ip_tos; 452 ipf->ipf_timer = IPFLOW_TIMER; 453 454 /* 455 * Insert into the approriate bucket of the flow table. 456 */ 457 hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); 458 IPFLOW_INSERT(pcpu, &pcpu->ipf_table[hash], ipf); 459 } 460 461 void 462 ipflow_flush_oncpu(void) 463 { 464 struct ipflow_pcpu *pcpu = ipflow_pcpu_data[mycpuid]; 465 struct ipflow *ipf; 466 467 ASSERT_NETISR_NCPUS(mycpuid); 468 469 while ((ipf = LIST_FIRST(&pcpu->ipf_list)) != NULL) { 470 IPFLOW_REMOVE(ipf); 471 ipflow_free(pcpu, ipf); 472 } 473 } 474 475 static void 476 ipflow_ifaddr_handler(netmsg_t nmsg) 477 { 478 struct netmsg_ipfaddr *amsg = (struct netmsg_ipfaddr *)nmsg; 479 struct ipflow_pcpu *pcpu = ipflow_pcpu_data[mycpuid]; 480 struct ipflow *ipf, *next_ipf; 481 482 LIST_FOREACH_MUTABLE(ipf, &pcpu->ipf_list, ipf_list, next_ipf) { 483 if (ipf->ipf_dst.s_addr == amsg->ipf_addr.s_addr || 484 ipf->ipf_src.s_addr == amsg->ipf_addr.s_addr) { 485 IPFLOW_REMOVE(ipf); 486 ipflow_free(pcpu, ipf); 487 } 488 } 489 netisr_forwardmsg(&nmsg->base, mycpuid + 1); 490 } 491 492 static void 493 ipflow_ifaddr(void *arg __unused, struct ifnet *ifp __unused, 494 enum ifaddr_event event, struct ifaddr *ifa) 495 { 496 struct netmsg_ipfaddr amsg; 497 498 if (ifa->ifa_addr->sa_family != AF_INET) 499 return; 500 501 /* Only add/change events need to be handled */ 502 switch (event) { 503 case IFADDR_EVENT_ADD: 504 case IFADDR_EVENT_CHANGE: 505 break; 506 507 case IFADDR_EVENT_DELETE: 508 return; 509 } 510 511 netmsg_init(&amsg.base, NULL, &curthread->td_msgport, 512 MSGF_PRIORITY, ipflow_ifaddr_handler); 513 amsg.ipf_addr = ifatoia(ifa)->ia_addr.sin_addr; 514 515 netisr_domsg_global(&amsg.base); 516 } 517 518 static void 519 ipflow_init_dispatch(netmsg_t nm) 520 { 521 struct ipflow_pcpu *pcpu; 522 int cpuid = mycpuid; 523 char oid_name[32]; 524 525 pcpu = kmalloc(sizeof(*pcpu), M_IPFLOW, M_WAITOK | M_ZERO); 526 527 netmsg_init(&pcpu->ipf_timeo_netmsg, NULL, &netisr_adone_rport, 528 MSGF_PRIORITY, ipflow_timeo_dispatch); 529 callout_init_mp(&pcpu->ipf_timeo); 530 531 ksnprintf(oid_name, sizeof(oid_name), "inuse%d", cpuid); 532 SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_net_inet_ip_ipflow), 533 OID_AUTO, oid_name, CTLFLAG_RD, &pcpu->ipf_inuse, 0, 534 "# of ip flow being used"); 535 536 ipflow_pcpu_data[cpuid] = pcpu; 537 538 callout_reset(&pcpu->ipf_timeo, IPFLOW_TIMEOUT, ipflow_timeo, pcpu); 539 540 netisr_forwardmsg(&nm->base, cpuid + 1); 541 } 542 543 static void 544 ipflow_init(void) 545 { 546 struct netmsg_base nm; 547 548 netmsg_init(&nm, NULL, &curthread->td_msgport, 0, 549 ipflow_init_dispatch); 550 netisr_domsg_global(&nm); 551 552 EVENTHANDLER_REGISTER(ifaddr_event, ipflow_ifaddr, NULL, 553 EVENTHANDLER_PRI_ANY); 554 } 555 SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipflow_init, 0); 556