1 /*- 2 * Copyright (c) 1998 The NetBSD Foundation, Inc. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to The NetBSD Foundation 6 * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the NetBSD 19 * Foundation, Inc. and its contributors. 20 * 4. Neither the name of The NetBSD Foundation nor the names of its 21 * contributors may be used to endorse or promote products derived 22 * from this software without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 26 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 27 * PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/ip_flow.c,v 1.9.2.2 2001/11/04 17:35:31 luigi Exp $
 * $DragonFly: src/sys/netinet/ip_flow.c,v 1.27 2008/10/28 07:09:26 sephe Exp $
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/thread2.h>
#include <sys/in_cksum.h>

#include <machine/smp.h>

#include <net/if.h>
#include <net/route.h>
#include <net/netisr.h>
#include <net/netmsg2.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_flow.h>

/* Lifetime of an idle flow entry, in slow-timeout periods (5 seconds). */
#define IPFLOW_TIMER		(5 * PR_SLOWHZ)
#define IPFLOW_HASHBITS		6	/* should not be a multiple of 8 */
#define IPFLOW_HASHSIZE		(1 << IPFLOW_HASHBITS)
/* Per-CPU cap on flow entries; at the cap an old entry is reaped for reuse. */
#define IPFLOW_MAX		256

/* True if the cached route, or the interface it points at, has gone down. */
#define IPFLOW_RTENTRY_ISDOWN(rt) \
	(((rt)->rt_flags & RTF_UP) == 0 || \
	 ((rt)->rt_ifp->if_flags & IFF_UP) == 0)

/*
 * One cached IP forwarding decision, keyed by (dst, src, tos).
 * Each entry sits on two lists at once: the hash bucket it belongs to
 * (for lookup) and the per-cpu list of all entries (for aging/reaping).
 * It holds a reference on the route in ipf_ro until freed.
 */
struct ipflow {
	LIST_ENTRY(ipflow) ipf_hash;	/* next ipflow in hash bucket */
	LIST_ENTRY(ipflow) ipf_list;	/* next ipflow in list */

	struct in_addr ipf_dst;		/* destination address */
	struct in_addr ipf_src;		/* source address */
	uint8_t ipf_tos;		/* type-of-service */

	uint8_t ipf_flags;		/* see IPFLOW_FLAG_ */
	uint8_t ipf_pad[2];		/* explicit pad */
	int ipf_refcnt;			/* reference count */

	struct route ipf_ro;		/* associated route entry */
	u_long ipf_uses;		/* number of uses in this period */

	int ipf_timer;			/* remaining lifetime of this entry */
	u_long ipf_dropped;		/* ENOBUFS returned by if_output */
	u_long ipf_errors;		/* other errors returned by if_output */
	u_long ipf_last_uses;		/* number of uses in last period */
};
LIST_HEAD(ipflowhead, ipflow);

/* ipf_flags bit: entry is currently linked on both lists. */
#define IPFLOW_FLAG_ONLIST	0x1

/*
 * All flow state is replicated per CPU; these aliases select the current
 * CPU's slot, so each CPU works only on its own table and list.
 */
#define ipflow_inuse		ipflow_inuse_pcpu[mycpuid]
#define ipflowtable		ipflowtable_pcpu[mycpuid]
#define ipflowlist		ipflowlist_pcpu[mycpuid]

static struct ipflowhead ipflowtable_pcpu[MAXCPU][IPFLOW_HASHSIZE];
static struct ipflowhead ipflowlist_pcpu[MAXCPU];
static int		ipflow_inuse_pcpu[MAXCPU];
static struct netmsg	ipflow_timo_netmsgs[MAXCPU];
static int		ipflow_active = 0;

/* Take an additional reference; caller must already hold one. */
#define IPFLOW_REF(ipf) \
do { \
	KKASSERT((ipf)->ipf_refcnt > 0); \
	(ipf)->ipf_refcnt++; \
} while (0)

/* Drop a reference; the entry is destroyed when the last one goes away. */
#define IPFLOW_FREE(ipf) \
do { \
	KKASSERT((ipf)->ipf_refcnt > 0); \
	(ipf)->ipf_refcnt--; \
	if ((ipf)->ipf_refcnt == 0) \
		ipflow_free((ipf)); \
} while (0)

/* Link an entry onto its hash bucket and the per-cpu aging list. */
#define IPFLOW_INSERT(bucket, ipf) \
do { \
	KKASSERT(((ipf)->ipf_flags & IPFLOW_FLAG_ONLIST) == 0); \
	(ipf)->ipf_flags |= IPFLOW_FLAG_ONLIST; \
	LIST_INSERT_HEAD((bucket), (ipf), ipf_hash); \
	LIST_INSERT_HEAD(&ipflowlist, (ipf), ipf_list); \
} while (0)

/* Unlink an entry from both lists; does not drop its reference. */
#define IPFLOW_REMOVE(ipf) \
do { \
	KKASSERT((ipf)->ipf_flags & IPFLOW_FLAG_ONLIST); \
	(ipf)->ipf_flags &= ~IPFLOW_FLAG_ONLIST; \
	LIST_REMOVE((ipf), ipf_hash); \
	LIST_REMOVE((ipf), ipf_list); \
} while (0)

SYSCTL_NODE(_net_inet_ip, OID_AUTO, ipflow, CTLFLAG_RW, 0, "ip flow");
SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW,
	   &ipflow_active, 0, "Enable flow-based IP forwarding");

static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow");

static void	ipflow_free(struct ipflow *);
144 static unsigned 145 ipflow_hash(struct in_addr dst, struct in_addr src, unsigned tos) 146 { 147 unsigned hash = tos; 148 int idx; 149 150 for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) 151 hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx); 152 return hash & (IPFLOW_HASHSIZE-1); 153 } 154 155 static struct ipflow * 156 ipflow_lookup(const struct ip *ip) 157 { 158 unsigned hash; 159 struct ipflow *ipf; 160 161 hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); 162 LIST_FOREACH(ipf, &ipflowtable[hash], ipf_hash) { 163 if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr && 164 ip->ip_src.s_addr == ipf->ipf_src.s_addr && 165 ip->ip_tos == ipf->ipf_tos) 166 break; 167 } 168 return ipf; 169 } 170 171 int 172 ipflow_fastforward(struct mbuf *m) 173 { 174 struct ip *ip; 175 struct ipflow *ipf; 176 struct rtentry *rt; 177 struct sockaddr *dst; 178 struct ifnet *ifp; 179 int error, iplen; 180 181 /* 182 * Are we forwarding packets? 183 */ 184 if (!ipforwarding || !ipflow_active) 185 return 0; 186 187 /* 188 * Was packet received as a link-level multicast or broadcast? 189 * If so, don't try to fast forward.. 190 */ 191 if (m->m_flags & (M_BCAST | M_MCAST)) 192 return 0; 193 194 /* length checks already done in ip_mport() */ 195 KASSERT(m->m_len >= sizeof(struct ip), ("IP header not in one mbuf")); 196 ip = mtod(m, struct ip *); 197 198 /* 199 * IP header with no option and valid version 200 */ 201 if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2)) 202 return 0; 203 204 iplen = ntohs(ip->ip_len); 205 /* length checks already done in ip_mport() */ 206 KASSERT(iplen >= sizeof(struct ip), 207 ("total length less then header length")); 208 KASSERT(m->m_pkthdr.len >= iplen, ("mbuf too short")); 209 210 /* 211 * Find a flow. 212 */ 213 ipf = ipflow_lookup(ip); 214 if (ipf == NULL) 215 return 0; 216 217 /* 218 * Verify the IP header checksum. 
219 */ 220 if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { 221 if (!(m->m_pkthdr.csum_flags & CSUM_IP_VALID)) 222 return 0; 223 } else { 224 /* Must compute it ourselves. */ 225 if (in_cksum_hdr(ip) != 0) 226 return 0; 227 } 228 229 /* 230 * Route and interface still up? 231 */ 232 rt = ipf->ipf_ro.ro_rt; 233 if (IPFLOW_RTENTRY_ISDOWN(rt)) 234 return 0; 235 ifp = rt->rt_ifp; 236 237 /* 238 * Packet size OK? TTL? 239 */ 240 if (m->m_pkthdr.len > ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) 241 return 0; 242 243 /* 244 * Clear any in-bound checksum flags for this packet. 245 */ 246 m->m_pkthdr.csum_flags = 0; 247 248 /* 249 * Everything checks out and so we can forward this packet. 250 * Modify the TTL and incrementally change the checksum. 251 * 252 * This method of adding the checksum works on either endian CPU. 253 * If htons() is inlined, all the arithmetic is folded; otherwise 254 * the htons()s are combined by CSE due to the __const__ attribute. 255 * 256 * Don't bother using HW checksumming here -- the incremental 257 * update is pretty fast. 258 */ 259 ip->ip_ttl -= IPTTLDEC; 260 if (ip->ip_sum >= (uint16_t)~htons(IPTTLDEC << 8)) 261 ip->ip_sum -= ~htons(IPTTLDEC << 8); 262 else 263 ip->ip_sum += htons(IPTTLDEC << 8); 264 265 /* 266 * Trim the packet in case it's too long.. 267 */ 268 if (m->m_pkthdr.len > iplen) { 269 if (m->m_len == m->m_pkthdr.len) { 270 m->m_len = iplen; 271 m->m_pkthdr.len = iplen; 272 } else { 273 m_adj(m, iplen - m->m_pkthdr.len); 274 } 275 } 276 277 /* 278 * Send the packet on its way. All we can get back is ENOBUFS 279 */ 280 ipf->ipf_uses++; 281 ipf->ipf_timer = IPFLOW_TIMER; 282 283 if (rt->rt_flags & RTF_GATEWAY) 284 dst = rt->rt_gateway; 285 else 286 dst = &ipf->ipf_ro.ro_dst; 287 288 /* 289 * Reference count this ipflow, before the possible blocking 290 * ifnet.if_output(), so this ipflow will not be changed or 291 * reaped behind our back. 
292 */ 293 IPFLOW_REF(ipf); 294 295 error = ifp->if_output(ifp, m, dst, rt); 296 if (error) { 297 if (error == ENOBUFS) 298 ipf->ipf_dropped++; 299 else 300 ipf->ipf_errors++; 301 } 302 303 IPFLOW_FREE(ipf); 304 return 1; 305 } 306 307 static void 308 ipflow_addstats(struct ipflow *ipf) 309 { 310 ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; 311 ipstat.ips_cantforward += ipf->ipf_errors + ipf->ipf_dropped; 312 ipstat.ips_total += ipf->ipf_uses; 313 ipstat.ips_forward += ipf->ipf_uses; 314 ipstat.ips_fastforward += ipf->ipf_uses; 315 } 316 317 static void 318 ipflow_free(struct ipflow *ipf) 319 { 320 KKASSERT(ipf->ipf_refcnt == 0); 321 KKASSERT((ipf->ipf_flags & IPFLOW_FLAG_ONLIST) == 0); 322 323 KKASSERT(ipflow_inuse > 0); 324 ipflow_inuse--; 325 326 ipflow_addstats(ipf); 327 RTFREE(ipf->ipf_ro.ro_rt); 328 kfree(ipf, M_IPFLOW); 329 } 330 331 static void 332 ipflow_reset(struct ipflow *ipf) 333 { 334 ipflow_addstats(ipf); 335 RTFREE(ipf->ipf_ro.ro_rt); 336 ipf->ipf_uses = ipf->ipf_last_uses = 0; 337 ipf->ipf_errors = ipf->ipf_dropped = 0; 338 } 339 340 static struct ipflow * 341 ipflow_reap(void) 342 { 343 struct ipflow *ipf, *maybe_ipf = NULL; 344 345 LIST_FOREACH(ipf, &ipflowlist, ipf_list) { 346 /* 347 * Skip actively used ipflow 348 */ 349 if (ipf->ipf_refcnt > 1) 350 continue; 351 352 /* 353 * If this no longer points to a valid route 354 * reclaim it. 355 */ 356 if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0) 357 goto done; 358 359 /* 360 * choose the one that's been least recently used 361 * or has had the least uses in the last 1.5 362 * intervals. 
363 */ 364 if (maybe_ipf == NULL || 365 ipf->ipf_timer < maybe_ipf->ipf_timer || 366 (ipf->ipf_timer == maybe_ipf->ipf_timer && 367 ipf->ipf_last_uses + ipf->ipf_uses < 368 maybe_ipf->ipf_last_uses + maybe_ipf->ipf_uses)) 369 maybe_ipf = ipf; 370 } 371 if (maybe_ipf == NULL) 372 return NULL; 373 374 ipf = maybe_ipf; 375 done: 376 /* 377 * Remove the entry from the flow table and reset its states 378 */ 379 IPFLOW_REMOVE(ipf); 380 ipflow_reset(ipf); 381 return ipf; 382 } 383 384 static void 385 ipflow_timo_dispatch(struct netmsg *nmsg) 386 { 387 struct ipflow *ipf, *next_ipf; 388 389 crit_enter(); 390 lwkt_replymsg(&nmsg->nm_lmsg, 0); /* reply ASAP */ 391 crit_exit(); 392 393 LIST_FOREACH_MUTABLE(ipf, &ipflowlist, ipf_list, next_ipf) { 394 if (--ipf->ipf_timer == 0) { 395 IPFLOW_REMOVE(ipf); 396 IPFLOW_FREE(ipf); 397 } else { 398 ipf->ipf_last_uses = ipf->ipf_uses; 399 ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; 400 ipstat.ips_total += ipf->ipf_uses; 401 ipstat.ips_forward += ipf->ipf_uses; 402 ipstat.ips_fastforward += ipf->ipf_uses; 403 ipf->ipf_uses = 0; 404 } 405 } 406 } 407 408 static void 409 ipflow_timo_ipi(void *arg __unused) 410 { 411 struct lwkt_msg *msg = &ipflow_timo_netmsgs[mycpuid].nm_lmsg; 412 413 crit_enter(); 414 if (msg->ms_flags & MSGF_DONE) 415 lwkt_sendmsg(cpu_portfn(mycpuid), msg); 416 crit_exit(); 417 } 418 419 void 420 ipflow_slowtimo(void) 421 { 422 #ifdef SMP 423 uint32_t mask = 0; 424 int i; 425 426 for (i = 0; i < ncpus; ++i) { 427 if (ipflow_inuse_pcpu[i]) 428 mask |= 1 << i; 429 } 430 mask &= smp_active_mask; 431 if (mask != 0) 432 lwkt_send_ipiq_mask(mask, ipflow_timo_ipi, NULL); 433 #else 434 if (ipflow_inuse) 435 ipflow_timo_ipi(NULL); 436 #endif 437 } 438 439 void 440 ipflow_create(const struct route *ro, struct mbuf *m) 441 { 442 const struct ip *const ip = mtod(m, struct ip *); 443 struct ipflow *ipf; 444 unsigned hash; 445 446 /* 447 * Don't create cache entries for ICMP messages. 
448 */ 449 if (!ipflow_active || ip->ip_p == IPPROTO_ICMP) 450 return; 451 452 /* 453 * See if an existing flow struct exists. If so remove it from it's 454 * list and free the old route. If not, try to malloc a new one 455 * (if we aren't at our limit). 456 */ 457 ipf = ipflow_lookup(ip); 458 if (ipf == NULL) { 459 if (ipflow_inuse == IPFLOW_MAX) { 460 ipf = ipflow_reap(); 461 if (ipf == NULL) 462 return; 463 } else { 464 ipf = kmalloc(sizeof(*ipf), M_IPFLOW, 465 M_NOWAIT | M_ZERO); 466 if (ipf == NULL) 467 return; 468 ipf->ipf_refcnt = 1; 469 470 ipflow_inuse++; 471 } 472 } else { 473 if (ipf->ipf_refcnt == 1) { 474 IPFLOW_REMOVE(ipf); 475 ipflow_reset(ipf); 476 } else { 477 /* This ipflow is being used; don't change it */ 478 KKASSERT(ipf->ipf_refcnt > 1); 479 return; 480 } 481 } 482 /* This ipflow should not be actively used */ 483 KKASSERT(ipf->ipf_refcnt == 1); 484 485 /* 486 * Fill in the updated information. 487 */ 488 ipf->ipf_ro = *ro; 489 ro->ro_rt->rt_refcnt++; 490 ipf->ipf_dst = ip->ip_dst; 491 ipf->ipf_src = ip->ip_src; 492 ipf->ipf_tos = ip->ip_tos; 493 ipf->ipf_timer = IPFLOW_TIMER; 494 495 /* 496 * Insert into the approriate bucket of the flow table. 497 */ 498 hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); 499 IPFLOW_INSERT(&ipflowtable[hash], ipf); 500 } 501 502 static void 503 ipflow_init(void) 504 { 505 char oid_name[32]; 506 int i; 507 508 for (i = 0; i < ncpus; ++i) { 509 netmsg_init(&ipflow_timo_netmsgs[i], &netisr_adone_rport, 510 MSGF_MPSAFE, ipflow_timo_dispatch); 511 512 ksnprintf(oid_name, sizeof(oid_name), "inuse%d", i); 513 514 SYSCTL_ADD_INT(NULL, 515 SYSCTL_STATIC_CHILDREN(_net_inet_ip_ipflow), 516 OID_AUTO, oid_name, CTLFLAG_RD, &ipflow_inuse_pcpu[i], 0, 517 "# of ip flow being used"); 518 } 519 } 520 SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipflow_init, 0); 521