/*
 * Copyright 1994, 1995 Massachusetts Institute of Technology
 *
 * Permission to use, copy, modify, and distribute this software and
 * its documentation for any purpose and without fee is hereby
 * granted, provided that both the above copyright notice and this
 * permission notice appear in all copies, that both the above
 * copyright notice and this permission notice appear in all
 * supporting documentation, and that the name of M.I.T. not be used
 * in advertising or publicity pertaining to distribution of the
 * software without specific, written prior permission.  M.I.T. makes
 * no representations about the suitability of this software for any
 * purpose.  It is provided "as is" without express or implied
 * warranty.
 *
 * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
 * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  IN NO EVENT
 * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/in_rmx.c,v 1.37.2.3 2002/08/09 14:49:23 ru Exp $
 * $DragonFly: src/sys/netinet/in_rmx.c,v 1.14 2006/04/11 06:59:34 dillon Exp $
 */

/*
 * This code does two things necessary for the enhanced TCP metrics to
 * function in a useful manner:
 *  1) It marks all non-host routes as `cloning', thus ensuring that
 *     every actual reference to such a route actually gets turned
 *     into a reference to a host route to the specific destination
 *     requested.
 *  2) When such routes lose all their references, it arranges for them
 *     to be deleted in some random collection of circumstances, so that
 *     a large quantity of stale routing data is not kept in kernel memory
 *     indefinitely.  See in_rtqtimo() below for the exact mechanism.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/globaldata.h>
#include <sys/thread2.h>

#include <net/if.h>
#include <net/route.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_flow.h>

#define RTPRF_EXPIRING	RTF_PROTO3	/* set on routes we manage */

static struct callout in_rtqtimo_ch[MAXCPU];
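/*
 * Lifecycle sketch (a summary of the mechanisms below, not additional
 * behavior): a lookup that resolves through a cloning network route
 * leaves us holding an RTF_WASCLONED host route; when that route's
 * last reference is dropped, in_closeroute() marks it RTPRF_EXPIRING
 * and stamps rmx_expire; the per-cpu in_rtqtimo() callout then walks
 * the tree and has in_rtqkill() delete entries whose stamps have
 * passed.  A new reference picked up in in_matchroute() revives the
 * route by clearing both the flag and the stamp.
 */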
/*
 * Do what we need to do when inserting a route.
 */
static struct radix_node *
in_addroute(char *key, char *mask, struct radix_node_head *head,
	    struct radix_node *treenodes)
{
	struct rtentry *rt = (struct rtentry *)treenodes;
	struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
	struct radix_node *ret;
	struct in_ifaddr_container *iac;
	struct in_ifaddr *ia;

	/*
	 * For IP, mark routes to multicast addresses as such, because
	 * it's easy to do and might be useful (but this is much more
	 * dubious since it's so easy to inspect the address).
	 *
	 * For IP, all unicast non-host routes are automatically cloning.
	 */
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		rt->rt_flags |= RTF_MULTICAST;

	if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST)))
		rt->rt_flags |= RTF_PRCLONING;

	/*
	 * For host routes, we make sure that RTF_BROADCAST
	 * is set for anything that looks like a broadcast address.
	 * This way, we can avoid an expensive call to in_broadcast()
	 * in ip_output() most of the time (because the route passed
	 * to ip_output() is almost always a host route).
	 *
	 * For local routes we set RTF_LOCAL allowing various shortcuts.
	 *
	 * A cloned network route will point to one of several possible
	 * addresses if an interface has aliases and must be repointed
	 * back to the correct address or arp_rtrequest() will not properly
	 * detect the local IP.
	 */
	if (rt->rt_flags & RTF_HOST) {
		if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
			rt->rt_flags |= RTF_BROADCAST;
		} else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr ==
			   sin->sin_addr.s_addr) {
			rt->rt_flags |= RTF_LOCAL;
		} else {
			LIST_FOREACH(iac, INADDR_HASH(sin->sin_addr.s_addr),
				     ia_hash) {
				ia = iac->ia;
				if (sin->sin_addr.s_addr ==
				    ia->ia_addr.sin_addr.s_addr) {
					rt->rt_flags |= RTF_LOCAL;
					IFAREF(&ia->ia_ifa);
					IFAFREE(rt->rt_ifa);
					rt->rt_ifa = &ia->ia_ifa;
					rt->rt_ifp = rt->rt_ifa->ifa_ifp;
					break;
				}
			}
		}
	}

	/*
	 * If no MTU was set explicitly (and it is not locked), default
	 * it to the MTU of the route's interface.
	 */
	if (rt->rt_rmx.rmx_mtu == 0 && !(rt->rt_rmx.rmx_locks & RTV_MTU) &&
	    rt->rt_ifp != NULL)
		rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;

	ret = rn_addroute(key, mask, head, treenodes);
	if (ret == NULL && (rt->rt_flags & RTF_HOST)) {
		struct rtentry *oldrt;

		/*
		 * We are trying to add a host route, but can't.
		 * Find out if it is because of an ARP entry and
		 * delete it if so.
		 */
		oldrt = rtpurelookup((struct sockaddr *)sin);
		if (oldrt != NULL) {
			--oldrt->rt_refcnt;
			if ((oldrt->rt_flags & RTF_LLINFO) &&
			    (oldrt->rt_flags & RTF_HOST) &&
			    oldrt->rt_gateway &&
			    oldrt->rt_gateway->sa_family == AF_LINK) {
				rtrequest(RTM_DELETE, rt_key(oldrt),
					  oldrt->rt_gateway, rt_mask(oldrt),
					  oldrt->rt_flags, NULL);
				ret = rn_addroute(key, mask, head, treenodes);
			}
		}
	}

	/*
	 * If the new route has been created successfully, and it is
	 * not a multicast/broadcast or cloned route, then we will
	 * have to flush the ipflow.  Otherwise, we may end up using
	 * the wrong route.
	 */
	if (ret != NULL &&
	    (rt->rt_flags &
	     (RTF_MULTICAST | RTF_BROADCAST | RTF_WASCLONED)) == 0) {
		ipflow_flush_oncpu();
	}
	return ret;
}

/*
 * This code is the inverse of in_closeroute(): on first reference, if we
 * were managing the route, stop doing so and set the expiration timer
 * back off again.
 */
static struct radix_node *
in_matchroute(char *key, struct radix_node_head *head)
{
	struct radix_node *rn = rn_match(key, head);
	struct rtentry *rt = (struct rtentry *)rn;

	if (rt != NULL && rt->rt_refcnt == 0) {	/* this is first reference */
		if (rt->rt_flags & RTPRF_EXPIRING) {
			rt->rt_flags &= ~RTPRF_EXPIRING;
			rt->rt_rmx.rmx_expire = 0;
		}
	}
	return rn;
}

static int rtq_reallyold = 60*60;	/* one hour is ``really old'' */
SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW,
    &rtq_reallyold, 0,
    "Default expiration time on cloned routes");

static int rtq_minreallyold = 10;	/* never automatically crank down to less */
SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW,
    &rtq_minreallyold, 0,
    "Minimum time to attempt to hold onto cloned routes");

static int rtq_toomany = 128;		/* 128 cached routes is ``too many'' */
SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW,
    &rtq_toomany, 0, "Upper limit on cloned routes");
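/*
 * Tuning note: the three knobs above are runtime-tunable, e.g.
 * (example values only):
 *
 *	sysctl net.inet.ip.rtexpire=300
 *	sysctl net.inet.ip.rtmaxcache=256
 *
 * Setting net.inet.ip.rtexpire to 0 makes in_closeroute() below drop
 * an unreferenced cloned route immediately instead of leaving it to
 * the timeout machinery.
 */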
/*
 * On last reference drop, mark the route as belonging to us so that
 * it can be timed out.
 */
static void
in_closeroute(struct radix_node *rn, struct radix_node_head *head)
{
	struct rtentry *rt = (struct rtentry *)rn;

	if (!(rt->rt_flags & RTF_UP))
		return;		/* prophylactic measures */

	if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
		return;

	if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_EXPIRING)) != RTF_WASCLONED)
		return;

	/*
	 * As requested by David Greenman:
	 * If rtq_reallyold is 0, just delete the route without
	 * waiting for a timeout cycle to kill it.
	 */
	if (rtq_reallyold != 0) {
		rt->rt_flags |= RTPRF_EXPIRING;
		rt->rt_rmx.rmx_expire = time_second + rtq_reallyold;
	} else {
		/*
		 * Remove route from the radix tree, but defer deallocation
		 * until we return to rtfree().
		 */
		rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt),
			  rt->rt_flags, &rt);
	}
}

struct rtqk_arg {
	struct radix_node_head *rnh;
	int draining;
	int killed;
	int found;
	int updating;
	time_t nextstop;
};

/*
 * Get rid of old routes.  When draining, this deletes everything, even when
 * the timeout is not expired yet.  When updating, this makes sure that
 * nothing has a timeout longer than the current value of rtq_reallyold.
 */
static int
in_rtqkill(struct radix_node *rn, void *rock)
{
	struct rtqk_arg *ap = rock;
	struct rtentry *rt = (struct rtentry *)rn;
	int err;

	if (rt->rt_flags & RTPRF_EXPIRING) {
		ap->found++;
		if (ap->draining || rt->rt_rmx.rmx_expire <= time_second) {
			if (rt->rt_refcnt > 0)
				panic("rtqkill route really not free");

			err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
					rt_mask(rt), rt->rt_flags, NULL);
			if (err)
				log(LOG_WARNING, "in_rtqkill: error %d\n", err);
			else
				ap->killed++;
		} else {
			if (ap->updating &&
			    (rt->rt_rmx.rmx_expire - time_second >
			     rtq_reallyold)) {
				rt->rt_rmx.rmx_expire = time_second +
				    rtq_reallyold;
			}
			ap->nextstop = lmin(ap->nextstop,
					    rt->rt_rmx.rmx_expire);
		}
	}

	return 0;
}
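/*
 * Timeline example of the aging machinery: a cloned host route whose
 * last reference is dropped at time t is stamped rmx_expire =
 * t + rtq_reallyold by in_closeroute().  The in_rtqtimo() sweep below
 * deletes it once time_second passes the stamp; until then each sweep
 * clamps arg.nextstop to the earliest pending expiry, so the callout
 * is rescheduled no later than the next route actually needs it.
 */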
#define RTQ_TIMEOUT	60*10	/* run no less than once every ten minutes */
static int rtq_timeout = RTQ_TIMEOUT;

static void
in_rtqtimo(void *rock)
{
	struct radix_node_head *rnh = rock;
	struct rtqk_arg arg;
	struct timeval atv;
	static time_t last_adjusted_timeout = 0;

	arg.found = arg.killed = 0;
	arg.rnh = rnh;
	arg.nextstop = time_second + rtq_timeout;
	arg.draining = arg.updating = 0;
	crit_enter();
	rnh->rnh_walktree(rnh, in_rtqkill, &arg);
	crit_exit();

	/*
	 * Attempt to be somewhat dynamic about this:
	 * If there are ``too many'' routes sitting around taking up space,
	 * then crank down the timeout, and see if we can't make some more
	 * go away.  However, we make sure that we will never adjust more
	 * than once in rtq_timeout seconds, to keep from cranking down too
	 * hard.
	 */
	if ((arg.found - arg.killed > rtq_toomany) &&
	    (time_second - last_adjusted_timeout >= rtq_timeout) &&
	    rtq_reallyold > rtq_minreallyold) {
		rtq_reallyold = 2*rtq_reallyold / 3;
		if (rtq_reallyold < rtq_minreallyold) {
			rtq_reallyold = rtq_minreallyold;
		}

		last_adjusted_timeout = time_second;
#ifdef DIAGNOSTIC
		log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
		    rtq_reallyold);
#endif
		arg.found = arg.killed = 0;
		arg.updating = 1;
		crit_enter();
		rnh->rnh_walktree(rnh, in_rtqkill, &arg);
		crit_exit();
	}

	atv.tv_usec = 0;
	atv.tv_sec = arg.nextstop - time_second;
	callout_reset(&in_rtqtimo_ch[mycpuid], tvtohz_high(&atv), in_rtqtimo,
		      rock);
}
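/*
 * Worked example of the adjustment above, using the default settings:
 * whenever a sweep leaves more than rtq_toomany (128) unreferenced
 * clones, rtq_reallyold is cut to 2/3 of its value, at most once per
 * rtq_timeout (600) seconds:
 *
 *	3600 -> 2400 -> 1600 -> 1066 -> 710 -> ...
 *
 * bottoming out at rtq_minreallyold (10 seconds by default).
 */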
void
in_rtqdrain(void)
{
	struct radix_node_head *rnh = rt_tables[mycpuid][AF_INET];
	struct rtqk_arg arg;

	arg.found = arg.killed = 0;
	arg.rnh = rnh;
	arg.nextstop = 0;
	arg.draining = 1;
	arg.updating = 0;
	crit_enter();
	rnh->rnh_walktree(rnh, in_rtqkill, &arg);
	crit_exit();
}

/*
 * Initialize our routing tree.
 */
int
in_inithead(void **head, int off)
{
	struct radix_node_head *rnh;

	if (!rn_inithead(head, rn_cpumaskhead(mycpuid), off))
		return 0;

	if (head != (void **)&rt_tables[mycpuid][AF_INET])	/* BOGUS! */
		return 1;	/* only do this for the real routing table */

	rnh = *head;
	rnh->rnh_addaddr = in_addroute;
	rnh->rnh_matchaddr = in_matchroute;
	rnh->rnh_close = in_closeroute;
	callout_init(&in_rtqtimo_ch[mycpuid]);
	in_rtqtimo(rnh);	/* kick off timeout first time */
	return 1;
}

/*
 * This zaps old routes when the interface goes down or the interface
 * address is deleted.  In the latter case, it deletes static routes
 * that point to this address.  If we don't do this, we may end up
 * using the old address in the future.  The ones we always want to
 * get rid of are things like ARP entries, since the user might down
 * the interface, walk over to a completely different network, and
 * plug back in.
 *
 * in_ifadown() is typically called when an interface is being brought
 * down.  We must iterate through all per-cpu route tables and clean
 * them up.
 */
struct in_ifadown_arg {
	struct radix_node_head *rnh;
	struct ifaddr *ifa;
	int del;
};

static int
in_ifadownkill(struct radix_node *rn, void *xap)
{
	struct in_ifadown_arg *ap = xap;
	struct rtentry *rt = (struct rtentry *)rn;
	int err;

	if (rt->rt_ifa == ap->ifa &&
	    (ap->del || !(rt->rt_flags & RTF_STATIC))) {
		/*
		 * We need to disable the automatic prune that happens
		 * in this case in rtrequest() because it will blow
		 * away the pointers that rn_walktree() needs in order
		 * to continue our descent.  We will end up deleting all
		 * the routes that rtrequest() would have in any case,
		 * so that behavior is not needed there.
		 */
		rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING);
		err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
				rt_mask(rt), rt->rt_flags, NULL);
		if (err)
			log(LOG_WARNING, "in_ifadownkill: error %d\n", err);
	}
	return 0;
}

int
in_ifadown(struct ifaddr *ifa, int delete)
{
	struct in_ifadown_arg arg;
	struct radix_node_head *rnh;
	int origcpu;
	int cpu;

	if (ifa->ifa_addr->sa_family != AF_INET)
		return 1;

	/*
	 * XXX individual requests are not independently chained,
	 * which means that the per-cpu route tables will not be
	 * consistent in the middle of the operation.  If routes
	 * related to the interface are manipulated while we are
	 * doing this the inconsistency could trigger a panic.
	 */
	origcpu = mycpuid;
	for (cpu = 0; cpu < ncpus2; cpu++) {
		lwkt_migratecpu(cpu);

		arg.rnh = rnh = rt_tables[cpu][AF_INET];
		arg.ifa = ifa;
		arg.del = delete;
		rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
		ifa->ifa_flags &= ~IFA_ROUTE;
	}
	lwkt_migratecpu(origcpu);
	return 0;
}
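/*
 * Caller-side sketch (illustrative only; the actual call sites live in
 * the interface-address code): when an address is being removed, a
 * caller would invoke something like
 *
 *	in_ifadown(&ia->ia_ifa, 1);
 *
 * which walks every per-cpu AF_INET table and deletes the routes still
 * pointing at the address; with a non-zero `delete' argument even
 * RTF_STATIC routes are removed.
 */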