1 /* 2 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1991, 1993, 1995 36 * The Regents of the University of California. All rights reserved. 37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. Neither the name of the University nor the names of its contributors 47 * may be used to endorse or promote products derived from this software 48 * without specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 * 62 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 63 * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $ 64 */ 65 66 #include "opt_inet6.h" 67 68 #include <sys/param.h> 69 #include <sys/systm.h> 70 #include <sys/malloc.h> 71 #include <sys/mbuf.h> 72 #include <sys/domain.h> 73 #include <sys/protosw.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/proc.h> 77 #include <sys/priv.h> 78 #include <sys/jail.h> 79 #include <sys/kernel.h> 80 #include <sys/sysctl.h> 81 82 #include <sys/thread2.h> 83 #include <sys/socketvar2.h> 84 #include <sys/msgport2.h> 85 86 #include <machine/limits.h> 87 88 #include <net/if.h> 89 #include <net/if_types.h> 90 #include <net/route.h> 91 #include <net/netisr2.h> 92 #include <net/toeplitz2.h> 93 94 #include <netinet/in.h> 95 #include <netinet/in_pcb.h> 96 #include <netinet/in_var.h> 97 #include <netinet/ip_var.h> 98 #ifdef INET6 99 #include <netinet/ip6.h> 100 #include <netinet6/ip6_var.h> 101 #endif /* INET6 */ 102 103 #define INP_LOCALGROUP_SIZMIN 8 104 #define INP_LOCALGROUP_SIZMAX 256 105 106 static struct inpcb *in_pcblookup_local(struct inpcbporthead *porthash, 107 struct in_addr laddr, u_int lport_arg, int wild_okay, 108 struct ucred *cred); 109 110 struct in_addr zeroin_addr; 111 112 /* 113 * These configure the range of local port addresses assigned to 114 * "unspecified" outgoing connections/packets/whatever. 115 */ 116 int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ 117 int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ 118 119 int ipport_firstauto = IPPORT_RESERVED; /* 1024 */ 120 int ipport_lastauto = IPPORT_USERRESERVED; /* 5000 */ 121 122 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ 123 int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ 124 125 #define RANGECHK(var, min, max) \ 126 if ((var) < (min)) { (var) = (min); } \ 127 else if ((var) > (max)) { (var) = (max); } 128 129 int udpencap_enable = 1; /* enabled by default */ 130 int udpencap_port = 4500; /* triggers decapsulation */ 131 132 /* 133 * Per-netisr inpcb markers. 134 * NOTE: they should only be used in netisrs. 135 */ 136 static struct inpcb *in_pcbmarkers; 137 static struct inpcontainer *in_pcbcontainer_markers; 138 139 static int 140 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 141 { 142 int error; 143 144 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); 145 if (!error) { 146 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 147 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 148 149 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX); 150 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX); 151 152 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX); 153 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX); 154 } 155 return (error); 156 } 157 158 #undef RANGECHK 159 160 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); 161 162 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, 163 &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); 164 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, 165 &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); 166 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, 167 &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); 168 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, 169 &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); 170 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, 171 &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); 172 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, 173 &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); 174 175 /* Initialized by ip_init() */ 176 int ip_porthash_trycount; 177 SYSCTL_INT(_net_inet_ip, OID_AUTO, porthash_trycount, CTLFLAG_RW, 178 &ip_porthash_trycount, 0, 179 "Number of tries to find local port matching hash of 4-tuple"); 180 181 /* 182 * in_pcb.c: manage the Protocol Control Blocks. 183 * 184 * NOTE: It is assumed that most of these functions will be called from 185 * a critical section. XXX - There are, unfortunately, a few exceptions 186 * to this rule that should be fixed. 187 * 188 * NOTE: The caller should initialize the cpu field to the cpu running the 189 * protocol stack associated with this inpcbinfo. 190 */ 191 192 void 193 in_pcbinfo_init(struct inpcbinfo *pcbinfo, int cpu, boolean_t shared) 194 { 195 KASSERT(cpu >= 0 && cpu < netisr_ncpus, ("invalid cpu%d", cpu)); 196 pcbinfo->cpu = cpu; 197 198 LIST_INIT(&pcbinfo->pcblisthead); 199 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB, 200 M_WAITOK | M_ZERO); 201 202 if (shared) { 203 pcbinfo->infotoken = kmalloc(sizeof(struct lwkt_token), 204 M_PCB, M_WAITOK); 205 lwkt_token_init(pcbinfo->infotoken, "infotoken"); 206 } else { 207 pcbinfo->infotoken = NULL; 208 } 209 } 210 211 void 212 in_pcbportinfo_set(struct inpcbinfo *pcbinfo, struct inpcbportinfo *portinfo, 213 int portinfo_cnt) 214 { 215 216 KASSERT(portinfo_cnt > 0, ("invalid portinfo_cnt %d", portinfo_cnt)); 217 pcbinfo->portinfo = portinfo; 218 pcbinfo->portinfo_cnt = portinfo_cnt; 219 } 220 221 struct baddynamicports baddynamicports; 222 223 /* 224 * Check if the specified port is invalid for dynamic allocation. 225 */ 226 int 227 in_baddynamic(u_int16_t port, u_int16_t proto) 228 { 229 switch (proto) { 230 case IPPROTO_TCP: 231 return (DP_ISSET(baddynamicports.tcp, port)); 232 case IPPROTO_UDP: 233 return (DP_ISSET(baddynamicports.udp, port)); 234 default: 235 return (0); 236 } 237 } 238 239 void 240 in_pcbonlist(struct inpcb *inp) 241 { 242 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 243 244 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 245 ("not in the correct netisr")); 246 KASSERT((inp->inp_flags & INP_ONLIST) == 0, ("already on pcblist")); 247 inp->inp_flags |= INP_ONLIST; 248 249 GET_PCBINFO_TOKEN(pcbinfo); 250 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list); 251 pcbinfo->ipi_count++; 252 REL_PCBINFO_TOKEN(pcbinfo); 253 } 254 255 void 256 in_pcbofflist(struct inpcb *inp) 257 { 258 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 259 260 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 261 ("not in the correct netisr")); 262 KASSERT(inp->inp_flags & INP_ONLIST, ("not on pcblist")); 263 inp->inp_flags &= ~INP_ONLIST; 264 265 GET_PCBINFO_TOKEN(pcbinfo); 266 LIST_REMOVE(inp, inp_list); 267 KASSERT(pcbinfo->ipi_count > 0, 268 ("invalid inpcb count %d", pcbinfo->ipi_count)); 269 pcbinfo->ipi_count--; 270 REL_PCBINFO_TOKEN(pcbinfo); 271 } 272 273 /* 274 * Allocate a PCB and associate it with the socket. 275 */ 276 int 277 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 278 { 279 struct inpcb *inp; 280 281 inp = kmalloc(pcbinfo->ipi_size, M_PCB, M_WAITOK|M_ZERO|M_NULLOK); 282 if (inp == NULL) 283 return (ENOMEM); 284 inp->inp_lgrpindex = -1; 285 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 286 inp->inp_pcbinfo = pcbinfo; 287 inp->inp_socket = so; 288 #ifdef INET6 289 if (INP_CHECK_SOCKAF(so, AF_INET6)) { 290 if (ip6_auto_flowlabel) 291 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 292 inp->inp_af = AF_INET6; 293 } else 294 #endif 295 inp->inp_af = AF_INET; 296 soreference(so); 297 so->so_pcb = inp; 298 299 in_pcbonlist(inp); 300 return (0); 301 } 302 303 /* 304 * Unlink a pcb with the intention of moving it to another cpu with a 305 * different pcbinfo. While unlinked nothing should attempt to dereference 306 * inp_pcbinfo, NULL it out so we assert if it does. 307 */ 308 void 309 in_pcbunlink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags) 310 { 311 KASSERT(inp->inp_pcbinfo == pcbinfo, ("pcbinfo mismatch")); 312 KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0, 313 ("already linked")); 314 315 in_pcbofflist(inp); 316 inp->inp_pcbinfo = NULL; 317 } 318 319 void 320 in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo) 321 { 322 in_pcbunlink_flags(inp, pcbinfo, INP_WILDCARD); 323 } 324 325 /* 326 * Relink a pcb into a new pcbinfo. 327 */ 328 void 329 in_pcblink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags) 330 { 331 KASSERT(inp->inp_pcbinfo == NULL, ("has pcbinfo")); 332 KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0, 333 ("already linked")); 334 335 inp->inp_pcbinfo = pcbinfo; 336 in_pcbonlist(inp); 337 } 338 339 void 340 in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo) 341 { 342 return in_pcblink_flags(inp, pcbinfo, INP_WILDCARD); 343 } 344 345 static boolean_t 346 in_pcbporthash_update(struct inpcbportinfo *portinfo, 347 struct inpcb *inp, u_short lport, struct ucred *cred, int wild) 348 { 349 struct inpcbporthead *porthash; 350 351 /* 352 * This has to be atomic. If the porthash is shared across multiple 353 * protocol threads, e.g. tcp and udp, then the token must be held. 354 */ 355 porthash = in_pcbporthash_head(portinfo, lport); 356 GET_PORTHASH_TOKEN(porthash); 357 358 if (in_pcblookup_local(porthash, inp->inp_laddr, lport, 359 wild, cred) != NULL) { 360 REL_PORTHASH_TOKEN(porthash); 361 return FALSE; 362 } 363 inp->inp_lport = lport; 364 in_pcbinsporthash(porthash, inp); 365 366 REL_PORTHASH_TOKEN(porthash); 367 return TRUE; 368 } 369 370 static int 371 in_pcbsetlport(struct inpcb *inp, int wild, struct ucred *cred) 372 { 373 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 374 struct inpcbportinfo *portinfo; 375 u_short first, last, lport, step, first0, last0; 376 int count, error; 377 int portinfo_first, portinfo_idx; 378 uint32_t cut; 379 380 inp->inp_flags |= INP_ANONPORT; 381 382 step = pcbinfo->portinfo_cnt; 383 portinfo_first = mycpuid % pcbinfo->portinfo_cnt; 384 portinfo_idx = portinfo_first; 385 386 if (inp->inp_flags & INP_HIGHPORT) { 387 first0 = ipport_hifirstauto; /* sysctl */ 388 last0 = ipport_hilastauto; 389 } else if (inp->inp_flags & INP_LOWPORT) { 390 if (cred && 391 (error = 392 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) { 393 inp->inp_laddr.s_addr = INADDR_ANY; 394 return error; 395 } 396 first0 = ipport_lowfirstauto; /* 1023 */ 397 last0 = ipport_lowlastauto; /* 600 */ 398 } else { 399 first0 = ipport_firstauto; /* sysctl */ 400 last0 = ipport_lastauto; 401 } 402 if (first0 > last0) { 403 lport = last0; 404 last0 = first0; 405 first0 = lport; 406 } 407 KKASSERT(last0 >= first0); 408 409 cut = karc4random(); 410 loop: 411 portinfo = &pcbinfo->portinfo[portinfo_idx]; 412 first = first0; 413 last = last0; 414 415 /* 416 * Simple check to ensure all ports are not used up causing 417 * a deadlock here. 418 */ 419 in_pcbportrange(&last, &first, portinfo->offset, step); 420 lport = last - first; 421 count = lport / step; 422 423 lport = rounddown(cut % lport, step) + first; 424 KKASSERT(lport % step == portinfo->offset); 425 426 for (;;) { 427 if (count-- < 0) { /* completely used? */ 428 error = EADDRNOTAVAIL; 429 break; 430 } 431 432 if (__predict_false(lport < first || lport > last)) { 433 lport = first; 434 KKASSERT(lport % step == portinfo->offset); 435 } 436 437 if (in_pcbporthash_update(portinfo, inp, htons(lport), 438 cred, wild)) { 439 error = 0; 440 break; 441 } 442 443 lport += step; 444 KKASSERT(lport % step == portinfo->offset); 445 } 446 447 if (error) { 448 /* Try next portinfo */ 449 portinfo_idx++; 450 portinfo_idx %= pcbinfo->portinfo_cnt; 451 if (portinfo_idx != portinfo_first) 452 goto loop; 453 inp->inp_laddr.s_addr = INADDR_ANY; 454 } 455 return error; 456 } 457 458 int 459 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td) 460 { 461 struct socket *so = inp->inp_socket; 462 struct sockaddr_in jsin; 463 struct ucred *cred = NULL; 464 int wild = 0; 465 466 if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */ 467 return (EADDRNOTAVAIL); 468 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 469 return (EINVAL); /* already bound */ 470 471 if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT))) 472 wild = 1; /* neither SO_REUSEADDR nor SO_REUSEPORT is set */ 473 if (td->td_proc) 474 cred = td->td_proc->p_ucred; 475 476 if (nam != NULL) { 477 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 478 struct inpcbinfo *pcbinfo; 479 struct inpcbportinfo *portinfo; 480 struct inpcbporthead *porthash; 481 struct inpcb *t; 482 u_short lport, lport_ho; 483 int reuseport = (so->so_options & SO_REUSEPORT); 484 int error; 485 486 if (nam->sa_len != sizeof *sin) 487 return (EINVAL); 488 #ifdef notdef 489 /* 490 * We should check the family, but old programs 491 * incorrectly fail to initialize it. 492 */ 493 if (sin->sin_family != AF_INET) 494 return (EAFNOSUPPORT); 495 #endif 496 if (!prison_replace_wildcards(td, nam)) 497 return (EINVAL); 498 499 lport = sin->sin_port; 500 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 501 /* 502 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 503 * allow complete duplication of binding if 504 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 505 * and a multicast address is bound on both 506 * new and duplicated sockets. 507 */ 508 if (so->so_options & SO_REUSEADDR) 509 reuseport = SO_REUSEADDR | SO_REUSEPORT; 510 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 511 sin->sin_port = 0; /* yech... */ 512 bzero(&sin->sin_zero, sizeof sin->sin_zero); 513 if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL) 514 return (EADDRNOTAVAIL); 515 } 516 517 inp->inp_laddr = sin->sin_addr; 518 519 jsin.sin_family = AF_INET; 520 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr; 521 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) { 522 inp->inp_laddr.s_addr = INADDR_ANY; 523 return (EINVAL); 524 } 525 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr; 526 527 if (lport == 0) { 528 /* Auto-select local port */ 529 return in_pcbsetlport(inp, wild, cred); 530 } 531 lport_ho = ntohs(lport); 532 533 /* GROSS */ 534 if (lport_ho < IPPORT_RESERVED && cred && 535 (error = 536 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) { 537 inp->inp_laddr.s_addr = INADDR_ANY; 538 return (error); 539 } 540 541 /* 542 * Locate the proper portinfo based on lport 543 */ 544 pcbinfo = inp->inp_pcbinfo; 545 portinfo = 546 &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt]; 547 KKASSERT((lport_ho % pcbinfo->portinfo_cnt) == 548 portinfo->offset); 549 550 /* 551 * This has to be atomic. If the porthash is shared across 552 * multiple protocol threads, e.g. tcp and udp then the token 553 * must be held. 554 */ 555 porthash = in_pcbporthash_head(portinfo, lport); 556 GET_PORTHASH_TOKEN(porthash); 557 558 if (so->so_cred->cr_uid != 0 && 559 !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 560 t = in_pcblookup_local(porthash, sin->sin_addr, lport, 561 INPLOOKUP_WILDCARD, cred); 562 if (t && 563 (so->so_cred->cr_uid != 564 t->inp_socket->so_cred->cr_uid)) { 565 inp->inp_laddr.s_addr = INADDR_ANY; 566 error = EADDRINUSE; 567 goto done; 568 } 569 } 570 if (cred && !prison_replace_wildcards(td, nam)) { 571 inp->inp_laddr.s_addr = INADDR_ANY; 572 error = EADDRNOTAVAIL; 573 goto done; 574 } 575 t = in_pcblookup_local(porthash, sin->sin_addr, lport, 576 wild, cred); 577 if (t && !(reuseport & t->inp_socket->so_options)) { 578 inp->inp_laddr.s_addr = INADDR_ANY; 579 error = EADDRINUSE; 580 goto done; 581 } 582 inp->inp_lport = lport; 583 in_pcbinsporthash(porthash, inp); 584 error = 0; 585 done: 586 REL_PORTHASH_TOKEN(porthash); 587 return (error); 588 } else { 589 jsin.sin_family = AF_INET; 590 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr; 591 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) { 592 inp->inp_laddr.s_addr = INADDR_ANY; 593 return (EINVAL); 594 } 595 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr; 596 597 return in_pcbsetlport(inp, wild, cred); 598 } 599 } 600 601 static struct inpcb * 602 in_pcblookup_localremote(struct inpcbporthead *porthash, struct in_addr laddr, 603 u_short lport, struct in_addr faddr, u_short fport, struct ucred *cred) 604 { 605 struct inpcb *inp; 606 struct inpcbport *phd; 607 struct inpcb *match = NULL; 608 609 /* 610 * If the porthashbase is shared across several cpus, it must 611 * have been locked. 612 */ 613 ASSERT_PORTHASH_TOKEN_HELD(porthash); 614 615 /* 616 * Best fit PCB lookup. 617 * 618 * First see if this local port is in use by looking on the 619 * port hash list. 620 */ 621 LIST_FOREACH(phd, porthash, phd_hash) { 622 if (phd->phd_port == lport) 623 break; 624 } 625 if (phd != NULL) { 626 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 627 #ifdef INET6 628 if (!INP_ISIPV4(inp)) 629 continue; 630 #endif 631 if (inp->inp_laddr.s_addr != INADDR_ANY && 632 inp->inp_laddr.s_addr != laddr.s_addr) 633 continue; 634 635 if (inp->inp_faddr.s_addr != INADDR_ANY && 636 inp->inp_faddr.s_addr != faddr.s_addr) 637 continue; 638 639 if (inp->inp_fport != 0 && inp->inp_fport != fport) 640 continue; 641 642 if (cred == NULL || 643 cred->cr_prison == 644 inp->inp_socket->so_cred->cr_prison) { 645 match = inp; 646 break; 647 } 648 } 649 } 650 return (match); 651 } 652 653 static boolean_t 654 in_pcbporthash_update4(struct inpcbportinfo *portinfo, 655 struct inpcb *inp, u_short lport, const struct sockaddr_in *sin, 656 struct ucred *cred) 657 { 658 struct inpcbporthead *porthash; 659 660 /* 661 * This has to be atomic. If the porthash is shared across multiple 662 * protocol threads, e.g. tcp and udp, then the token must be held. 663 */ 664 porthash = in_pcbporthash_head(portinfo, lport); 665 GET_PORTHASH_TOKEN(porthash); 666 667 if (in_pcblookup_localremote(porthash, inp->inp_laddr, 668 lport, sin->sin_addr, sin->sin_port, cred) != NULL) { 669 REL_PORTHASH_TOKEN(porthash); 670 return FALSE; 671 } 672 inp->inp_lport = lport; 673 in_pcbinsporthash(porthash, inp); 674 675 REL_PORTHASH_TOKEN(porthash); 676 return TRUE; 677 } 678 679 int 680 in_pcbbind_remote(struct inpcb *inp, const struct sockaddr *remote, 681 struct thread *td) 682 { 683 struct proc *p = td->td_proc; 684 const struct sockaddr_in *sin = (const struct sockaddr_in *)remote; 685 struct sockaddr_in jsin; 686 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 687 struct ucred *cred = NULL; 688 u_short first, last, lport; 689 int count, hash_count; 690 int error, selfconn = 0; 691 int cpuid = mycpuid; 692 uint32_t hash_base = 0, hash; 693 694 ASSERT_NETISR_NCPUS(cpuid); 695 696 if (TAILQ_EMPTY(&in_ifaddrheads[cpuid])) /* XXX broken! */ 697 return (EADDRNOTAVAIL); 698 699 KKASSERT(inp->inp_laddr.s_addr != INADDR_ANY); 700 if (inp->inp_lport != 0) 701 return (EINVAL); /* already bound */ 702 703 KKASSERT(p); 704 cred = p->p_ucred; 705 706 jsin.sin_family = AF_INET; 707 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr; 708 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) { 709 inp->inp_laddr.s_addr = INADDR_ANY; 710 return (EINVAL); 711 } 712 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr; 713 714 hash_count = ip_porthash_trycount; 715 if (hash_count > 0) { 716 hash_base = toeplitz_piecemeal_addr(sin->sin_addr.s_addr) ^ 717 toeplitz_piecemeal_addr(inp->inp_laddr.s_addr) ^ 718 toeplitz_piecemeal_port(sin->sin_port); 719 } else { 720 hash_count = 0; 721 } 722 723 inp->inp_flags |= INP_ANONPORT; 724 725 if (inp->inp_flags & INP_HIGHPORT) { 726 first = ipport_hifirstauto; /* sysctl */ 727 last = ipport_hilastauto; 728 } else if (inp->inp_flags & INP_LOWPORT) { 729 if (cred && 730 (error = 731 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) { 732 inp->inp_laddr.s_addr = INADDR_ANY; 733 return (error); 734 } 735 first = ipport_lowfirstauto; /* 1023 */ 736 last = ipport_lowlastauto; /* 600 */ 737 } else { 738 first = ipport_firstauto; /* sysctl */ 739 last = ipport_lastauto; 740 } 741 if (first > last) { 742 lport = last; 743 last = first; 744 first = lport; 745 } 746 KKASSERT(last >= first); 747 748 count = last - first; 749 lport = (karc4random() % count) + first; 750 count += hash_count; 751 752 /* 753 * Simple check to ensure all ports are not used up causing 754 * a deadlock here. 755 */ 756 for (;;) { 757 u_short lport_no; 758 759 if (count-- < 0) { /* completely used? */ 760 error = EADDRNOTAVAIL; 761 break; 762 } 763 764 if (__predict_false(lport < first || lport > last)) 765 lport = first; 766 lport_no = htons(lport); 767 768 /* This could happen on loopback interface */ 769 if (__predict_false(sin->sin_port == lport_no && 770 sin->sin_addr.s_addr == inp->inp_laddr.s_addr)) { 771 if (!selfconn) { 772 ++count; /* don't count this try */ 773 selfconn = 1; 774 } 775 goto next; 776 } 777 778 if (hash_count) { 779 --hash_count; 780 hash = hash_base ^ 781 toeplitz_piecemeal_port(lport_no); 782 if (netisr_hashcpu(hash) != cpuid && hash_count) 783 goto next; 784 } 785 786 if (in_pcbporthash_update4( 787 &pcbinfo->portinfo[lport % pcbinfo->portinfo_cnt], 788 inp, lport_no, sin, cred)) { 789 error = 0; 790 break; 791 } 792 next: 793 ++lport; 794 } 795 796 if (error) 797 inp->inp_laddr.s_addr = INADDR_ANY; 798 return (error); 799 } 800 801 /* 802 * Transform old in_pcbconnect() into an inner subroutine for new 803 * in_pcbconnect(): Do some validity-checking on the remote 804 * address (in mbuf 'nam') and then determine local host address 805 * (i.e., which interface) to use to access that remote host. 806 * 807 * This preserves definition of in_pcbconnect(), while supporting a 808 * slightly different version for T/TCP. (This is more than 809 * a bit of a kludge, but cleaning up the internal interfaces would 810 * have forced minor changes in every protocol). 811 */ 812 int 813 in_pcbladdr_find(struct inpcb *inp, struct sockaddr *nam, 814 struct sockaddr_in **plocal_sin, struct thread *td, int find) 815 { 816 struct in_ifaddr *ia; 817 struct ucred *cred = NULL; 818 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 819 struct sockaddr *jsin; 820 int jailed = 0, alloc_route = 0; 821 822 if (nam->sa_len != sizeof *sin) 823 return (EINVAL); 824 if (sin->sin_family != AF_INET) 825 return (EAFNOSUPPORT); 826 if (sin->sin_port == 0) 827 return (EADDRNOTAVAIL); 828 if (td && td->td_proc && td->td_proc->p_ucred) 829 cred = td->td_proc->p_ucred; 830 if (cred && cred->cr_prison) 831 jailed = 1; 832 if (!TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) { 833 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia; 834 /* 835 * If the destination address is INADDR_ANY, 836 * use the primary local address. 837 * If the supplied address is INADDR_BROADCAST, 838 * and the primary interface supports broadcast, 839 * choose the broadcast address for that interface. 840 */ 841 if (sin->sin_addr.s_addr == INADDR_ANY) 842 sin->sin_addr = IA_SIN(ia)->sin_addr; 843 else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST && 844 (ia->ia_ifp->if_flags & IFF_BROADCAST)) 845 sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr; 846 } 847 if (find) { 848 struct route *ro; 849 850 ia = NULL; 851 /* 852 * If route is known or can be allocated now, 853 * our src addr is taken from the i/f, else punt. 854 * Note that we should check the address family of the cached 855 * destination, in case of sharing the cache with IPv6. 856 */ 857 ro = &inp->inp_route; 858 if (ro->ro_rt && 859 (!(ro->ro_rt->rt_flags & RTF_UP) || 860 ro->ro_dst.sa_family != AF_INET || 861 satosin(&ro->ro_dst)->sin_addr.s_addr != 862 sin->sin_addr.s_addr || 863 inp->inp_socket->so_options & SO_DONTROUTE)) { 864 RTFREE(ro->ro_rt); 865 ro->ro_rt = NULL; 866 } 867 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/ 868 (ro->ro_rt == NULL || 869 ro->ro_rt->rt_ifp == NULL)) { 870 /* No route yet, so try to acquire one */ 871 bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); 872 ro->ro_dst.sa_family = AF_INET; 873 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 874 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = 875 sin->sin_addr; 876 rtalloc(ro); 877 alloc_route = 1; 878 } 879 /* 880 * If we found a route, use the address 881 * corresponding to the outgoing interface 882 * unless it is the loopback (in case a route 883 * to our address on another net goes to loopback). 884 */ 885 if (ro->ro_rt && 886 !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) { 887 if (jailed) { 888 if (jailed_ip(cred->cr_prison, 889 ro->ro_rt->rt_ifa->ifa_addr)) { 890 ia = ifatoia(ro->ro_rt->rt_ifa); 891 } 892 } else { 893 ia = ifatoia(ro->ro_rt->rt_ifa); 894 } 895 } 896 if (ia == NULL) { 897 u_short fport = sin->sin_port; 898 899 sin->sin_port = 0; 900 ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin))); 901 if (ia && jailed && !jailed_ip(cred->cr_prison, 902 sintosa(&ia->ia_addr))) 903 ia = NULL; 904 if (ia == NULL) 905 ia = ifatoia(ifa_ifwithnet(sintosa(sin))); 906 if (ia && jailed && !jailed_ip(cred->cr_prison, 907 sintosa(&ia->ia_addr))) 908 ia = NULL; 909 sin->sin_port = fport; 910 if (ia == NULL && 911 !TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) 912 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia; 913 if (ia && jailed && !jailed_ip(cred->cr_prison, 914 sintosa(&ia->ia_addr))) 915 ia = NULL; 916 917 if (!jailed && ia == NULL) 918 goto fail; 919 } 920 /* 921 * If the destination address is multicast and an outgoing 922 * interface has been set as a multicast option, use the 923 * address of that interface as our source address. 924 */ 925 if (!jailed && IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && 926 inp->inp_moptions != NULL) { 927 struct ip_moptions *imo; 928 struct ifnet *ifp; 929 930 imo = inp->inp_moptions; 931 if ((ifp = imo->imo_multicast_ifp) != NULL) { 932 struct in_ifaddr_container *iac; 933 934 ia = NULL; 935 TAILQ_FOREACH(iac, 936 &in_ifaddrheads[mycpuid], ia_link) { 937 if (iac->ia->ia_ifp == ifp) { 938 ia = iac->ia; 939 break; 940 } 941 } 942 if (ia == NULL) 943 goto fail; 944 } 945 } 946 /* 947 * Don't do pcblookup call here; return interface in plocal_sin 948 * and exit to caller, that will do the lookup. 949 */ 950 if (ia == NULL && jailed) { 951 if ((jsin = prison_get_nonlocal( 952 cred->cr_prison, AF_INET, NULL)) != NULL || 953 (jsin = prison_get_local( 954 cred->cr_prison, AF_INET, NULL)) != NULL) { 955 *plocal_sin = satosin(jsin); 956 } else { 957 /* IPv6 only Jail */ 958 goto fail; 959 } 960 } else { 961 *plocal_sin = &ia->ia_addr; 962 } 963 } 964 return (0); 965 fail: 966 if (alloc_route) 967 in_pcbresetroute(inp); 968 return (EADDRNOTAVAIL); 969 } 970 971 int 972 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, 973 struct sockaddr_in **plocal_sin, struct thread *td) 974 { 975 return in_pcbladdr_find(inp, nam, plocal_sin, td, 976 (inp->inp_laddr.s_addr == INADDR_ANY)); 977 } 978 979 /* 980 * Outer subroutine: 981 * Connect from a socket to a specified address. 982 * Both address and port must be specified in argument sin. 983 * If don't have a local address for this socket yet, 984 * then pick one. 985 */ 986 int 987 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td) 988 { 989 struct sockaddr_in *if_sin; 990 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 991 int error; 992 993 if_sin = NULL; /* avoid gcc warnings */ 994 995 /* Call inner routine to assign local interface address. */ 996 if ((error = in_pcbladdr(inp, nam, &if_sin, td)) != 0) 997 return (error); 998 999 if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, 1000 inp->inp_laddr.s_addr ? 1001 inp->inp_laddr : if_sin->sin_addr, 1002 inp->inp_lport, FALSE, NULL) != NULL) { 1003 return (EADDRINUSE); 1004 } 1005 if (inp->inp_laddr.s_addr == INADDR_ANY) { 1006 if (inp->inp_lport == 0) { 1007 error = in_pcbbind(inp, NULL, td); 1008 if (error) 1009 return (error); 1010 } 1011 inp->inp_laddr = if_sin->sin_addr; 1012 } 1013 inp->inp_faddr = sin->sin_addr; 1014 inp->inp_fport = sin->sin_port; 1015 in_pcbinsconnhash(inp); 1016 return (0); 1017 } 1018 1019 void 1020 in_pcbdisconnect(struct inpcb *inp) 1021 { 1022 1023 in_pcbremconnhash(inp); 1024 inp->inp_faddr.s_addr = INADDR_ANY; 1025 inp->inp_fport = 0; 1026 } 1027 1028 void 1029 in_pcbdetach(struct inpcb *inp) 1030 { 1031 struct socket *so = inp->inp_socket; 1032 struct inpcbinfo *ipi = inp->inp_pcbinfo; 1033 1034 inp->inp_gencnt = ++ipi->ipi_gencnt; 1035 KKASSERT((so->so_state & SS_ASSERTINPROG) == 0); 1036 in_pcbremlists(inp); 1037 so->so_pcb = NULL; 1038 sofree(so); /* remove pcb ref */ 1039 if (inp->inp_options) 1040 m_free(inp->inp_options); 1041 if (inp->inp_route.ro_rt) 1042 rtfree(inp->inp_route.ro_rt); 1043 ip_freemoptions(inp->inp_moptions); 1044 kfree(inp, M_PCB); 1045 } 1046 1047 /* 1048 * The socket may have an invalid PCB, i.e. NULL. For example, a TCP 1049 * socket received RST. 1050 */ 1051 static int 1052 in_setsockaddr(struct socket *so, struct sockaddr **nam) 1053 { 1054 struct inpcb *inp; 1055 struct sockaddr_in *sin; 1056 1057 KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr")); 1058 inp = so->so_pcb; 1059 if (!inp) 1060 return (ECONNRESET); 1061 1062 sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); 1063 sin->sin_family = AF_INET; 1064 sin->sin_len = sizeof *sin; 1065 sin->sin_port = inp->inp_lport; 1066 sin->sin_addr = inp->inp_laddr; 1067 1068 *nam = (struct sockaddr *)sin; 1069 return (0); 1070 } 1071 1072 void 1073 in_setsockaddr_dispatch(netmsg_t msg) 1074 { 1075 int error; 1076 1077 error = in_setsockaddr(msg->base.nm_so, msg->peeraddr.nm_nam); 1078 lwkt_replymsg(&msg->lmsg, error); 1079 } 1080 1081 /* 1082 * The socket may have an invalid PCB, i.e. NULL. For example, a TCP 1083 * socket received RST. 1084 */ 1085 int 1086 in_setpeeraddr(struct socket *so, struct sockaddr **nam) 1087 { 1088 struct inpcb *inp; 1089 struct sockaddr_in *sin; 1090 1091 KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr")); 1092 inp = so->so_pcb; 1093 if (!inp) 1094 return (ECONNRESET); 1095 1096 sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); 1097 sin->sin_family = AF_INET; 1098 sin->sin_len = sizeof *sin; 1099 sin->sin_port = inp->inp_fport; 1100 sin->sin_addr = inp->inp_faddr; 1101 1102 *nam = (struct sockaddr *)sin; 1103 return (0); 1104 } 1105 1106 void 1107 in_setpeeraddr_dispatch(netmsg_t msg) 1108 { 1109 int error; 1110 1111 error = in_setpeeraddr(msg->base.nm_so, msg->peeraddr.nm_nam); 1112 lwkt_replymsg(&msg->lmsg, error); 1113 } 1114 1115 void 1116 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int err, 1117 inp_notify_t notify) 1118 { 1119 struct inpcb *inp, *marker; 1120 1121 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1122 ("not in the correct netisr")); 1123 marker = in_pcbmarker(); 1124 1125 /* 1126 * NOTE: 1127 * - If INP_PLACEMARKER is set we must ignore the rest of the 1128 * structure and skip it. 1129 * - It is safe to nuke inpcbs here, since we are in their own 1130 * netisr. 1131 */ 1132 GET_PCBINFO_TOKEN(pcbinfo); 1133 1134 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 1135 while ((inp = LIST_NEXT(marker, inp_list)) != NULL) { 1136 LIST_REMOVE(marker, inp_list); 1137 LIST_INSERT_AFTER(inp, marker, inp_list); 1138 1139 if (inp->inp_flags & INP_PLACEMARKER) 1140 continue; 1141 #ifdef INET6 1142 if (!INP_ISIPV4(inp)) 1143 continue; 1144 #endif 1145 if (inp->inp_faddr.s_addr != faddr.s_addr || 1146 inp->inp_socket == NULL) 1147 continue; 1148 (*notify)(inp, err); /* can remove inp from list! */ 1149 } 1150 LIST_REMOVE(marker, inp_list); 1151 1152 REL_PCBINFO_TOKEN(pcbinfo); 1153 } 1154 1155 void 1156 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 1157 { 1158 struct inpcb *inp, *marker; 1159 1160 /* 1161 * We only need to make sure that we are in netisr0, where all 1162 * multicast operation happen. We could check inpcbinfo which 1163 * does not belong to netisr0 by holding the inpcbinfo's token. 1164 * In this case, the pcbinfo must be able to be shared, i.e. 1165 * pcbinfo->infotoken is not NULL. 1166 */ 1167 ASSERT_NETISR0; 1168 KASSERT(pcbinfo->cpu == 0 || pcbinfo->infotoken != NULL, 1169 ("pcbinfo could not be shared")); 1170 1171 /* 1172 * Get a marker for the current netisr (netisr0). 1173 * 1174 * It is possible that the multicast address deletion blocks, 1175 * which could cause temporary token releasing. So we use 1176 * inpcb marker here to get a coherent view of the inpcb list. 1177 * 1178 * While, on the other hand, moptions are only added and deleted 1179 * in netisr0, so we would not see staled moption or miss moption 1180 * even if the token was released due to the blocking multicast 1181 * address deletion. 1182 */ 1183 marker = in_pcbmarker(); 1184 1185 GET_PCBINFO_TOKEN(pcbinfo); 1186 1187 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 1188 while ((inp = LIST_NEXT(marker, inp_list)) != NULL) { 1189 struct ip_moptions *imo; 1190 1191 LIST_REMOVE(marker, inp_list); 1192 LIST_INSERT_AFTER(inp, marker, inp_list); 1193 1194 if (inp->inp_flags & INP_PLACEMARKER) 1195 continue; 1196 imo = inp->inp_moptions; 1197 if (INP_ISIPV4(inp) && imo != NULL) { 1198 int i, gap; 1199 1200 /* 1201 * Unselect the outgoing interface if it is being 1202 * detached. 1203 */ 1204 if (imo->imo_multicast_ifp == ifp) 1205 imo->imo_multicast_ifp = NULL; 1206 1207 /* 1208 * Drop multicast group membership if we joined 1209 * through the interface being detached. 1210 */ 1211 for (i = 0, gap = 0; i < imo->imo_num_memberships; 1212 i++) { 1213 if (imo->imo_membership[i]->inm_ifp == ifp) { 1214 /* 1215 * NOTE: 1216 * This could block and the pcbinfo 1217 * token could be passively released. 1218 */ 1219 in_delmulti(imo->imo_membership[i]); 1220 gap++; 1221 } else if (gap != 0) 1222 imo->imo_membership[i - gap] = 1223 imo->imo_membership[i]; 1224 } 1225 imo->imo_num_memberships -= gap; 1226 } 1227 } 1228 LIST_REMOVE(marker, inp_list); 1229 1230 REL_PCBINFO_TOKEN(pcbinfo); 1231 } 1232 1233 /* 1234 * Check for alternatives when higher level complains 1235 * about service problems. For now, invalidate cached 1236 * routing information. If the route was created dynamically 1237 * (by a redirect), time to try a default gateway again. 1238 */ 1239 void 1240 in_losing(struct inpcb *inp) 1241 { 1242 struct rtentry *rt; 1243 struct rt_addrinfo rtinfo; 1244 1245 if ((rt = inp->inp_route.ro_rt)) { 1246 bzero(&rtinfo, sizeof(struct rt_addrinfo)); 1247 rtinfo.rti_info[RTAX_DST] = rt_key(rt); 1248 rtinfo.rti_info[RTAX_GATEWAY] = rt->rt_gateway; 1249 rtinfo.rti_info[RTAX_NETMASK] = rt_mask(rt); 1250 rtinfo.rti_flags = rt->rt_flags; 1251 rt_missmsg(RTM_LOSING, &rtinfo, rt->rt_flags, 0); 1252 if (rt->rt_flags & RTF_DYNAMIC) { 1253 rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, 1254 rt_mask(rt), rt->rt_flags, NULL); 1255 } 1256 inp->inp_route.ro_rt = NULL; 1257 rtfree(rt); 1258 /* 1259 * A new route can be allocated 1260 * the next time output is attempted. 1261 */ 1262 } 1263 } 1264 1265 /* 1266 * After a routing change, flush old routing 1267 * and allocate a (hopefully) better one. 1268 */ 1269 void 1270 in_rtchange(struct inpcb *inp, int err) 1271 { 1272 if (inp->inp_route.ro_rt) { 1273 rtfree(inp->inp_route.ro_rt); 1274 inp->inp_route.ro_rt = NULL; 1275 /* 1276 * A new route can be allocated the next time 1277 * output is attempted. 1278 */ 1279 } 1280 } 1281 1282 /* 1283 * Lookup a PCB based on the local address and port. 1284 */ 1285 static struct inpcb * 1286 in_pcblookup_local(struct inpcbporthead *porthash, struct in_addr laddr, 1287 u_int lport_arg, int wild_okay, struct ucred *cred) 1288 { 1289 struct inpcb *inp; 1290 int matchwild = 3, wildcard; 1291 u_short lport = lport_arg; 1292 struct inpcbport *phd; 1293 struct inpcb *match = NULL; 1294 1295 /* 1296 * If the porthashbase is shared across several cpus, it must 1297 * have been locked. 1298 */ 1299 ASSERT_PORTHASH_TOKEN_HELD(porthash); 1300 1301 /* 1302 * Best fit PCB lookup. 1303 * 1304 * First see if this local port is in use by looking on the 1305 * port hash list. 1306 */ 1307 LIST_FOREACH(phd, porthash, phd_hash) { 1308 if (phd->phd_port == lport) 1309 break; 1310 } 1311 if (phd != NULL) { 1312 /* 1313 * Port is in use by one or more PCBs. Look for best 1314 * fit. 1315 */ 1316 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 1317 wildcard = 0; 1318 #ifdef INET6 1319 if (!INP_ISIPV4(inp)) 1320 continue; 1321 #endif 1322 if (inp->inp_faddr.s_addr != INADDR_ANY) 1323 wildcard++; 1324 if (inp->inp_laddr.s_addr != INADDR_ANY) { 1325 if (laddr.s_addr == INADDR_ANY) 1326 wildcard++; 1327 else if (inp->inp_laddr.s_addr != laddr.s_addr) 1328 continue; 1329 } else { 1330 if (laddr.s_addr != INADDR_ANY) 1331 wildcard++; 1332 } 1333 if (wildcard && !wild_okay) 1334 continue; 1335 if (wildcard < matchwild && 1336 (cred == NULL || 1337 cred->cr_prison == 1338 inp->inp_socket->so_cred->cr_prison)) { 1339 match = inp; 1340 matchwild = wildcard; 1341 if (matchwild == 0) { 1342 break; 1343 } 1344 } 1345 } 1346 } 1347 return (match); 1348 } 1349 1350 struct inpcb * 1351 in_pcblocalgroup_last(const struct inpcbinfo *pcbinfo, 1352 const struct inpcb *inp) 1353 { 1354 const struct inp_localgrphead *hdr; 1355 const struct inp_localgroup *grp; 1356 int i; 1357 1358 if (pcbinfo->localgrphashbase == NULL) 1359 return NULL; 1360 1361 GET_PCBINFO_TOKEN(pcbinfo); 1362 1363 hdr = &pcbinfo->localgrphashbase[ 1364 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 1365 1366 LIST_FOREACH(grp, hdr, il_list) { 1367 if (grp->il_af == inp->inp_af && 1368 grp->il_lport == inp->inp_lport && 1369 memcmp(&grp->il_dependladdr, 1370 &inp->inp_inc.inc_ie.ie_dependladdr, 1371 sizeof(grp->il_dependladdr)) == 0) { 1372 break; 1373 } 1374 } 1375 if (grp == NULL || grp->il_inpcnt == 1) { 1376 REL_PCBINFO_TOKEN(pcbinfo); 1377 return NULL; 1378 } 1379 1380 KASSERT(grp->il_inpcnt >= 2, 1381 ("invalid localgroup inp count %d", grp->il_inpcnt)); 1382 for (i = 0; i < grp->il_inpcnt; ++i) { 1383 if (grp->il_inp[i] == inp) { 1384 int last = grp->il_inpcnt - 1; 1385 1386 if (i == last) 1387 last = grp->il_inpcnt - 2; 1388 REL_PCBINFO_TOKEN(pcbinfo); 1389 return grp->il_inp[last]; 1390 } 1391 } 1392 REL_PCBINFO_TOKEN(pcbinfo); 1393 return NULL; 1394 } 1395 1396 static struct inpcb * 1397 inp_localgroup_lookup(const struct inpcbinfo *pcbinfo, 1398 struct in_addr laddr, uint16_t lport, uint32_t pkt_hash) 1399 { 1400 struct inpcb *local_wild = NULL; 1401 const struct inp_localgrphead *hdr; 1402 const struct inp_localgroup *grp; 1403 1404 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 1405 1406 hdr = &pcbinfo->localgrphashbase[ 1407 INP_PCBLOCALGRPHASH(lport, pcbinfo->localgrphashmask)]; 1408 1409 /* 1410 * Order of socket selection: 1411 * 1. non-wild. 1412 * 2. wild. 1413 * 1414 * NOTE: Local group does not contain jailed sockets 1415 */ 1416 LIST_FOREACH(grp, hdr, il_list) { 1417 #ifdef INET6 1418 if (grp->il_af != AF_INET) 1419 continue; 1420 #endif 1421 if (grp->il_lport == lport) { 1422 int idx; 1423 1424 /* 1425 * Modulo-N is used here, which greatly reduces 1426 * completion queue token contention, thus more 1427 * cpu time is saved. 1428 */ 1429 idx = netisr_hashlsb(pkt_hash) % grp->il_inpcnt; 1430 if (grp->il_laddr.s_addr == laddr.s_addr) 1431 return grp->il_inp[idx]; 1432 else if (grp->il_laddr.s_addr == INADDR_ANY) 1433 local_wild = grp->il_inp[idx]; 1434 } 1435 } 1436 if (local_wild != NULL) 1437 return local_wild; 1438 return NULL; 1439 } 1440 1441 /* 1442 * Lookup PCB in hash list. 1443 */ 1444 struct inpcb * 1445 in_pcblookup_pkthash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 1446 u_int fport_arg, struct in_addr laddr, u_int lport_arg, 1447 boolean_t wildcard, struct ifnet *ifp, const struct mbuf *m) 1448 { 1449 struct inpcbhead *head; 1450 struct inpcb *inp, *jinp=NULL; 1451 u_short fport = fport_arg, lport = lport_arg; 1452 1453 /* 1454 * First look for an exact match. 1455 */ 1456 head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport, 1457 laddr.s_addr, lport, pcbinfo->hashmask)]; 1458 LIST_FOREACH(inp, head, inp_hash) { 1459 #ifdef INET6 1460 if (!INP_ISIPV4(inp)) 1461 continue; 1462 #endif 1463 if (in_hosteq(inp->inp_faddr, faddr) && 1464 in_hosteq(inp->inp_laddr, laddr) && 1465 inp->inp_fport == fport && inp->inp_lport == lport) { 1466 /* found */ 1467 if (inp->inp_socket == NULL || 1468 inp->inp_socket->so_cred->cr_prison == NULL) { 1469 return (inp); 1470 } else { 1471 if (jinp == NULL) 1472 jinp = inp; 1473 } 1474 } 1475 } 1476 if (jinp != NULL) 1477 return (jinp); 1478 1479 if (wildcard) { 1480 struct inpcb *local_wild = NULL; 1481 struct inpcb *jinp_wild = NULL; 1482 struct inpcontainer *ic; 1483 struct inpcontainerhead *chead; 1484 struct sockaddr_in jsin; 1485 struct ucred *cred; 1486 1487 GET_PCBINFO_TOKEN(pcbinfo); 1488 1489 /* 1490 * Check local group first 1491 */ 1492 if (pcbinfo->localgrphashbase != NULL && 1493 m != NULL && (m->m_flags & M_HASH)) { 1494 inp = inp_localgroup_lookup(pcbinfo, 1495 laddr, lport, m->m_pkthdr.hash); 1496 if (inp != NULL) { 1497 REL_PCBINFO_TOKEN(pcbinfo); 1498 return inp; 1499 } 1500 } 1501 1502 /* 1503 * Order of socket selection: 1504 * 1. non-jailed, non-wild. 1505 * 2. non-jailed, wild. 1506 * 3. jailed, non-wild. 1507 * 4. jailed, wild. 1508 */ 1509 jsin.sin_family = AF_INET; 1510 chead = &pcbinfo->wildcardhashbase[ 1511 INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)]; 1512 LIST_FOREACH(ic, chead, ic_list) { 1513 inp = ic->ic_inp; 1514 if (inp->inp_flags & INP_PLACEMARKER) 1515 continue; 1516 1517 jsin.sin_addr.s_addr = laddr.s_addr; 1518 #ifdef INET6 1519 if (!INP_ISIPV4(inp)) 1520 continue; 1521 #endif 1522 if (inp->inp_socket != NULL) 1523 cred = inp->inp_socket->so_cred; 1524 else 1525 cred = NULL; 1526 if (cred != NULL && jailed(cred)) { 1527 if (jinp != NULL) 1528 continue; 1529 else 1530 if (!jailed_ip(cred->cr_prison, 1531 (struct sockaddr *)&jsin)) 1532 continue; 1533 } 1534 if (inp->inp_lport == lport) { 1535 if (inp->inp_laddr.s_addr == laddr.s_addr) { 1536 if (cred != NULL && jailed(cred)) { 1537 jinp = inp; 1538 } else { 1539 REL_PCBINFO_TOKEN(pcbinfo); 1540 return (inp); 1541 } 1542 } 1543 if (inp->inp_laddr.s_addr == INADDR_ANY) { 1544 if (cred != NULL && jailed(cred)) 1545 jinp_wild = inp; 1546 else 1547 local_wild = inp; 1548 } 1549 } 1550 } 1551 1552 REL_PCBINFO_TOKEN(pcbinfo); 1553 1554 if (local_wild != NULL) 1555 return (local_wild); 1556 if (jinp != NULL) 1557 return (jinp); 1558 return (jinp_wild); 1559 } 1560 1561 /* 1562 * Not found. 1563 */ 1564 return (NULL); 1565 } 1566 1567 struct inpcb * 1568 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 1569 u_int fport_arg, struct in_addr laddr, u_int lport_arg, 1570 boolean_t wildcard, struct ifnet *ifp) 1571 { 1572 return in_pcblookup_pkthash(pcbinfo, faddr, fport_arg, 1573 laddr, lport_arg, wildcard, ifp, NULL); 1574 } 1575 1576 /* 1577 * Insert PCB into connection hash table. 1578 */ 1579 void 1580 in_pcbinsconnhash(struct inpcb *inp) 1581 { 1582 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1583 struct inpcbhead *bucket; 1584 u_int32_t hashkey_faddr, hashkey_laddr; 1585 1586 #ifdef INET6 1587 if (INP_ISIPV6(inp)) { 1588 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */; 1589 hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */; 1590 } else { 1591 #endif 1592 hashkey_faddr = inp->inp_faddr.s_addr; 1593 hashkey_laddr = inp->inp_laddr.s_addr; 1594 #ifdef INET6 1595 } 1596 #endif 1597 1598 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1599 ("not in the correct netisr")); 1600 ASSERT_INP_NOTINHASH(inp); 1601 inp->inp_flags |= INP_CONNECTED; 1602 1603 /* 1604 * Insert into the connection hash table. 1605 */ 1606 bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr, 1607 inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)]; 1608 LIST_INSERT_HEAD(bucket, inp, inp_hash); 1609 } 1610 1611 /* 1612 * Remove PCB from connection hash table. 1613 */ 1614 void 1615 in_pcbremconnhash(struct inpcb *inp) 1616 { 1617 struct inpcbinfo *pcbinfo __debugvar = inp->inp_pcbinfo; 1618 1619 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1620 ("not in the correct netisr")); 1621 KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected")); 1622 1623 LIST_REMOVE(inp, inp_hash); 1624 inp->inp_flags &= ~INP_CONNECTED; 1625 } 1626 1627 /* 1628 * Insert PCB into port hash table. 1629 */ 1630 void 1631 in_pcbinsporthash(struct inpcbporthead *pcbporthash, struct inpcb *inp) 1632 { 1633 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1634 struct inpcbport *phd; 1635 1636 /* 1637 * If the porthashbase is shared across several cpus, it must 1638 * have been locked. 1639 */ 1640 ASSERT_PORTHASH_TOKEN_HELD(pcbporthash); 1641 1642 /* 1643 * Insert into the port hash table. 1644 */ 1645 1646 /* Go through port list and look for a head for this lport. */ 1647 LIST_FOREACH(phd, pcbporthash, phd_hash) { 1648 if (phd->phd_port == inp->inp_lport) 1649 break; 1650 } 1651 1652 /* If none exists, use saved one and tack it on. */ 1653 if (phd == NULL) { 1654 KKASSERT(pcbinfo->portsave != NULL); 1655 phd = pcbinfo->portsave; 1656 pcbinfo->portsave = NULL; 1657 phd->phd_port = inp->inp_lport; 1658 LIST_INIT(&phd->phd_pcblist); 1659 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 1660 } 1661 1662 inp->inp_porthash = pcbporthash; 1663 inp->inp_phd = phd; 1664 LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 1665 1666 /* 1667 * Malloc one inpcbport for later use. It is safe to use 1668 * "wait" malloc here (port token would be released, if 1669 * malloc ever blocked), since all changes to the porthash 1670 * are done. 1671 */ 1672 if (pcbinfo->portsave == NULL) { 1673 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), 1674 M_PCB, M_INTWAIT | M_ZERO); 1675 } 1676 } 1677 1678 void 1679 in_pcbinsporthash_lport(struct inpcb *inp) 1680 { 1681 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1682 struct inpcbportinfo *portinfo; 1683 struct inpcbporthead *porthash; 1684 u_short lport_ho; 1685 1686 /* Locate the proper portinfo based on lport */ 1687 lport_ho = ntohs(inp->inp_lport); 1688 portinfo = &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt]; 1689 KKASSERT((lport_ho % pcbinfo->portinfo_cnt) == portinfo->offset); 1690 1691 porthash = in_pcbporthash_head(portinfo, inp->inp_lport); 1692 GET_PORTHASH_TOKEN(porthash); 1693 in_pcbinsporthash(porthash, inp); 1694 REL_PORTHASH_TOKEN(porthash); 1695 } 1696 1697 void 1698 in_pcbremporthash(struct inpcb *inp) 1699 { 1700 struct inpcbporthead *porthash; 1701 struct inpcbport *phd; 1702 1703 if (inp->inp_phd == NULL) 1704 return; 1705 KASSERT(inp->inp_lport != 0, ("inpcb has no lport")); 1706 1707 porthash = inp->inp_porthash; 1708 KASSERT(porthash != NULL, ("no porthash")); 1709 1710 GET_PORTHASH_TOKEN(porthash); 1711 1712 phd = inp->inp_phd; 1713 LIST_REMOVE(inp, inp_portlist); 1714 if (LIST_FIRST(&phd->phd_pcblist) == NULL) { 1715 LIST_REMOVE(phd, phd_hash); 1716 kfree(phd, M_PCB); 1717 } 1718 1719 REL_PORTHASH_TOKEN(porthash); 1720 1721 inp->inp_phd = NULL; 1722 /* NOTE: Don't whack inp_lport, which may be used later */ 1723 } 1724 1725 static struct inp_localgroup * 1726 inp_localgroup_alloc(u_char af, uint16_t port, 1727 const union in_dependaddr *addr, int size) 1728 { 1729 struct inp_localgroup *grp; 1730 1731 grp = kmalloc(__offsetof(struct inp_localgroup, il_inp[size]), 1732 M_TEMP, M_INTWAIT | M_ZERO); 1733 grp->il_af = af; 1734 grp->il_lport = port; 1735 grp->il_dependladdr = *addr; 1736 grp->il_inpsiz = size; 1737 1738 return grp; 1739 } 1740 1741 static void 1742 inp_localgroup_free(struct inp_localgroup *grp) 1743 { 1744 kfree(grp, M_TEMP); 1745 } 1746 1747 static void 1748 inp_localgroup_destroy(struct inp_localgroup *grp) 1749 { 1750 LIST_REMOVE(grp, il_list); 1751 inp_localgroup_free(grp); 1752 } 1753 1754 static void 1755 inp_localgroup_copy(struct inp_localgroup *grp, 1756 const struct inp_localgroup *old_grp) 1757 { 1758 int i; 1759 1760 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 1761 ("invalid new local group size %d and old local group count %d", 1762 grp->il_inpsiz, old_grp->il_inpcnt)); 1763 for (i = 0; i < old_grp->il_inpcnt; ++i) 1764 grp->il_inp[i] = old_grp->il_inp[i]; 1765 grp->il_inpcnt = old_grp->il_inpcnt; 1766 } 1767 1768 static void 1769 in_pcbinslocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 1770 { 1771 struct inp_localgrphead *hdr; 1772 struct inp_localgroup *grp, *grp_alloc = NULL; 1773 struct ucred *cred; 1774 int i, idx; 1775 1776 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 1777 1778 if (pcbinfo->localgrphashbase == NULL) 1779 return; 1780 1781 /* 1782 * XXX don't allow jailed socket to join local group 1783 */ 1784 if (inp->inp_socket != NULL) 1785 cred = inp->inp_socket->so_cred; 1786 else 1787 cred = NULL; 1788 if (cred != NULL && jailed(cred)) 1789 return; 1790 1791 hdr = &pcbinfo->localgrphashbase[ 1792 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 1793 1794 again: 1795 LIST_FOREACH(grp, hdr, il_list) { 1796 if (grp->il_af == inp->inp_af && 1797 grp->il_lport == inp->inp_lport && 1798 memcmp(&grp->il_dependladdr, 1799 &inp->inp_inc.inc_ie.ie_dependladdr, 1800 sizeof(grp->il_dependladdr)) == 0) { 1801 break; 1802 } 1803 } 1804 if (grp == NULL) { 1805 /* 1806 * Create a new local group 1807 */ 1808 if (grp_alloc == NULL) { 1809 grp_alloc = inp_localgroup_alloc(inp->inp_af, 1810 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 1811 INP_LOCALGROUP_SIZMIN); 1812 /* 1813 * Local group allocation could block and the 1814 * local group w/ the same property might have 1815 * been added by others when we were blocked; 1816 * check again. 1817 */ 1818 goto again; 1819 } else { 1820 /* Local group has been allocated; link it */ 1821 grp = grp_alloc; 1822 grp_alloc = NULL; 1823 LIST_INSERT_HEAD(hdr, grp, il_list); 1824 } 1825 } else if (grp->il_inpcnt == grp->il_inpsiz) { 1826 if (grp->il_inpsiz >= INP_LOCALGROUP_SIZMAX) { 1827 static int limit_logged = 0; 1828 1829 if (!limit_logged) { 1830 limit_logged = 1; 1831 kprintf("local group port %d, " 1832 "limit reached\n", ntohs(grp->il_lport)); 1833 } 1834 if (grp_alloc != NULL) { 1835 /* 1836 * This would happen if the local group 1837 * w/ the same property was expanded when 1838 * our local group allocation blocked. 1839 */ 1840 inp_localgroup_free(grp_alloc); 1841 } 1842 return; 1843 } 1844 1845 /* 1846 * Expand this local group 1847 */ 1848 if (grp_alloc == NULL || 1849 grp->il_inpcnt >= grp_alloc->il_inpsiz) { 1850 if (grp_alloc != NULL) 1851 inp_localgroup_free(grp_alloc); 1852 grp_alloc = inp_localgroup_alloc(grp->il_af, 1853 grp->il_lport, &grp->il_dependladdr, 1854 grp->il_inpsiz * 2); 1855 /* 1856 * Local group allocation could block and the 1857 * local group w/ the same property might have 1858 * been expanded by others when we were blocked; 1859 * check again. 1860 */ 1861 goto again; 1862 } 1863 1864 /* 1865 * Save the old local group, link the new one, and then 1866 * destroy the old local group 1867 */ 1868 inp_localgroup_copy(grp_alloc, grp); 1869 LIST_INSERT_HEAD(hdr, grp_alloc, il_list); 1870 inp_localgroup_destroy(grp); 1871 1872 grp = grp_alloc; 1873 grp_alloc = NULL; 1874 } else { 1875 /* 1876 * Found the local group 1877 */ 1878 if (grp_alloc != NULL) { 1879 /* 1880 * This would happen if the local group w/ the 1881 * same property was added or expanded when our 1882 * local group allocation blocked. 1883 */ 1884 inp_localgroup_free(grp_alloc); 1885 grp_alloc = NULL; 1886 } 1887 } 1888 1889 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 1890 ("invalid local group size %d and count %d", 1891 grp->il_inpsiz, grp->il_inpcnt)); 1892 1893 /* 1894 * Keep the local group sorted by the inpcb local group index 1895 * in ascending order. 1896 * 1897 * This eases the multi-process userland application which uses 1898 * SO_REUSEPORT sockets and binds process to the owner cpu of 1899 * the SO_REUSEPORT socket: 1900 * If we didn't sort the local group by the inpcb local group 1901 * index and one of the process owning an inpcb in this local 1902 * group restarted, e.g. crashed and restarted by watchdog, 1903 * other processes owning a inpcb in this local group would have 1904 * to detect that event, refetch its socket's owner cpu, and 1905 * re-bind. 1906 */ 1907 idx = grp->il_inpcnt; 1908 for (i = 0; i < idx; ++i) { 1909 struct inpcb *oinp = grp->il_inp[i]; 1910 1911 if (oinp->inp_lgrpindex > i) { 1912 if (inp->inp_lgrpindex < 0) { 1913 inp->inp_lgrpindex = i; 1914 } else if (inp->inp_lgrpindex != i) { 1915 if (bootverbose) { 1916 kprintf("inp %p: grpidx %d, " 1917 "assigned to %d, cpu%d\n", 1918 inp, inp->inp_lgrpindex, i, 1919 mycpuid); 1920 } 1921 } 1922 grp->il_inp[i] = inp; 1923 1924 /* Pull down inpcbs */ 1925 for (; i < grp->il_inpcnt; ++i) { 1926 struct inpcb *oinp1 = grp->il_inp[i + 1]; 1927 1928 grp->il_inp[i + 1] = oinp; 1929 oinp = oinp1; 1930 } 1931 grp->il_inpcnt++; 1932 return; 1933 } 1934 } 1935 1936 if (inp->inp_lgrpindex < 0) { 1937 inp->inp_lgrpindex = idx; 1938 } else if (inp->inp_lgrpindex != idx) { 1939 if (bootverbose) { 1940 kprintf("inp %p: grpidx %d, assigned to %d, cpu%d\n", 1941 inp, inp->inp_lgrpindex, idx, mycpuid); 1942 } 1943 } 1944 grp->il_inp[idx] = inp; 1945 grp->il_inpcnt++; 1946 } 1947 1948 void 1949 in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 1950 { 1951 struct inpcontainer *ic; 1952 struct inpcontainerhead *bucket; 1953 1954 GET_PCBINFO_TOKEN(pcbinfo); 1955 1956 in_pcbinslocalgrphash_oncpu(inp, pcbinfo); 1957 1958 bucket = &pcbinfo->wildcardhashbase[ 1959 INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)]; 1960 1961 ic = kmalloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT); 1962 ic->ic_inp = inp; 1963 LIST_INSERT_HEAD(bucket, ic, ic_list); 1964 1965 REL_PCBINFO_TOKEN(pcbinfo); 1966 } 1967 1968 /* 1969 * Insert PCB into wildcard hash table. 1970 */ 1971 void 1972 in_pcbinswildcardhash(struct inpcb *inp) 1973 { 1974 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1975 1976 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1977 ("not in correct netisr")); 1978 ASSERT_INP_NOTINHASH(inp); 1979 inp->inp_flags |= INP_WILDCARD; 1980 1981 in_pcbinswildcardhash_oncpu(inp, pcbinfo); 1982 } 1983 1984 static void 1985 in_pcbremlocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 1986 { 1987 struct inp_localgrphead *hdr; 1988 struct inp_localgroup *grp; 1989 1990 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 1991 1992 if (pcbinfo->localgrphashbase == NULL) 1993 return; 1994 1995 hdr = &pcbinfo->localgrphashbase[ 1996 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 1997 1998 LIST_FOREACH(grp, hdr, il_list) { 1999 int i; 2000 2001 for (i = 0; i < grp->il_inpcnt; ++i) { 2002 if (grp->il_inp[i] != inp) 2003 continue; 2004 2005 if (grp->il_inpcnt == 1) { 2006 /* Destroy this local group */ 2007 inp_localgroup_destroy(grp); 2008 } else { 2009 /* Pull up inpcbs */ 2010 for (; i + 1 < grp->il_inpcnt; ++i) 2011 grp->il_inp[i] = grp->il_inp[i + 1]; 2012 grp->il_inpcnt--; 2013 } 2014 return; 2015 } 2016 } 2017 } 2018 2019 void 2020 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 2021 { 2022 struct inpcontainer *ic; 2023 struct inpcontainerhead *head; 2024 2025 GET_PCBINFO_TOKEN(pcbinfo); 2026 2027 in_pcbremlocalgrphash_oncpu(inp, pcbinfo); 2028 2029 /* find bucket */ 2030 head = &pcbinfo->wildcardhashbase[ 2031 INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)]; 2032 2033 LIST_FOREACH(ic, head, ic_list) { 2034 if (ic->ic_inp == inp) 2035 goto found; 2036 } 2037 REL_PCBINFO_TOKEN(pcbinfo); 2038 return; /* not found! */ 2039 2040 found: 2041 LIST_REMOVE(ic, ic_list); /* remove container from bucket chain */ 2042 REL_PCBINFO_TOKEN(pcbinfo); 2043 kfree(ic, M_TEMP); /* deallocate container */ 2044 } 2045 2046 /* 2047 * Remove PCB from wildcard hash table. 2048 */ 2049 void 2050 in_pcbremwildcardhash(struct inpcb *inp) 2051 { 2052 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2053 2054 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 2055 ("not in correct netisr")); 2056 KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard")); 2057 2058 in_pcbremwildcardhash_oncpu(inp, pcbinfo); 2059 inp->inp_lgrpindex = -1; 2060 inp->inp_flags &= ~INP_WILDCARD; 2061 } 2062 2063 /* 2064 * Remove PCB from various lists. 2065 */ 2066 void 2067 in_pcbremlists(struct inpcb *inp) 2068 { 2069 in_pcbremporthash(inp); 2070 if (inp->inp_flags & INP_WILDCARD) { 2071 in_pcbremwildcardhash(inp); 2072 } else if (inp->inp_flags & INP_CONNECTED) { 2073 in_pcbremconnhash(inp); 2074 } 2075 2076 if (inp->inp_flags & INP_ONLIST) 2077 in_pcbofflist(inp); 2078 } 2079 2080 int 2081 prison_xinpcb(struct thread *td, struct inpcb *inp) 2082 { 2083 struct ucred *cr; 2084 2085 if (td->td_proc == NULL) 2086 return (0); 2087 cr = td->td_proc->p_ucred; 2088 if (cr->cr_prison == NULL) 2089 return (0); 2090 if (inp->inp_socket && inp->inp_socket->so_cred && 2091 inp->inp_socket->so_cred->cr_prison && 2092 cr->cr_prison == inp->inp_socket->so_cred->cr_prison) 2093 return (0); 2094 return (1); 2095 } 2096 2097 int 2098 in_pcblist_range(SYSCTL_HANDLER_ARGS) 2099 { 2100 struct inpcbinfo *pcbinfo_arr = arg1; 2101 int pcbinfo_arrlen = arg2; 2102 struct inpcb *marker; 2103 int cpu, origcpu; 2104 int error, n; 2105 2106 KASSERT(pcbinfo_arrlen <= netisr_ncpus && pcbinfo_arrlen >= 1, 2107 ("invalid pcbinfo count %d", pcbinfo_arrlen)); 2108 2109 /* 2110 * The process of preparing the TCB list is too time-consuming and 2111 * resource-intensive to repeat twice on every request. 2112 */ 2113 n = 0; 2114 if (req->oldptr == NULL) { 2115 for (cpu = 0; cpu < pcbinfo_arrlen; ++cpu) 2116 n += pcbinfo_arr[cpu].ipi_count; 2117 req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb); 2118 return 0; 2119 } 2120 2121 if (req->newptr != NULL) 2122 return EPERM; 2123 2124 marker = kmalloc(sizeof(struct inpcb), M_TEMP, M_WAITOK|M_ZERO); 2125 marker->inp_flags |= INP_PLACEMARKER; 2126 2127 /* 2128 * OK, now we're committed to doing something. Re-fetch ipi_count 2129 * after obtaining the generation count. 2130 */ 2131 error = 0; 2132 origcpu = mycpuid; 2133 for (cpu = 0; cpu < pcbinfo_arrlen && error == 0; ++cpu) { 2134 struct inpcbinfo *pcbinfo = &pcbinfo_arr[cpu]; 2135 struct inpcb *inp; 2136 struct xinpcb xi; 2137 int i; 2138 2139 lwkt_migratecpu(cpu); 2140 2141 GET_PCBINFO_TOKEN(pcbinfo); 2142 2143 n = pcbinfo->ipi_count; 2144 2145 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 2146 i = 0; 2147 while ((inp = LIST_NEXT(marker, inp_list)) != NULL && i < n) { 2148 LIST_REMOVE(marker, inp_list); 2149 LIST_INSERT_AFTER(inp, marker, inp_list); 2150 2151 if (inp->inp_flags & INP_PLACEMARKER) 2152 continue; 2153 if (prison_xinpcb(req->td, inp)) 2154 continue; 2155 2156 bzero(&xi, sizeof xi); 2157 xi.xi_len = sizeof xi; 2158 bcopy(inp, &xi.xi_inp, sizeof *inp); 2159 if (inp->inp_socket) 2160 sotoxsocket(inp->inp_socket, &xi.xi_socket); 2161 if ((error = SYSCTL_OUT(req, &xi, sizeof xi)) != 0) 2162 break; 2163 ++i; 2164 } 2165 LIST_REMOVE(marker, inp_list); 2166 2167 REL_PCBINFO_TOKEN(pcbinfo); 2168 2169 if (error == 0 && i < n) { 2170 bzero(&xi, sizeof xi); 2171 xi.xi_len = sizeof xi; 2172 while (i < n) { 2173 error = SYSCTL_OUT(req, &xi, sizeof xi); 2174 if (error) 2175 break; 2176 ++i; 2177 } 2178 } 2179 } 2180 2181 lwkt_migratecpu(origcpu); 2182 kfree(marker, M_TEMP); 2183 return error; 2184 } 2185 2186 int 2187 in_pcblist_ncpus(SYSCTL_HANDLER_ARGS) 2188 { 2189 2190 return (in_pcblist_range(oidp, arg1, netisr_ncpus, req)); 2191 } 2192 2193 void 2194 in_savefaddr(struct socket *so, const struct sockaddr *faddr) 2195 { 2196 struct sockaddr_in *sin; 2197 2198 KASSERT(faddr->sa_family == AF_INET, 2199 ("not AF_INET faddr %d", faddr->sa_family)); 2200 2201 sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO); 2202 sin->sin_family = AF_INET; 2203 sin->sin_len = sizeof(*sin); 2204 sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port; 2205 sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr; 2206 2207 so->so_faddr = (struct sockaddr *)sin; 2208 } 2209 2210 void 2211 in_pcbportinfo_init(struct inpcbportinfo *portinfo, int hashsize, 2212 u_short offset) 2213 { 2214 memset(portinfo, 0, sizeof(*portinfo)); 2215 2216 portinfo->offset = offset; 2217 portinfo->porthashbase = phashinit(hashsize, M_PCB, 2218 &portinfo->porthashcnt); 2219 } 2220 2221 void 2222 in_pcbportrange(u_short *hi0, u_short *lo0, u_short ofs, u_short step) 2223 { 2224 int hi, lo; 2225 2226 if (step == 1) 2227 return; 2228 2229 hi = *hi0; 2230 lo = *lo0; 2231 2232 hi = rounddown(hi, step); 2233 hi += ofs; 2234 if (hi > (int)*hi0) 2235 hi -= step; 2236 2237 lo = roundup(lo, step); 2238 lo -= (step - ofs); 2239 if (lo < (int)*lo0) 2240 lo += step; 2241 2242 *hi0 = hi; 2243 *lo0 = lo; 2244 } 2245 2246 void 2247 in_pcbglobalinit(void) 2248 { 2249 int cpu; 2250 2251 in_pcbmarkers = kmalloc(netisr_ncpus * sizeof(struct inpcb), M_PCB, 2252 M_WAITOK | M_ZERO); 2253 in_pcbcontainer_markers = 2254 kmalloc(netisr_ncpus * sizeof(struct inpcontainer), M_PCB, 2255 M_WAITOK | M_ZERO); 2256 2257 for (cpu = 0; cpu < netisr_ncpus; ++cpu) { 2258 struct inpcontainer *ic = &in_pcbcontainer_markers[cpu]; 2259 struct inpcb *marker = &in_pcbmarkers[cpu]; 2260 2261 marker->inp_flags |= INP_PLACEMARKER; 2262 ic->ic_inp = marker; 2263 } 2264 } 2265 2266 struct inpcb * 2267 in_pcbmarker(void) 2268 { 2269 2270 ASSERT_NETISR_NCPUS(mycpuid); 2271 return &in_pcbmarkers[mycpuid]; 2272 } 2273 2274 struct inpcontainer * 2275 in_pcbcontainer_marker(void) 2276 { 2277 2278 ASSERT_NETISR_NCPUS(mycpuid); 2279 return &in_pcbcontainer_markers[mycpuid]; 2280 } 2281 2282 void 2283 in_pcbresetroute(struct inpcb *inp) 2284 { 2285 struct route *ro = &inp->inp_route; 2286 2287 if (ro->ro_rt != NULL) 2288 RTFREE(ro->ro_rt); 2289 bzero(ro, sizeof(*ro)); 2290 } 2291