1 /* 2 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1991, 1993, 1995 36 * The Regents of the University of California. All rights reserved. 
37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. Neither the name of the University nor the names of its contributors 47 * may be used to endorse or promote products derived from this software 48 * without specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 
61 * 62 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 63 * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $ 64 */ 65 66 #include "opt_inet6.h" 67 68 #include <sys/param.h> 69 #include <sys/systm.h> 70 #include <sys/malloc.h> 71 #include <sys/mbuf.h> 72 #include <sys/domain.h> 73 #include <sys/protosw.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/proc.h> 77 #include <sys/priv.h> 78 #include <sys/jail.h> 79 #include <sys/kernel.h> 80 #include <sys/sysctl.h> 81 82 #include <sys/socketvar2.h> 83 #include <sys/msgport2.h> 84 85 #include <machine/limits.h> 86 87 #include <net/if.h> 88 #include <net/if_types.h> 89 #include <net/route.h> 90 #include <net/netisr2.h> 91 #include <net/toeplitz2.h> 92 93 #include <netinet/in.h> 94 #include <netinet/in_pcb.h> 95 #include <netinet/in_var.h> 96 #include <netinet/ip_var.h> 97 #ifdef INET6 98 #include <netinet/ip6.h> 99 #include <netinet6/ip6_var.h> 100 #endif /* INET6 */ 101 102 #define INP_LOCALGROUP_SIZMIN 8 103 #define INP_LOCALGROUP_SIZMAX 256 104 105 static struct inpcb *in_pcblookup_local(struct inpcbporthead *porthash, 106 struct in_addr laddr, u_int lport_arg, int wild_okay, 107 struct ucred *cred); 108 109 struct in_addr zeroin_addr; 110 111 /* 112 * These configure the range of local port addresses assigned to 113 * "unspecified" outgoing connections/packets/whatever. 
114 */ 115 int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ 116 int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ 117 118 int ipport_firstauto = IPPORT_RESERVED; /* 1024 */ 119 int ipport_lastauto = IPPORT_USERRESERVED; /* 5000 */ 120 121 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ 122 int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ 123 124 #define RANGECHK(var, min, max) \ 125 if ((var) < (min)) { (var) = (min); } \ 126 else if ((var) > (max)) { (var) = (max); } 127 128 int udpencap_enable = 1; /* enabled by default */ 129 int udpencap_port = 4500; /* triggers decapsulation */ 130 131 /* 132 * Per-netisr inpcb markers. 133 * NOTE: they should only be used in netisrs. 134 */ 135 static struct inpcb *in_pcbmarkers; 136 static struct inpcontainer *in_pcbcontainer_markers; 137 138 static int 139 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 140 { 141 int error; 142 143 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); 144 if (!error) { 145 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 146 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 147 148 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX); 149 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX); 150 151 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX); 152 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX); 153 } 154 return (error); 155 } 156 157 #undef RANGECHK 158 159 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); 160 161 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, 162 &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); 163 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, 164 &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); 165 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, 166 &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); 167 SYSCTL_PROC(_net_inet_ip_portrange, 
OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, 168 &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); 169 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, 170 &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); 171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, 172 &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); 173 174 /* Initialized by ip_init() */ 175 int ip_porthash_trycount; 176 SYSCTL_INT(_net_inet_ip, OID_AUTO, porthash_trycount, CTLFLAG_RW, 177 &ip_porthash_trycount, 0, 178 "Number of tries to find local port matching hash of 4-tuple"); 179 180 /* 181 * in_pcb.c: manage the Protocol Control Blocks. 182 * 183 * NOTE: It is assumed that most of these functions will be called from 184 * a critical section. XXX - There are, unfortunately, a few exceptions 185 * to this rule that should be fixed. 186 * 187 * NOTE: The caller should initialize the cpu field to the cpu running the 188 * protocol stack associated with this inpcbinfo. 189 */ 190 191 void 192 in_pcbinfo_init(struct inpcbinfo *pcbinfo, int cpu, boolean_t shared) 193 { 194 KASSERT(cpu >= 0 && cpu < netisr_ncpus, ("invalid cpu%d", cpu)); 195 pcbinfo->cpu = cpu; 196 197 LIST_INIT(&pcbinfo->pcblisthead); 198 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB, 199 M_WAITOK | M_ZERO); 200 201 if (shared) { 202 pcbinfo->infotoken = kmalloc(sizeof(struct lwkt_token), 203 M_PCB, M_WAITOK); 204 lwkt_token_init(pcbinfo->infotoken, "infotoken"); 205 } else { 206 pcbinfo->infotoken = NULL; 207 } 208 } 209 210 void 211 in_pcbportinfo_set(struct inpcbinfo *pcbinfo, struct inpcbportinfo *portinfo, 212 int portinfo_cnt) 213 { 214 215 KASSERT(portinfo_cnt > 0, ("invalid portinfo_cnt %d", portinfo_cnt)); 216 pcbinfo->portinfo = portinfo; 217 pcbinfo->portinfo_cnt = portinfo_cnt; 218 } 219 220 struct baddynamicports baddynamicports; 221 222 /* 223 * Check if the specified port is invalid for dynamic allocation. 
224 */ 225 int 226 in_baddynamic(u_int16_t port, u_int16_t proto) 227 { 228 switch (proto) { 229 case IPPROTO_TCP: 230 return (DP_ISSET(baddynamicports.tcp, port)); 231 case IPPROTO_UDP: 232 return (DP_ISSET(baddynamicports.udp, port)); 233 default: 234 return (0); 235 } 236 } 237 238 void 239 in_pcbonlist(struct inpcb *inp) 240 { 241 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 242 243 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 244 ("not in the correct netisr")); 245 KASSERT((inp->inp_flags & INP_ONLIST) == 0, ("already on pcblist")); 246 inp->inp_flags |= INP_ONLIST; 247 248 GET_PCBINFO_TOKEN(pcbinfo); 249 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list); 250 pcbinfo->ipi_count++; 251 REL_PCBINFO_TOKEN(pcbinfo); 252 } 253 254 void 255 in_pcbofflist(struct inpcb *inp) 256 { 257 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 258 259 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 260 ("not in the correct netisr")); 261 KASSERT(inp->inp_flags & INP_ONLIST, ("not on pcblist")); 262 inp->inp_flags &= ~INP_ONLIST; 263 264 GET_PCBINFO_TOKEN(pcbinfo); 265 LIST_REMOVE(inp, inp_list); 266 KASSERT(pcbinfo->ipi_count > 0, 267 ("invalid inpcb count %d", pcbinfo->ipi_count)); 268 pcbinfo->ipi_count--; 269 REL_PCBINFO_TOKEN(pcbinfo); 270 } 271 272 /* 273 * Allocate a PCB and associate it with the socket. 
274 */ 275 int 276 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 277 { 278 struct inpcb *inp; 279 280 inp = kmalloc(pcbinfo->ipi_size, M_PCB, M_WAITOK|M_ZERO|M_NULLOK); 281 if (inp == NULL) 282 return (ENOMEM); 283 inp->inp_lgrpindex = -1; 284 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 285 inp->inp_pcbinfo = pcbinfo; 286 inp->inp_socket = so; 287 #ifdef INET6 288 if (INP_CHECK_SOCKAF(so, AF_INET6)) { 289 if (ip6_auto_flowlabel) 290 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 291 inp->inp_af = AF_INET6; 292 } else 293 #endif 294 inp->inp_af = AF_INET; 295 soreference(so); 296 so->so_pcb = inp; 297 298 in_pcbonlist(inp); 299 return (0); 300 } 301 302 /* 303 * Unlink a pcb with the intention of moving it to another cpu with a 304 * different pcbinfo. While unlinked nothing should attempt to dereference 305 * inp_pcbinfo, NULL it out so we assert if it does. 306 */ 307 void 308 in_pcbunlink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags) 309 { 310 KASSERT(inp->inp_pcbinfo == pcbinfo, ("pcbinfo mismatch")); 311 KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0, 312 ("already linked")); 313 314 in_pcbofflist(inp); 315 inp->inp_pcbinfo = NULL; 316 } 317 318 void 319 in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo) 320 { 321 in_pcbunlink_flags(inp, pcbinfo, INP_WILDCARD); 322 } 323 324 /* 325 * Relink a pcb into a new pcbinfo. 
326 */ 327 void 328 in_pcblink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags) 329 { 330 KASSERT(inp->inp_pcbinfo == NULL, ("has pcbinfo")); 331 KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0, 332 ("already linked")); 333 334 inp->inp_pcbinfo = pcbinfo; 335 in_pcbonlist(inp); 336 } 337 338 void 339 in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo) 340 { 341 return in_pcblink_flags(inp, pcbinfo, INP_WILDCARD); 342 } 343 344 static boolean_t 345 in_pcbporthash_update(struct inpcbportinfo *portinfo, 346 struct inpcb *inp, u_short lport, struct ucred *cred, int wild) 347 { 348 struct inpcbporthead *porthash; 349 350 /* 351 * This has to be atomic. If the porthash is shared across multiple 352 * protocol threads, e.g. tcp and udp, then the token must be held. 353 */ 354 porthash = in_pcbporthash_head(portinfo, lport); 355 GET_PORTHASH_TOKEN(porthash); 356 357 if (in_pcblookup_local(porthash, inp->inp_laddr, lport, 358 wild, cred) != NULL) { 359 REL_PORTHASH_TOKEN(porthash); 360 return FALSE; 361 } 362 inp->inp_lport = lport; 363 in_pcbinsporthash(porthash, inp); 364 365 REL_PORTHASH_TOKEN(porthash); 366 return TRUE; 367 } 368 369 static int 370 in_pcbsetlport(struct inpcb *inp, int wild, struct ucred *cred) 371 { 372 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 373 struct inpcbportinfo *portinfo; 374 u_short first, last, lport, step, first0, last0; 375 int count, error; 376 int portinfo_first, portinfo_idx; 377 uint32_t cut; 378 379 inp->inp_flags |= INP_ANONPORT; 380 381 step = pcbinfo->portinfo_cnt; 382 portinfo_first = mycpuid % pcbinfo->portinfo_cnt; 383 portinfo_idx = portinfo_first; 384 385 if (inp->inp_flags & INP_HIGHPORT) { 386 first0 = ipport_hifirstauto; /* sysctl */ 387 last0 = ipport_hilastauto; 388 } else if (inp->inp_flags & INP_LOWPORT) { 389 if (cred && 390 (error = 391 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) { 392 inp->inp_laddr.s_addr = INADDR_ANY; 393 return error; 394 } 395 first0 = ipport_lowfirstauto; 
/* 1023 */ 396 last0 = ipport_lowlastauto; /* 600 */ 397 } else { 398 first0 = ipport_firstauto; /* sysctl */ 399 last0 = ipport_lastauto; 400 } 401 if (first0 > last0) { 402 lport = last0; 403 last0 = first0; 404 first0 = lport; 405 } 406 KKASSERT(last0 >= first0); 407 408 cut = karc4random(); 409 loop: 410 portinfo = &pcbinfo->portinfo[portinfo_idx]; 411 first = first0; 412 last = last0; 413 414 /* 415 * Simple check to ensure all ports are not used up causing 416 * a deadlock here. 417 */ 418 in_pcbportrange(&last, &first, portinfo->offset, step); 419 lport = last - first; 420 count = lport / step; 421 422 lport = rounddown(cut % lport, step) + first; 423 KKASSERT(lport % step == portinfo->offset); 424 425 for (;;) { 426 if (count-- < 0) { /* completely used? */ 427 error = EADDRNOTAVAIL; 428 break; 429 } 430 431 if (__predict_false(lport < first || lport > last)) { 432 lport = first; 433 KKASSERT(lport % step == portinfo->offset); 434 } 435 436 if (in_pcbporthash_update(portinfo, inp, htons(lport), 437 cred, wild)) { 438 error = 0; 439 break; 440 } 441 442 lport += step; 443 KKASSERT(lport % step == portinfo->offset); 444 } 445 446 if (error) { 447 /* Try next portinfo */ 448 portinfo_idx++; 449 portinfo_idx %= pcbinfo->portinfo_cnt; 450 if (portinfo_idx != portinfo_first) 451 goto loop; 452 inp->inp_laddr.s_addr = INADDR_ANY; 453 } 454 return error; 455 } 456 457 int 458 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td) 459 { 460 struct socket *so = inp->inp_socket; 461 struct sockaddr_in jsin; 462 struct ucred *cred = NULL; 463 int wild = 0; 464 465 if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! 
*/ 466 return (EADDRNOTAVAIL); 467 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 468 return (EINVAL); /* already bound */ 469 470 if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT))) 471 wild = 1; /* neither SO_REUSEADDR nor SO_REUSEPORT is set */ 472 if (td->td_proc) 473 cred = td->td_proc->p_ucred; 474 475 if (nam != NULL) { 476 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 477 struct inpcbinfo *pcbinfo; 478 struct inpcbportinfo *portinfo; 479 struct inpcbporthead *porthash; 480 struct inpcb *t; 481 u_short lport, lport_ho; 482 int reuseport = (so->so_options & SO_REUSEPORT); 483 int error; 484 485 if (nam->sa_len != sizeof *sin) 486 return (EINVAL); 487 #ifdef notdef 488 /* 489 * We should check the family, but old programs 490 * incorrectly fail to initialize it. 491 */ 492 if (sin->sin_family != AF_INET) 493 return (EAFNOSUPPORT); 494 #endif 495 if (!prison_replace_wildcards(td, nam)) 496 return (EINVAL); 497 498 lport = sin->sin_port; 499 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 500 /* 501 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 502 * allow complete duplication of binding if 503 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 504 * and a multicast address is bound on both 505 * new and duplicated sockets. 506 */ 507 if (so->so_options & SO_REUSEADDR) 508 reuseport = SO_REUSEADDR | SO_REUSEPORT; 509 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 510 sin->sin_port = 0; /* yech... 
*/ 511 bzero(&sin->sin_zero, sizeof sin->sin_zero); 512 if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL) 513 return (EADDRNOTAVAIL); 514 } 515 516 inp->inp_laddr = sin->sin_addr; 517 518 jsin.sin_family = AF_INET; 519 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr; 520 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) { 521 inp->inp_laddr.s_addr = INADDR_ANY; 522 return (EINVAL); 523 } 524 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr; 525 526 if (lport == 0) { 527 /* Auto-select local port */ 528 return in_pcbsetlport(inp, wild, cred); 529 } 530 lport_ho = ntohs(lport); 531 532 /* GROSS */ 533 if (lport_ho < IPPORT_RESERVED && cred && 534 (error = 535 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) { 536 inp->inp_laddr.s_addr = INADDR_ANY; 537 return (error); 538 } 539 540 /* 541 * Locate the proper portinfo based on lport 542 */ 543 pcbinfo = inp->inp_pcbinfo; 544 portinfo = 545 &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt]; 546 KKASSERT((lport_ho % pcbinfo->portinfo_cnt) == 547 portinfo->offset); 548 549 /* 550 * This has to be atomic. If the porthash is shared across 551 * multiple protocol threads, e.g. tcp and udp then the token 552 * must be held. 
553 */ 554 porthash = in_pcbporthash_head(portinfo, lport); 555 GET_PORTHASH_TOKEN(porthash); 556 557 if (so->so_cred->cr_uid != 0 && 558 !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 559 t = in_pcblookup_local(porthash, sin->sin_addr, lport, 560 INPLOOKUP_WILDCARD, cred); 561 if (t && 562 (so->so_cred->cr_uid != 563 t->inp_socket->so_cred->cr_uid)) { 564 inp->inp_laddr.s_addr = INADDR_ANY; 565 error = EADDRINUSE; 566 goto done; 567 } 568 } 569 if (cred && !prison_replace_wildcards(td, nam)) { 570 inp->inp_laddr.s_addr = INADDR_ANY; 571 error = EADDRNOTAVAIL; 572 goto done; 573 } 574 t = in_pcblookup_local(porthash, sin->sin_addr, lport, 575 wild, cred); 576 if (t && !(reuseport & t->inp_socket->so_options)) { 577 inp->inp_laddr.s_addr = INADDR_ANY; 578 error = EADDRINUSE; 579 goto done; 580 } 581 inp->inp_lport = lport; 582 in_pcbinsporthash(porthash, inp); 583 error = 0; 584 done: 585 REL_PORTHASH_TOKEN(porthash); 586 return (error); 587 } else { 588 jsin.sin_family = AF_INET; 589 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr; 590 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) { 591 inp->inp_laddr.s_addr = INADDR_ANY; 592 return (EINVAL); 593 } 594 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr; 595 596 return in_pcbsetlport(inp, wild, cred); 597 } 598 } 599 600 static struct inpcb * 601 in_pcblookup_localremote(struct inpcbporthead *porthash, struct in_addr laddr, 602 u_short lport, struct in_addr faddr, u_short fport, struct ucred *cred) 603 { 604 struct inpcb *inp; 605 struct inpcbport *phd; 606 struct inpcb *match = NULL; 607 608 /* 609 * If the porthashbase is shared across several cpus, it must 610 * have been locked. 611 */ 612 ASSERT_PORTHASH_TOKEN_HELD(porthash); 613 614 /* 615 * Best fit PCB lookup. 616 * 617 * First see if this local port is in use by looking on the 618 * port hash list. 
619 */ 620 LIST_FOREACH(phd, porthash, phd_hash) { 621 if (phd->phd_port == lport) 622 break; 623 } 624 if (phd != NULL) { 625 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 626 #ifdef INET6 627 if (!INP_ISIPV4(inp)) 628 continue; 629 #endif 630 if (inp->inp_laddr.s_addr != INADDR_ANY && 631 inp->inp_laddr.s_addr != laddr.s_addr) 632 continue; 633 634 if (inp->inp_faddr.s_addr != INADDR_ANY && 635 inp->inp_faddr.s_addr != faddr.s_addr) 636 continue; 637 638 if (inp->inp_fport != 0 && inp->inp_fport != fport) 639 continue; 640 641 if (cred == NULL || 642 cred->cr_prison == 643 inp->inp_socket->so_cred->cr_prison) { 644 match = inp; 645 break; 646 } 647 } 648 } 649 return (match); 650 } 651 652 static boolean_t 653 in_pcbporthash_update4(struct inpcbportinfo *portinfo, 654 struct inpcb *inp, u_short lport, const struct sockaddr_in *sin, 655 struct ucred *cred) 656 { 657 struct inpcbporthead *porthash; 658 659 /* 660 * This has to be atomic. If the porthash is shared across multiple 661 * protocol threads, e.g. tcp and udp, then the token must be held. 
662 */ 663 porthash = in_pcbporthash_head(portinfo, lport); 664 GET_PORTHASH_TOKEN(porthash); 665 666 if (in_pcblookup_localremote(porthash, inp->inp_laddr, 667 lport, sin->sin_addr, sin->sin_port, cred) != NULL) { 668 REL_PORTHASH_TOKEN(porthash); 669 return FALSE; 670 } 671 inp->inp_lport = lport; 672 in_pcbinsporthash(porthash, inp); 673 674 REL_PORTHASH_TOKEN(porthash); 675 return TRUE; 676 } 677 678 int 679 in_pcbbind_remote(struct inpcb *inp, const struct sockaddr *remote, 680 struct thread *td) 681 { 682 struct proc *p = td->td_proc; 683 const struct sockaddr_in *sin = (const struct sockaddr_in *)remote; 684 struct sockaddr_in jsin; 685 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 686 struct ucred *cred = NULL; 687 u_short first, last, lport; 688 int count, hash_count; 689 int error, selfconn = 0; 690 int cpuid = mycpuid; 691 uint32_t hash_base = 0, hash; 692 693 ASSERT_NETISR_NCPUS(cpuid); 694 695 if (TAILQ_EMPTY(&in_ifaddrheads[cpuid])) /* XXX broken! */ 696 return (EADDRNOTAVAIL); 697 698 KKASSERT(inp->inp_laddr.s_addr != INADDR_ANY); 699 if (inp->inp_lport != 0) 700 return (EINVAL); /* already bound */ 701 702 KKASSERT(p); 703 cred = p->p_ucred; 704 705 jsin.sin_family = AF_INET; 706 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr; 707 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) { 708 inp->inp_laddr.s_addr = INADDR_ANY; 709 return (EINVAL); 710 } 711 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr; 712 713 hash_count = ip_porthash_trycount; 714 if (hash_count > 0) { 715 hash_base = toeplitz_piecemeal_addr(sin->sin_addr.s_addr) ^ 716 toeplitz_piecemeal_addr(inp->inp_laddr.s_addr) ^ 717 toeplitz_piecemeal_port(sin->sin_port); 718 } else { 719 hash_count = 0; 720 } 721 722 inp->inp_flags |= INP_ANONPORT; 723 724 if (inp->inp_flags & INP_HIGHPORT) { 725 first = ipport_hifirstauto; /* sysctl */ 726 last = ipport_hilastauto; 727 } else if (inp->inp_flags & INP_LOWPORT) { 728 if (cred && 729 (error = 730 priv_check_cred(cred, 
PRIV_NETINET_RESERVEDPORT, 0))) { 731 inp->inp_laddr.s_addr = INADDR_ANY; 732 return (error); 733 } 734 first = ipport_lowfirstauto; /* 1023 */ 735 last = ipport_lowlastauto; /* 600 */ 736 } else { 737 first = ipport_firstauto; /* sysctl */ 738 last = ipport_lastauto; 739 } 740 if (first > last) { 741 lport = last; 742 last = first; 743 first = lport; 744 } 745 KKASSERT(last >= first); 746 747 count = last - first; 748 lport = (karc4random() % count) + first; 749 count += hash_count; 750 751 /* 752 * Simple check to ensure all ports are not used up causing 753 * a deadlock here. 754 */ 755 for (;;) { 756 u_short lport_no; 757 758 if (count-- < 0) { /* completely used? */ 759 error = EADDRNOTAVAIL; 760 break; 761 } 762 763 if (__predict_false(lport < first || lport > last)) 764 lport = first; 765 lport_no = htons(lport); 766 767 /* This could happen on loopback interface */ 768 if (__predict_false(sin->sin_port == lport_no && 769 sin->sin_addr.s_addr == inp->inp_laddr.s_addr)) { 770 if (!selfconn) { 771 ++count; /* don't count this try */ 772 selfconn = 1; 773 } 774 goto next; 775 } 776 777 if (hash_count) { 778 --hash_count; 779 hash = hash_base ^ 780 toeplitz_piecemeal_port(lport_no); 781 if (netisr_hashcpu(hash) != cpuid && hash_count) 782 goto next; 783 } 784 785 if (in_pcbporthash_update4( 786 &pcbinfo->portinfo[lport % pcbinfo->portinfo_cnt], 787 inp, lport_no, sin, cred)) { 788 error = 0; 789 break; 790 } 791 next: 792 ++lport; 793 } 794 795 if (error) 796 inp->inp_laddr.s_addr = INADDR_ANY; 797 return (error); 798 } 799 800 /* 801 * Transform old in_pcbconnect() into an inner subroutine for new 802 * in_pcbconnect(): Do some validity-checking on the remote 803 * address (in mbuf 'nam') and then determine local host address 804 * (i.e., which interface) to use to access that remote host. 805 * 806 * This preserves definition of in_pcbconnect(), while supporting a 807 * slightly different version for T/TCP. 
(This is more than 808 * a bit of a kludge, but cleaning up the internal interfaces would 809 * have forced minor changes in every protocol). 810 */ 811 int 812 in_pcbladdr_find(struct inpcb *inp, struct sockaddr *nam, 813 struct sockaddr_in **plocal_sin, struct thread *td, int find) 814 { 815 struct in_ifaddr *ia; 816 struct ucred *cred = NULL; 817 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 818 struct sockaddr *jsin; 819 int jailed = 0, alloc_route = 0; 820 821 if (nam->sa_len != sizeof *sin) 822 return (EINVAL); 823 if (sin->sin_family != AF_INET) 824 return (EAFNOSUPPORT); 825 if (sin->sin_port == 0) 826 return (EADDRNOTAVAIL); 827 if (td && td->td_proc && td->td_proc->p_ucred) 828 cred = td->td_proc->p_ucred; 829 if (cred && cred->cr_prison) 830 jailed = 1; 831 if (!TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) { 832 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia; 833 /* 834 * If the destination address is INADDR_ANY, 835 * use the primary local address. 836 * If the supplied address is INADDR_BROADCAST, 837 * and the primary interface supports broadcast, 838 * choose the broadcast address for that interface. 839 */ 840 if (sin->sin_addr.s_addr == INADDR_ANY) 841 sin->sin_addr = IA_SIN(ia)->sin_addr; 842 else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST && 843 (ia->ia_ifp->if_flags & IFF_BROADCAST)) 844 sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr; 845 } 846 if (find) { 847 struct route *ro; 848 849 ia = NULL; 850 /* 851 * If route is known or can be allocated now, 852 * our src addr is taken from the i/f, else punt. 853 * Note that we should check the address family of the cached 854 * destination, in case of sharing the cache with IPv6. 
855 */ 856 ro = &inp->inp_route; 857 if (ro->ro_rt && 858 (!(ro->ro_rt->rt_flags & RTF_UP) || 859 ro->ro_dst.sa_family != AF_INET || 860 satosin(&ro->ro_dst)->sin_addr.s_addr != 861 sin->sin_addr.s_addr || 862 inp->inp_socket->so_options & SO_DONTROUTE)) { 863 RTFREE(ro->ro_rt); 864 ro->ro_rt = NULL; 865 } 866 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/ 867 (ro->ro_rt == NULL || 868 ro->ro_rt->rt_ifp == NULL)) { 869 /* No route yet, so try to acquire one */ 870 bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); 871 ro->ro_dst.sa_family = AF_INET; 872 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 873 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = 874 sin->sin_addr; 875 rtalloc(ro); 876 alloc_route = 1; 877 } 878 /* 879 * If we found a route, use the address 880 * corresponding to the outgoing interface 881 * unless it is the loopback (in case a route 882 * to our address on another net goes to loopback). 883 */ 884 if (ro->ro_rt && 885 !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) { 886 if (jailed) { 887 if (jailed_ip(cred->cr_prison, 888 ro->ro_rt->rt_ifa->ifa_addr)) { 889 ia = ifatoia(ro->ro_rt->rt_ifa); 890 } 891 } else { 892 ia = ifatoia(ro->ro_rt->rt_ifa); 893 } 894 } 895 if (ia == NULL) { 896 u_short fport = sin->sin_port; 897 898 sin->sin_port = 0; 899 ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin))); 900 if (ia && jailed && !jailed_ip(cred->cr_prison, 901 sintosa(&ia->ia_addr))) 902 ia = NULL; 903 if (ia == NULL) 904 ia = ifatoia(ifa_ifwithnet(sintosa(sin))); 905 if (ia && jailed && !jailed_ip(cred->cr_prison, 906 sintosa(&ia->ia_addr))) 907 ia = NULL; 908 sin->sin_port = fport; 909 if (ia == NULL && 910 !TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) 911 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia; 912 if (ia && jailed && !jailed_ip(cred->cr_prison, 913 sintosa(&ia->ia_addr))) 914 ia = NULL; 915 916 if (!jailed && ia == NULL) 917 goto fail; 918 } 919 /* 920 * If the destination address is multicast and an outgoing 921 * interface has been set as 
a multicast option, use the 922 * address of that interface as our source address. 923 */ 924 if (!jailed && IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && 925 inp->inp_moptions != NULL) { 926 struct ip_moptions *imo; 927 struct ifnet *ifp; 928 929 imo = inp->inp_moptions; 930 if ((ifp = imo->imo_multicast_ifp) != NULL) { 931 struct in_ifaddr_container *iac; 932 933 ia = NULL; 934 TAILQ_FOREACH(iac, 935 &in_ifaddrheads[mycpuid], ia_link) { 936 if (iac->ia->ia_ifp == ifp) { 937 ia = iac->ia; 938 break; 939 } 940 } 941 if (ia == NULL) 942 goto fail; 943 } 944 } 945 /* 946 * Don't do pcblookup call here; return interface in plocal_sin 947 * and exit to caller, that will do the lookup. 948 */ 949 if (ia == NULL && jailed) { 950 if ((jsin = prison_get_nonlocal( 951 cred->cr_prison, AF_INET, NULL)) != NULL || 952 (jsin = prison_get_local( 953 cred->cr_prison, AF_INET, NULL)) != NULL) { 954 *plocal_sin = satosin(jsin); 955 } else { 956 /* IPv6 only Jail */ 957 goto fail; 958 } 959 } else { 960 *plocal_sin = &ia->ia_addr; 961 } 962 } 963 return (0); 964 fail: 965 if (alloc_route) 966 in_pcbresetroute(inp); 967 return (EADDRNOTAVAIL); 968 } 969 970 int 971 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, 972 struct sockaddr_in **plocal_sin, struct thread *td) 973 { 974 return in_pcbladdr_find(inp, nam, plocal_sin, td, 975 (inp->inp_laddr.s_addr == INADDR_ANY)); 976 } 977 978 /* 979 * Outer subroutine: 980 * Connect from a socket to a specified address. 981 * Both address and port must be specified in argument sin. 982 * If don't have a local address for this socket yet, 983 * then pick one. 984 */ 985 int 986 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td) 987 { 988 struct sockaddr_in *if_sin; 989 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 990 int error; 991 992 if_sin = NULL; /* avoid gcc warnings */ 993 994 /* Call inner routine to assign local interface address. 
*/ 995 if ((error = in_pcbladdr(inp, nam, &if_sin, td)) != 0) 996 return (error); 997 998 if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, 999 inp->inp_laddr.s_addr ? 1000 inp->inp_laddr : if_sin->sin_addr, 1001 inp->inp_lport, FALSE, NULL) != NULL) { 1002 return (EADDRINUSE); 1003 } 1004 if (inp->inp_laddr.s_addr == INADDR_ANY) { 1005 if (inp->inp_lport == 0) { 1006 error = in_pcbbind(inp, NULL, td); 1007 if (error) 1008 return (error); 1009 } 1010 inp->inp_laddr = if_sin->sin_addr; 1011 } 1012 inp->inp_faddr = sin->sin_addr; 1013 inp->inp_fport = sin->sin_port; 1014 in_pcbinsconnhash(inp); 1015 return (0); 1016 } 1017 1018 void 1019 in_pcbdisconnect(struct inpcb *inp) 1020 { 1021 1022 in_pcbremconnhash(inp); 1023 inp->inp_faddr.s_addr = INADDR_ANY; 1024 inp->inp_fport = 0; 1025 } 1026 1027 void 1028 in_pcbdetach(struct inpcb *inp) 1029 { 1030 struct socket *so = inp->inp_socket; 1031 struct inpcbinfo *ipi = inp->inp_pcbinfo; 1032 1033 inp->inp_gencnt = ++ipi->ipi_gencnt; 1034 KKASSERT((so->so_state & SS_ASSERTINPROG) == 0); 1035 in_pcbremlists(inp); 1036 so->so_pcb = NULL; 1037 sofree(so); /* remove pcb ref */ 1038 if (inp->inp_options) 1039 m_free(inp->inp_options); 1040 if (inp->inp_route.ro_rt) 1041 rtfree(inp->inp_route.ro_rt); 1042 ip_freemoptions(inp->inp_moptions); 1043 kfree(inp, M_PCB); 1044 } 1045 1046 /* 1047 * The socket may have an invalid PCB, i.e. NULL. For example, a TCP 1048 * socket received RST. 
1049 */ 1050 static int 1051 in_setsockaddr(struct socket *so, struct sockaddr **nam) 1052 { 1053 struct inpcb *inp; 1054 struct sockaddr_in *sin; 1055 1056 KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr")); 1057 inp = so->so_pcb; 1058 if (!inp) 1059 return (ECONNRESET); 1060 1061 sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); 1062 sin->sin_family = AF_INET; 1063 sin->sin_len = sizeof *sin; 1064 sin->sin_port = inp->inp_lport; 1065 sin->sin_addr = inp->inp_laddr; 1066 1067 *nam = (struct sockaddr *)sin; 1068 return (0); 1069 } 1070 1071 void 1072 in_setsockaddr_dispatch(netmsg_t msg) 1073 { 1074 int error; 1075 1076 error = in_setsockaddr(msg->base.nm_so, msg->peeraddr.nm_nam); 1077 lwkt_replymsg(&msg->lmsg, error); 1078 } 1079 1080 /* 1081 * The socket may have an invalid PCB, i.e. NULL. For example, a TCP 1082 * socket received RST. 1083 */ 1084 int 1085 in_setpeeraddr(struct socket *so, struct sockaddr **nam) 1086 { 1087 struct inpcb *inp; 1088 struct sockaddr_in *sin; 1089 1090 KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr")); 1091 inp = so->so_pcb; 1092 if (!inp) 1093 return (ECONNRESET); 1094 1095 sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); 1096 sin->sin_family = AF_INET; 1097 sin->sin_len = sizeof *sin; 1098 sin->sin_port = inp->inp_fport; 1099 sin->sin_addr = inp->inp_faddr; 1100 1101 *nam = (struct sockaddr *)sin; 1102 return (0); 1103 } 1104 1105 void 1106 in_setpeeraddr_dispatch(netmsg_t msg) 1107 { 1108 int error; 1109 1110 error = in_setpeeraddr(msg->base.nm_so, msg->peeraddr.nm_nam); 1111 lwkt_replymsg(&msg->lmsg, error); 1112 } 1113 1114 void 1115 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int err, 1116 inp_notify_t notify) 1117 { 1118 struct inpcb *inp, *marker; 1119 1120 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1121 ("not in the correct netisr")); 1122 marker = in_pcbmarker(); 1123 1124 /* 1125 * NOTE: 1126 * - If INP_PLACEMARKER is set we must ignore 
the rest of the 1127 * structure and skip it. 1128 * - It is safe to nuke inpcbs here, since we are in their own 1129 * netisr. 1130 */ 1131 GET_PCBINFO_TOKEN(pcbinfo); 1132 1133 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 1134 while ((inp = LIST_NEXT(marker, inp_list)) != NULL) { 1135 LIST_REMOVE(marker, inp_list); 1136 LIST_INSERT_AFTER(inp, marker, inp_list); 1137 1138 if (inp->inp_flags & INP_PLACEMARKER) 1139 continue; 1140 #ifdef INET6 1141 if (!INP_ISIPV4(inp)) 1142 continue; 1143 #endif 1144 if (inp->inp_faddr.s_addr != faddr.s_addr || 1145 inp->inp_socket == NULL) 1146 continue; 1147 (*notify)(inp, err); /* can remove inp from list! */ 1148 } 1149 LIST_REMOVE(marker, inp_list); 1150 1151 REL_PCBINFO_TOKEN(pcbinfo); 1152 } 1153 1154 void 1155 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 1156 { 1157 struct inpcb *inp, *marker; 1158 1159 /* 1160 * We only need to make sure that we are in netisr0, where all 1161 * multicast operation happen. We could check inpcbinfo which 1162 * does not belong to netisr0 by holding the inpcbinfo's token. 1163 * In this case, the pcbinfo must be able to be shared, i.e. 1164 * pcbinfo->infotoken is not NULL. 1165 */ 1166 ASSERT_NETISR0; 1167 KASSERT(pcbinfo->cpu == 0 || pcbinfo->infotoken != NULL, 1168 ("pcbinfo could not be shared")); 1169 1170 /* 1171 * Get a marker for the current netisr (netisr0). 1172 * 1173 * It is possible that the multicast address deletion blocks, 1174 * which could cause temporary token releasing. So we use 1175 * inpcb marker here to get a coherent view of the inpcb list. 1176 * 1177 * While, on the other hand, moptions are only added and deleted 1178 * in netisr0, so we would not see staled moption or miss moption 1179 * even if the token was released due to the blocking multicast 1180 * address deletion. 
1181 */ 1182 marker = in_pcbmarker(); 1183 1184 GET_PCBINFO_TOKEN(pcbinfo); 1185 1186 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 1187 while ((inp = LIST_NEXT(marker, inp_list)) != NULL) { 1188 struct ip_moptions *imo; 1189 1190 LIST_REMOVE(marker, inp_list); 1191 LIST_INSERT_AFTER(inp, marker, inp_list); 1192 1193 if (inp->inp_flags & INP_PLACEMARKER) 1194 continue; 1195 imo = inp->inp_moptions; 1196 if (INP_ISIPV4(inp) && imo != NULL) { 1197 int i, gap; 1198 1199 /* 1200 * Unselect the outgoing interface if it is being 1201 * detached. 1202 */ 1203 if (imo->imo_multicast_ifp == ifp) 1204 imo->imo_multicast_ifp = NULL; 1205 1206 /* 1207 * Drop multicast group membership if we joined 1208 * through the interface being detached. 1209 */ 1210 for (i = 0, gap = 0; i < imo->imo_num_memberships; 1211 i++) { 1212 if (imo->imo_membership[i]->inm_ifp == ifp) { 1213 /* 1214 * NOTE: 1215 * This could block and the pcbinfo 1216 * token could be passively released. 1217 */ 1218 in_delmulti(imo->imo_membership[i]); 1219 gap++; 1220 } else if (gap != 0) 1221 imo->imo_membership[i - gap] = 1222 imo->imo_membership[i]; 1223 } 1224 imo->imo_num_memberships -= gap; 1225 } 1226 } 1227 LIST_REMOVE(marker, inp_list); 1228 1229 REL_PCBINFO_TOKEN(pcbinfo); 1230 } 1231 1232 /* 1233 * Check for alternatives when higher level complains 1234 * about service problems. For now, invalidate cached 1235 * routing information. If the route was created dynamically 1236 * (by a redirect), time to try a default gateway again. 
1237 */ 1238 void 1239 in_losing(struct inpcb *inp) 1240 { 1241 struct rtentry *rt; 1242 struct rt_addrinfo rtinfo; 1243 1244 if ((rt = inp->inp_route.ro_rt)) { 1245 bzero(&rtinfo, sizeof(struct rt_addrinfo)); 1246 rtinfo.rti_info[RTAX_DST] = rt_key(rt); 1247 rtinfo.rti_info[RTAX_GATEWAY] = rt->rt_gateway; 1248 rtinfo.rti_info[RTAX_NETMASK] = rt_mask(rt); 1249 rtinfo.rti_flags = rt->rt_flags; 1250 rt_missmsg(RTM_LOSING, &rtinfo, rt->rt_flags, 0); 1251 if (rt->rt_flags & RTF_DYNAMIC) { 1252 rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, 1253 rt_mask(rt), rt->rt_flags, NULL); 1254 } 1255 inp->inp_route.ro_rt = NULL; 1256 rtfree(rt); 1257 /* 1258 * A new route can be allocated 1259 * the next time output is attempted. 1260 */ 1261 } 1262 } 1263 1264 /* 1265 * After a routing change, flush old routing 1266 * and allocate a (hopefully) better one. 1267 */ 1268 void 1269 in_rtchange(struct inpcb *inp, int err) 1270 { 1271 if (inp->inp_route.ro_rt) { 1272 rtfree(inp->inp_route.ro_rt); 1273 inp->inp_route.ro_rt = NULL; 1274 /* 1275 * A new route can be allocated the next time 1276 * output is attempted. 1277 */ 1278 } 1279 } 1280 1281 /* 1282 * Lookup a PCB based on the local address and port. 1283 */ 1284 static struct inpcb * 1285 in_pcblookup_local(struct inpcbporthead *porthash, struct in_addr laddr, 1286 u_int lport_arg, int wild_okay, struct ucred *cred) 1287 { 1288 struct inpcb *inp; 1289 int matchwild = 3, wildcard; 1290 u_short lport = lport_arg; 1291 struct inpcbport *phd; 1292 struct inpcb *match = NULL; 1293 1294 /* 1295 * If the porthashbase is shared across several cpus, it must 1296 * have been locked. 1297 */ 1298 ASSERT_PORTHASH_TOKEN_HELD(porthash); 1299 1300 /* 1301 * Best fit PCB lookup. 1302 * 1303 * First see if this local port is in use by looking on the 1304 * port hash list. 
1305 */ 1306 LIST_FOREACH(phd, porthash, phd_hash) { 1307 if (phd->phd_port == lport) 1308 break; 1309 } 1310 if (phd != NULL) { 1311 /* 1312 * Port is in use by one or more PCBs. Look for best 1313 * fit. 1314 */ 1315 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 1316 wildcard = 0; 1317 #ifdef INET6 1318 if (!INP_ISIPV4(inp)) 1319 continue; 1320 #endif 1321 if (inp->inp_faddr.s_addr != INADDR_ANY) 1322 wildcard++; 1323 if (inp->inp_laddr.s_addr != INADDR_ANY) { 1324 if (laddr.s_addr == INADDR_ANY) 1325 wildcard++; 1326 else if (inp->inp_laddr.s_addr != laddr.s_addr) 1327 continue; 1328 } else { 1329 if (laddr.s_addr != INADDR_ANY) 1330 wildcard++; 1331 } 1332 if (wildcard && !wild_okay) 1333 continue; 1334 if (wildcard < matchwild && 1335 (cred == NULL || 1336 cred->cr_prison == 1337 inp->inp_socket->so_cred->cr_prison)) { 1338 match = inp; 1339 matchwild = wildcard; 1340 if (matchwild == 0) { 1341 break; 1342 } 1343 } 1344 } 1345 } 1346 return (match); 1347 } 1348 1349 struct inpcb * 1350 in_pcblocalgroup_last(const struct inpcbinfo *pcbinfo, 1351 const struct inpcb *inp) 1352 { 1353 const struct inp_localgrphead *hdr; 1354 const struct inp_localgroup *grp; 1355 int i; 1356 1357 if (pcbinfo->localgrphashbase == NULL) 1358 return NULL; 1359 1360 GET_PCBINFO_TOKEN(pcbinfo); 1361 1362 hdr = &pcbinfo->localgrphashbase[ 1363 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 1364 1365 LIST_FOREACH(grp, hdr, il_list) { 1366 if (grp->il_af == inp->inp_af && 1367 grp->il_lport == inp->inp_lport && 1368 memcmp(&grp->il_dependladdr, 1369 &inp->inp_inc.inc_ie.ie_dependladdr, 1370 sizeof(grp->il_dependladdr)) == 0) { 1371 break; 1372 } 1373 } 1374 if (grp == NULL || grp->il_inpcnt == 1) { 1375 REL_PCBINFO_TOKEN(pcbinfo); 1376 return NULL; 1377 } 1378 1379 KASSERT(grp->il_inpcnt >= 2, 1380 ("invalid localgroup inp count %d", grp->il_inpcnt)); 1381 for (i = 0; i < grp->il_inpcnt; ++i) { 1382 if (grp->il_inp[i] == inp) { 1383 int last = grp->il_inpcnt - 1; 
1384 1385 if (i == last) 1386 last = grp->il_inpcnt - 2; 1387 REL_PCBINFO_TOKEN(pcbinfo); 1388 return grp->il_inp[last]; 1389 } 1390 } 1391 REL_PCBINFO_TOKEN(pcbinfo); 1392 return NULL; 1393 } 1394 1395 static struct inpcb * 1396 inp_localgroup_lookup(const struct inpcbinfo *pcbinfo, 1397 struct in_addr laddr, uint16_t lport, uint32_t pkt_hash) 1398 { 1399 struct inpcb *local_wild = NULL; 1400 const struct inp_localgrphead *hdr; 1401 const struct inp_localgroup *grp; 1402 1403 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 1404 1405 hdr = &pcbinfo->localgrphashbase[ 1406 INP_PCBLOCALGRPHASH(lport, pcbinfo->localgrphashmask)]; 1407 1408 /* 1409 * Order of socket selection: 1410 * 1. non-wild. 1411 * 2. wild. 1412 * 1413 * NOTE: Local group does not contain jailed sockets 1414 */ 1415 LIST_FOREACH(grp, hdr, il_list) { 1416 #ifdef INET6 1417 if (grp->il_af != AF_INET) 1418 continue; 1419 #endif 1420 if (grp->il_lport == lport) { 1421 int idx; 1422 1423 /* 1424 * Modulo-N is used here, which greatly reduces 1425 * completion queue token contention, thus more 1426 * cpu time is saved. 1427 */ 1428 idx = netisr_hashlsb(pkt_hash) % grp->il_inpcnt; 1429 if (grp->il_laddr.s_addr == laddr.s_addr) 1430 return grp->il_inp[idx]; 1431 else if (grp->il_laddr.s_addr == INADDR_ANY) 1432 local_wild = grp->il_inp[idx]; 1433 } 1434 } 1435 if (local_wild != NULL) 1436 return local_wild; 1437 return NULL; 1438 } 1439 1440 /* 1441 * Lookup PCB in hash list. 1442 */ 1443 struct inpcb * 1444 in_pcblookup_pkthash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 1445 u_int fport_arg, struct in_addr laddr, u_int lport_arg, 1446 boolean_t wildcard, struct ifnet *ifp, const struct mbuf *m) 1447 { 1448 struct inpcbhead *head; 1449 struct inpcb *inp, *jinp=NULL; 1450 u_short fport = fport_arg, lport = lport_arg; 1451 1452 /* 1453 * First look for an exact match. 
1454 */ 1455 head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport, 1456 laddr.s_addr, lport, pcbinfo->hashmask)]; 1457 LIST_FOREACH(inp, head, inp_hash) { 1458 #ifdef INET6 1459 if (!INP_ISIPV4(inp)) 1460 continue; 1461 #endif 1462 if (in_hosteq(inp->inp_faddr, faddr) && 1463 in_hosteq(inp->inp_laddr, laddr) && 1464 inp->inp_fport == fport && inp->inp_lport == lport) { 1465 /* found */ 1466 if (inp->inp_socket == NULL || 1467 inp->inp_socket->so_cred->cr_prison == NULL) { 1468 return (inp); 1469 } else { 1470 if (jinp == NULL) 1471 jinp = inp; 1472 } 1473 } 1474 } 1475 if (jinp != NULL) 1476 return (jinp); 1477 1478 if (wildcard) { 1479 struct inpcb *local_wild = NULL; 1480 struct inpcb *jinp_wild = NULL; 1481 struct inpcontainer *ic; 1482 struct inpcontainerhead *chead; 1483 struct sockaddr_in jsin; 1484 struct ucred *cred; 1485 1486 GET_PCBINFO_TOKEN(pcbinfo); 1487 1488 /* 1489 * Check local group first 1490 */ 1491 if (pcbinfo->localgrphashbase != NULL && 1492 m != NULL && (m->m_flags & M_HASH)) { 1493 inp = inp_localgroup_lookup(pcbinfo, 1494 laddr, lport, m->m_pkthdr.hash); 1495 if (inp != NULL) { 1496 REL_PCBINFO_TOKEN(pcbinfo); 1497 return inp; 1498 } 1499 } 1500 1501 /* 1502 * Order of socket selection: 1503 * 1. non-jailed, non-wild. 1504 * 2. non-jailed, wild. 1505 * 3. jailed, non-wild. 1506 * 4. jailed, wild. 
1507 */ 1508 jsin.sin_family = AF_INET; 1509 chead = &pcbinfo->wildcardhashbase[ 1510 INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)]; 1511 LIST_FOREACH(ic, chead, ic_list) { 1512 inp = ic->ic_inp; 1513 if (inp->inp_flags & INP_PLACEMARKER) 1514 continue; 1515 1516 jsin.sin_addr.s_addr = laddr.s_addr; 1517 #ifdef INET6 1518 if (!INP_ISIPV4(inp)) 1519 continue; 1520 #endif 1521 if (inp->inp_socket != NULL) 1522 cred = inp->inp_socket->so_cred; 1523 else 1524 cred = NULL; 1525 if (cred != NULL && jailed(cred)) { 1526 if (jinp != NULL) 1527 continue; 1528 else 1529 if (!jailed_ip(cred->cr_prison, 1530 (struct sockaddr *)&jsin)) 1531 continue; 1532 } 1533 if (inp->inp_lport == lport) { 1534 if (inp->inp_laddr.s_addr == laddr.s_addr) { 1535 if (cred != NULL && jailed(cred)) { 1536 jinp = inp; 1537 } else { 1538 REL_PCBINFO_TOKEN(pcbinfo); 1539 return (inp); 1540 } 1541 } 1542 if (inp->inp_laddr.s_addr == INADDR_ANY) { 1543 if (cred != NULL && jailed(cred)) 1544 jinp_wild = inp; 1545 else 1546 local_wild = inp; 1547 } 1548 } 1549 } 1550 1551 REL_PCBINFO_TOKEN(pcbinfo); 1552 1553 if (local_wild != NULL) 1554 return (local_wild); 1555 if (jinp != NULL) 1556 return (jinp); 1557 return (jinp_wild); 1558 } 1559 1560 /* 1561 * Not found. 1562 */ 1563 return (NULL); 1564 } 1565 1566 struct inpcb * 1567 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 1568 u_int fport_arg, struct in_addr laddr, u_int lport_arg, 1569 boolean_t wildcard, struct ifnet *ifp) 1570 { 1571 return in_pcblookup_pkthash(pcbinfo, faddr, fport_arg, 1572 laddr, lport_arg, wildcard, ifp, NULL); 1573 } 1574 1575 /* 1576 * Insert PCB into connection hash table. 
*/
void
in_pcbinsconnhash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *bucket;
	u_int32_t hashkey_faddr, hashkey_laddr;

	/*
	 * Pick the address words fed to the hash: for IPv6 inpcbs only
	 * the last 32 bits of each address are used.
	 */
#ifdef INET6
	if (INP_ISIPV6(inp)) {
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */;
		hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */;
	} else {
#endif
		hashkey_faddr = inp->inp_faddr.s_addr;
		hashkey_laddr = inp->inp_laddr.s_addr;
#ifdef INET6
	}
#endif

	KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
	    ("not in the correct netisr"));
	ASSERT_INP_NOTINHASH(inp);
	inp->inp_flags |= INP_CONNECTED;

	/*
	 * Insert into the connection hash table.
	 */
	bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr,
	    inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)];
	LIST_INSERT_HEAD(bucket, inp, inp_hash);
}

/*
 * Remove PCB from connection hash table.
 *
 * The inpcb must be flagged INP_CONNECTED and the caller must be the
 * netisr owning its pcbinfo (both asserted).  Clears INP_CONNECTED.
 */
void
in_pcbremconnhash(struct inpcb *inp)
{
	/* Only referenced by the assertion below. */
	struct inpcbinfo *pcbinfo __debugvar = inp->inp_pcbinfo;

	KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
	    ("not in the correct netisr"));
	KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected"));

	LIST_REMOVE(inp, inp_hash);
	inp->inp_flags &= ~INP_CONNECTED;
}

/*
 * Insert PCB into port hash table.
 *
 * Links 'inp' onto the per-port list for inp->inp_lport in the bucket
 * 'pcbporthash', creating the per-port head from pcbinfo->portsave if
 * the port has none yet, and records the bucket and head in the inpcb.
 * The porthash token must be held (asserted).
 */
void
in_pcbinsporthash(struct inpcbporthead *pcbporthash, struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;

	/*
	 * If the porthashbase is shared across several cpus, it must
	 * have been locked.
	 */
	ASSERT_PORTHASH_TOKEN_HELD(pcbporthash);

	/*
	 * Insert into the port hash table.
	 */

	/* Go through port list and look for a head for this lport. */
	LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport)
			break;
	}

	/* If none exists, use saved one and tack it on. */
	if (phd == NULL) {
		KKASSERT(pcbinfo->portsave != NULL);
		phd = pcbinfo->portsave;
		pcbinfo->portsave = NULL;
		phd->phd_port = inp->inp_lport;
		LIST_INIT(&phd->phd_pcblist);
		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}

	inp->inp_porthash = pcbporthash;
	inp->inp_phd = phd;
	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);

	/*
	 * Malloc one inpcbport for later use.  It is safe to use
	 * "wait" malloc here (port token would be released, if
	 * malloc ever blocked), since all changes to the porthash
	 * are done.
	 */
	if (pcbinfo->portsave == NULL) {
		pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave),
		    M_PCB, M_INTWAIT | M_ZERO);
	}
}

/*
 * Insert 'inp' into the port hash, selecting the portinfo and the
 * bucket from inp->inp_lport.  The bucket's token is taken around the
 * insertion.
 */
void
in_pcbinsporthash_lport(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbportinfo *portinfo;
	struct inpcbporthead *porthash;
	u_short lport_ho;

	/* Locate the proper portinfo based on lport */
	lport_ho = ntohs(inp->inp_lport);	/* host byte order */
	portinfo = &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt];
	KKASSERT((lport_ho % pcbinfo->portinfo_cnt) == portinfo->offset);

	porthash = in_pcbporthash_head(portinfo, inp->inp_lport);
	GET_PORTHASH_TOKEN(porthash);
	in_pcbinsporthash(porthash, inp);
	REL_PORTHASH_TOKEN(porthash);
}

/*
 * Remove 'inp' from the port hash.  No-op when the inpcb has no
 * per-port head (inp_phd == NULL).  The per-port head is unlinked and
 * freed once its PCB list becomes empty.
 */
void
in_pcbremporthash(struct inpcb *inp)
{
	struct inpcbporthead *porthash;
	struct inpcbport *phd;

	if (inp->inp_phd == NULL)
		return;
	KASSERT(inp->inp_lport != 0, ("inpcb has no lport"));

	porthash = inp->inp_porthash;
	KASSERT(porthash != NULL, ("no porthash"));

	GET_PORTHASH_TOKEN(porthash);

	phd = inp->inp_phd;
	LIST_REMOVE(inp, inp_portlist);
	if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
		/* Last PCB on this port; release the per-port head. */
		LIST_REMOVE(phd, phd_hash);
		kfree(phd, M_PCB);
	}

	REL_PORTHASH_TOKEN(porthash);

	inp->inp_phd = NULL;
	/* NOTE: Don't whack inp_lport, which may be used later */
}

/*
 * Allocate a zeroed local group with room for 'size' inpcbs and set
 * its address family, local port, local address and capacity.  The
 * member count stays 0 and the group is not linked anywhere.
 */
static struct inp_localgroup *
inp_localgroup_alloc(u_char af, uint16_t port,
    const union in_dependaddr *addr, int size)
{
	struct inp_localgroup *grp;

	/* The header plus 'size' slots of the trailing il_inp array. */
	grp = kmalloc(__offsetof(struct inp_localgroup, il_inp[size]),
	    M_TEMP, M_INTWAIT | M_ZERO);
	grp->il_af = af;
	grp->il_lport = port;
	grp->il_dependladdr = *addr;
	grp->il_inpsiz = size;

	return grp;
}

/*
 * Free a local group that is not (or no longer) linked on a hash list.
 */
static void
inp_localgroup_free(struct inp_localgroup *grp)
{
	kfree(grp, M_TEMP);
}

/*
 * Unlink a local group from its hash list and free it.
 */
static void
inp_localgroup_destroy(struct inp_localgroup *grp)
{
	LIST_REMOVE(grp, il_list);
	inp_localgroup_free(grp);
}

/*
 * Copy the members and the member count of 'old_grp' into 'grp'.
 * 'grp' must have a capacity strictly larger than old_grp's member
 * count (asserted).
 */
static void
inp_localgroup_copy(struct inp_localgroup *grp,
    const struct inp_localgroup *old_grp)
{
	int i;

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));
	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
}

/*
 * Add 'inp' to the local group matching its address family, local port
 * and local address, creating the group or doubling its capacity when
 * needed.
 *
 * Nothing is done when pcbinfo has no local group hash table, when the
 * socket's credential is jailed, or when the group is full and already
 * at INP_LOCALGROUP_SIZMAX (logged once).
 *
 * Group allocation may block, so after every allocation the lookup is
 * restarted from 'again' and the pre-allocated group (grp_alloc) is
 * either used, replaced by a larger one, or freed, depending on what
 * the list looks like by then.
 *
 * The pcbinfo token must be held (asserted).
 */
static void
in_pcbinslocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
{
	struct inp_localgrphead *hdr;
	struct inp_localgroup *grp, *grp_alloc = NULL;
	struct ucred *cred;
	int i, idx;

	ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);

	if (pcbinfo->localgrphashbase == NULL)
		return;

	/*
	 * XXX don't allow jailed socket to join local group
	 */
	if (inp->inp_socket != NULL)
		cred = inp->inp_socket->so_cred;
	else
		cred = NULL;
	if (cred != NULL && jailed(cred))
		return;

	hdr = &pcbinfo->localgrphashbase[
	    INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];

again:
	/* Find the group with inp's family, local port and local address. */
	LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_af == inp->inp_af &&
		    grp->il_lport == inp->inp_lport &&
		    memcmp(&grp->il_dependladdr,
			&inp->inp_inc.inc_ie.ie_dependladdr,
			sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/*
		 * Create a new local group
		 */
		if (grp_alloc == NULL) {
			grp_alloc = inp_localgroup_alloc(inp->inp_af,
			    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
			    INP_LOCALGROUP_SIZMIN);
			/*
			 * Local group allocation could block and the
			 * local group w/ the same property might have
			 * been added by others when we were blocked;
			 * check again.
			 */
			goto again;
		} else {
			/* Local group has been allocated; link it */
			grp = grp_alloc;
			grp_alloc = NULL;
			LIST_INSERT_HEAD(hdr, grp, il_list);
		}
	} else if (grp->il_inpcnt == grp->il_inpsiz) {
		/* The group exists but has no free slot. */
		if (grp->il_inpsiz >= INP_LOCALGROUP_SIZMAX) {
			static int limit_logged = 0;

			if (!limit_logged) {
				limit_logged = 1;
				kprintf("local group port %d, "
				    "limit reached\n", ntohs(grp->il_lport));
			}
			if (grp_alloc != NULL) {
				/*
				 * This would happen if the local group
				 * w/ the same property was expanded when
				 * our local group allocation blocked.
				 */
				inp_localgroup_free(grp_alloc);
			}
			return;
		}

		/*
		 * Expand this local group
		 */
		if (grp_alloc == NULL ||
		    grp->il_inpcnt >= grp_alloc->il_inpsiz) {
			/* No spare group yet, or the spare one is too small. */
			if (grp_alloc != NULL)
				inp_localgroup_free(grp_alloc);
			grp_alloc = inp_localgroup_alloc(grp->il_af,
			    grp->il_lport, &grp->il_dependladdr,
			    grp->il_inpsiz * 2);
			/*
			 * Local group allocation could block and the
			 * local group w/ the same property might have
			 * been expanded by others when we were blocked;
			 * check again.
			 */
			goto again;
		}

		/*
		 * Save the old local group, link the new one, and then
		 * destroy the old local group
		 */
		inp_localgroup_copy(grp_alloc, grp);
		LIST_INSERT_HEAD(hdr, grp_alloc, il_list);
		inp_localgroup_destroy(grp);

		grp = grp_alloc;
		grp_alloc = NULL;
	} else {
		/*
		 * Found the local group
		 */
		if (grp_alloc != NULL) {
			/*
			 * This would happen if the local group w/ the
			 * same property was added or expanded when our
			 * local group allocation blocked.
			 */
			inp_localgroup_free(grp_alloc);
			grp_alloc = NULL;
		}
	}

	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d",
	     grp->il_inpsiz, grp->il_inpcnt));

	/*
	 * Keep the local group sorted by the inpcb local group index
	 * in ascending order.
	 *
	 * This eases the multi-process userland application which uses
	 * SO_REUSEPORT sockets and binds each process to the owner cpu
	 * of the SO_REUSEPORT socket:
	 * If we didn't sort the local group by the inpcb local group
	 * index and one of the processes owning an inpcb in this local
	 * group restarted, e.g. crashed and restarted by watchdog,
	 * other processes owning an inpcb in this local group would have
	 * to detect that event, refetch its socket's owner cpu, and
	 * re-bind.
	 */
	idx = grp->il_inpcnt;
	for (i = 0; i < idx; ++i) {
		struct inpcb *oinp = grp->il_inp[i];

		/*
		 * Slot i is held by an inpcb whose group index is above
		 * i; take the slot over and shift the rest down by one.
		 */
		if (oinp->inp_lgrpindex > i) {
			if (inp->inp_lgrpindex < 0) {
				/* No index assigned yet; adopt the slot. */
				inp->inp_lgrpindex = i;
			} else if (inp->inp_lgrpindex != i) {
				/* inp_lgrpindex is kept; only reported. */
				if (bootverbose) {
					kprintf("inp %p: grpidx %d, "
					    "assigned to %d, cpu%d\n",
					    inp, inp->inp_lgrpindex, i,
					    mycpuid);
				}
			}
			grp->il_inp[i] = inp;

			/* Pull down inpcbs */
			for (; i < grp->il_inpcnt; ++i) {
				struct inpcb *oinp1 = grp->il_inp[i + 1];

				grp->il_inp[i + 1] = oinp;
				oinp = oinp1;
			}
			grp->il_inpcnt++;
			return;
		}
	}

	/* No slot to take over; append at the end. */
	if (inp->inp_lgrpindex < 0) {
		inp->inp_lgrpindex = idx;
	} else if (inp->inp_lgrpindex != idx) {
		if (bootverbose) {
			kprintf("inp %p: grpidx %d, assigned to %d, cpu%d\n",
			    inp, inp->inp_lgrpindex, idx, mycpuid);
		}
	}
	grp->il_inp[idx] = inp;
	grp->il_inpcnt++;
}

/*
 * Under the pcbinfo token: add 'inp' to its local group (if any), then
 * link a newly allocated container for it into the wildcard hash
 * bucket selected by inp->inp_lport.
 */
void
in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
{
	struct inpcontainer *ic;
	struct inpcontainerhead *bucket;

	GET_PCBINFO_TOKEN(pcbinfo);

	in_pcbinslocalgrphash_oncpu(inp, pcbinfo);

	bucket = &pcbinfo->wildcardhashbase[
	    INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];

	ic = kmalloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT);
	ic->ic_inp = inp;
	LIST_INSERT_HEAD(bucket, ic, ic_list);

	REL_PCBINFO_TOKEN(pcbinfo);
}

/*
 * Insert PCB into wildcard hash table.
1969 */ 1970 void 1971 in_pcbinswildcardhash(struct inpcb *inp) 1972 { 1973 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1974 1975 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1976 ("not in correct netisr")); 1977 ASSERT_INP_NOTINHASH(inp); 1978 inp->inp_flags |= INP_WILDCARD; 1979 1980 in_pcbinswildcardhash_oncpu(inp, pcbinfo); 1981 } 1982 1983 static void 1984 in_pcbremlocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 1985 { 1986 struct inp_localgrphead *hdr; 1987 struct inp_localgroup *grp; 1988 1989 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 1990 1991 if (pcbinfo->localgrphashbase == NULL) 1992 return; 1993 1994 hdr = &pcbinfo->localgrphashbase[ 1995 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 1996 1997 LIST_FOREACH(grp, hdr, il_list) { 1998 int i; 1999 2000 for (i = 0; i < grp->il_inpcnt; ++i) { 2001 if (grp->il_inp[i] != inp) 2002 continue; 2003 2004 if (grp->il_inpcnt == 1) { 2005 /* Destroy this local group */ 2006 inp_localgroup_destroy(grp); 2007 } else { 2008 /* Pull up inpcbs */ 2009 for (; i + 1 < grp->il_inpcnt; ++i) 2010 grp->il_inp[i] = grp->il_inp[i + 1]; 2011 grp->il_inpcnt--; 2012 } 2013 return; 2014 } 2015 } 2016 } 2017 2018 void 2019 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 2020 { 2021 struct inpcontainer *ic; 2022 struct inpcontainerhead *head; 2023 2024 GET_PCBINFO_TOKEN(pcbinfo); 2025 2026 in_pcbremlocalgrphash_oncpu(inp, pcbinfo); 2027 2028 /* find bucket */ 2029 head = &pcbinfo->wildcardhashbase[ 2030 INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)]; 2031 2032 LIST_FOREACH(ic, head, ic_list) { 2033 if (ic->ic_inp == inp) 2034 goto found; 2035 } 2036 REL_PCBINFO_TOKEN(pcbinfo); 2037 return; /* not found! */ 2038 2039 found: 2040 LIST_REMOVE(ic, ic_list); /* remove container from bucket chain */ 2041 REL_PCBINFO_TOKEN(pcbinfo); 2042 kfree(ic, M_TEMP); /* deallocate container */ 2043 } 2044 2045 /* 2046 * Remove PCB from wildcard hash table. 
2047 */ 2048 void 2049 in_pcbremwildcardhash(struct inpcb *inp) 2050 { 2051 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2052 2053 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 2054 ("not in correct netisr")); 2055 KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard")); 2056 2057 in_pcbremwildcardhash_oncpu(inp, pcbinfo); 2058 inp->inp_lgrpindex = -1; 2059 inp->inp_flags &= ~INP_WILDCARD; 2060 } 2061 2062 /* 2063 * Remove PCB from various lists. 2064 */ 2065 void 2066 in_pcbremlists(struct inpcb *inp) 2067 { 2068 in_pcbremporthash(inp); 2069 if (inp->inp_flags & INP_WILDCARD) { 2070 in_pcbremwildcardhash(inp); 2071 } else if (inp->inp_flags & INP_CONNECTED) { 2072 in_pcbremconnhash(inp); 2073 } 2074 2075 if (inp->inp_flags & INP_ONLIST) 2076 in_pcbofflist(inp); 2077 } 2078 2079 int 2080 prison_xinpcb(struct thread *td, struct inpcb *inp) 2081 { 2082 struct ucred *cr; 2083 2084 if (td->td_proc == NULL) 2085 return (0); 2086 cr = td->td_proc->p_ucred; 2087 if (cr->cr_prison == NULL) 2088 return (0); 2089 if (inp->inp_socket && inp->inp_socket->so_cred && 2090 inp->inp_socket->so_cred->cr_prison && 2091 cr->cr_prison == inp->inp_socket->so_cred->cr_prison) 2092 return (0); 2093 return (1); 2094 } 2095 2096 int 2097 in_pcblist_range(SYSCTL_HANDLER_ARGS) 2098 { 2099 struct inpcbinfo *pcbinfo_arr = arg1; 2100 int pcbinfo_arrlen = arg2; 2101 struct inpcb *marker; 2102 int cpu, origcpu; 2103 int error, n; 2104 2105 KASSERT(pcbinfo_arrlen <= netisr_ncpus && pcbinfo_arrlen >= 1, 2106 ("invalid pcbinfo count %d", pcbinfo_arrlen)); 2107 2108 /* 2109 * The process of preparing the TCB list is too time-consuming and 2110 * resource-intensive to repeat twice on every request. 
2111 */ 2112 n = 0; 2113 if (req->oldptr == NULL) { 2114 for (cpu = 0; cpu < pcbinfo_arrlen; ++cpu) 2115 n += pcbinfo_arr[cpu].ipi_count; 2116 req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb); 2117 return 0; 2118 } 2119 2120 if (req->newptr != NULL) 2121 return EPERM; 2122 2123 marker = kmalloc(sizeof(struct inpcb), M_TEMP, M_WAITOK|M_ZERO); 2124 marker->inp_flags |= INP_PLACEMARKER; 2125 2126 /* 2127 * OK, now we're committed to doing something. Re-fetch ipi_count 2128 * after obtaining the generation count. 2129 */ 2130 error = 0; 2131 origcpu = mycpuid; 2132 for (cpu = 0; cpu < pcbinfo_arrlen && error == 0; ++cpu) { 2133 struct inpcbinfo *pcbinfo = &pcbinfo_arr[cpu]; 2134 struct inpcb *inp; 2135 struct xinpcb xi; 2136 int i; 2137 2138 lwkt_migratecpu(cpu); 2139 2140 GET_PCBINFO_TOKEN(pcbinfo); 2141 2142 n = pcbinfo->ipi_count; 2143 2144 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 2145 i = 0; 2146 while ((inp = LIST_NEXT(marker, inp_list)) != NULL && i < n) { 2147 LIST_REMOVE(marker, inp_list); 2148 LIST_INSERT_AFTER(inp, marker, inp_list); 2149 2150 if (inp->inp_flags & INP_PLACEMARKER) 2151 continue; 2152 if (prison_xinpcb(req->td, inp)) 2153 continue; 2154 2155 bzero(&xi, sizeof xi); 2156 xi.xi_len = sizeof xi; 2157 bcopy(inp, &xi.xi_inp, sizeof *inp); 2158 if (inp->inp_socket) 2159 sotoxsocket(inp->inp_socket, &xi.xi_socket); 2160 if ((error = SYSCTL_OUT(req, &xi, sizeof xi)) != 0) 2161 break; 2162 ++i; 2163 } 2164 LIST_REMOVE(marker, inp_list); 2165 2166 REL_PCBINFO_TOKEN(pcbinfo); 2167 2168 if (error == 0 && i < n) { 2169 bzero(&xi, sizeof xi); 2170 xi.xi_len = sizeof xi; 2171 while (i < n) { 2172 error = SYSCTL_OUT(req, &xi, sizeof xi); 2173 if (error) 2174 break; 2175 ++i; 2176 } 2177 } 2178 } 2179 2180 lwkt_migratecpu(origcpu); 2181 kfree(marker, M_TEMP); 2182 return error; 2183 } 2184 2185 int 2186 in_pcblist_ncpus(SYSCTL_HANDLER_ARGS) 2187 { 2188 2189 return (in_pcblist_range(oidp, arg1, netisr_ncpus, req)); 2190 } 2191 2192 void 
2193 in_savefaddr(struct socket *so, const struct sockaddr *faddr) 2194 { 2195 struct sockaddr_in *sin; 2196 2197 KASSERT(faddr->sa_family == AF_INET, 2198 ("not AF_INET faddr %d", faddr->sa_family)); 2199 2200 sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO); 2201 sin->sin_family = AF_INET; 2202 sin->sin_len = sizeof(*sin); 2203 sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port; 2204 sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr; 2205 2206 so->so_faddr = (struct sockaddr *)sin; 2207 } 2208 2209 void 2210 in_pcbportinfo_init(struct inpcbportinfo *portinfo, int hashsize, 2211 u_short offset) 2212 { 2213 memset(portinfo, 0, sizeof(*portinfo)); 2214 2215 portinfo->offset = offset; 2216 portinfo->porthashbase = phashinit(hashsize, M_PCB, 2217 &portinfo->porthashcnt); 2218 } 2219 2220 void 2221 in_pcbportrange(u_short *hi0, u_short *lo0, u_short ofs, u_short step) 2222 { 2223 int hi, lo; 2224 2225 if (step == 1) 2226 return; 2227 2228 hi = *hi0; 2229 lo = *lo0; 2230 2231 hi = rounddown(hi, step); 2232 hi += ofs; 2233 if (hi > (int)*hi0) 2234 hi -= step; 2235 2236 lo = roundup(lo, step); 2237 lo -= (step - ofs); 2238 if (lo < (int)*lo0) 2239 lo += step; 2240 2241 *hi0 = hi; 2242 *lo0 = lo; 2243 } 2244 2245 void 2246 in_pcbglobalinit(void) 2247 { 2248 int cpu; 2249 2250 in_pcbmarkers = kmalloc(netisr_ncpus * sizeof(struct inpcb), M_PCB, 2251 M_WAITOK | M_ZERO); 2252 in_pcbcontainer_markers = 2253 kmalloc(netisr_ncpus * sizeof(struct inpcontainer), M_PCB, 2254 M_WAITOK | M_ZERO); 2255 2256 for (cpu = 0; cpu < netisr_ncpus; ++cpu) { 2257 struct inpcontainer *ic = &in_pcbcontainer_markers[cpu]; 2258 struct inpcb *marker = &in_pcbmarkers[cpu]; 2259 2260 marker->inp_flags |= INP_PLACEMARKER; 2261 ic->ic_inp = marker; 2262 } 2263 } 2264 2265 struct inpcb * 2266 in_pcbmarker(void) 2267 { 2268 2269 ASSERT_NETISR_NCPUS(mycpuid); 2270 return &in_pcbmarkers[mycpuid]; 2271 } 2272 2273 struct inpcontainer * 2274 in_pcbcontainer_marker(void) 2275 { 
2276 2277 ASSERT_NETISR_NCPUS(mycpuid); 2278 return &in_pcbcontainer_markers[mycpuid]; 2279 } 2280 2281 void 2282 in_pcbresetroute(struct inpcb *inp) 2283 { 2284 struct route *ro = &inp->inp_route; 2285 2286 if (ro->ro_rt != NULL) 2287 RTFREE(ro->ro_rt); 2288 bzero(ro, sizeof(*ro)); 2289 } 2290