1 /* 2 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1991, 1993, 1995 36 * The Regents of the University of California. All rights reserved. 37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. Neither the name of the University nor the names of its contributors 47 * may be used to endorse or promote products derived from this software 48 * without specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 * 62 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 63 * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $ 64 */ 65 66 #include "opt_inet6.h" 67 68 #include <sys/param.h> 69 #include <sys/systm.h> 70 #include <sys/malloc.h> 71 #include <sys/mbuf.h> 72 #include <sys/domain.h> 73 #include <sys/protosw.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/proc.h> 77 #include <sys/priv.h> 78 #include <sys/jail.h> 79 #include <sys/kernel.h> 80 #include <sys/sysctl.h> 81 82 #include <sys/socketvar2.h> 83 #include <sys/msgport2.h> 84 85 #include <machine/limits.h> 86 87 #include <net/if.h> 88 #include <net/if_types.h> 89 #include <net/route.h> 90 #include <net/netisr2.h> 91 #include <net/toeplitz2.h> 92 93 #include <netinet/in.h> 94 #include <netinet/in_pcb.h> 95 #include <netinet/in_var.h> 96 #include <netinet/ip_var.h> 97 #ifdef INET6 98 #include <netinet/ip6.h> 99 #include <netinet6/ip6_var.h> 100 #endif /* INET6 */ 101 102 #define INP_LOCALGROUP_SIZMIN 8 103 #define INP_LOCALGROUP_SIZMAX 256 104 105 static struct inpcb *in_pcblookup_local(struct inpcbporthead *porthash, 106 struct in_addr laddr, u_int lport_arg, int wild_okay, 107 struct ucred *cred); 108 109 struct in_addr zeroin_addr; 110 111 /* 112 * These configure the range of local port addresses assigned to 113 * "unspecified" outgoing connections/packets/whatever. 114 */ 115 int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ 116 int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ 117 118 int ipport_firstauto = IPPORT_RESERVED; /* 1024 */ 119 int ipport_lastauto = IPPORT_USERRESERVED; /* 5000 */ 120 121 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ 122 int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ 123 124 #define RANGECHK(var, min, max) \ 125 if ((var) < (min)) { (var) = (min); } \ 126 else if ((var) > (max)) { (var) = (max); } 127 128 int udpencap_enable = 1; /* enabled by default */ 129 int udpencap_port = 4500; /* triggers decapsulation */ 130 131 /* 132 * Per-netisr inpcb markers. 133 * NOTE: they should only be used in netisrs. 134 */ 135 static struct inpcb *in_pcbmarkers; 136 static struct inpcontainer *in_pcbcontainer_markers; 137 138 static int 139 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 140 { 141 int error; 142 143 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); 144 if (!error) { 145 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 146 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 147 148 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX); 149 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX); 150 151 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX); 152 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX); 153 } 154 return (error); 155 } 156 157 #undef RANGECHK 158 159 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); 160 161 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, 162 &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); 163 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, 164 &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); 165 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, 166 &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); 167 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, 168 &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); 169 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, 170 &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); 171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, 172 &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); 173 174 /* Initialized by ip_init() */ 175 int ip_porthash_trycount; 176 SYSCTL_INT(_net_inet_ip, OID_AUTO, porthash_trycount, CTLFLAG_RW, 177 &ip_porthash_trycount, 0, 178 "Number of tries to find local port matching hash of 4-tuple"); 179 180 /* 181 * in_pcb.c: manage the Protocol Control Blocks. 182 * 183 * NOTE: It is assumed that most of these functions will be called from 184 * a critical section. XXX - There are, unfortunately, a few exceptions 185 * to this rule that should be fixed. 186 * 187 * NOTE: The caller should initialize the cpu field to the cpu running the 188 * protocol stack associated with this inpcbinfo. 189 */ 190 191 void 192 in_pcbinfo_init(struct inpcbinfo *pcbinfo, int cpu, boolean_t shared) 193 { 194 KASSERT(cpu >= 0 && cpu < netisr_ncpus, ("invalid cpu%d", cpu)); 195 pcbinfo->cpu = cpu; 196 197 LIST_INIT(&pcbinfo->pcblisthead); 198 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB, 199 M_WAITOK | M_ZERO); 200 201 if (shared) { 202 pcbinfo->infotoken = kmalloc(sizeof(struct lwkt_token), 203 M_PCB, M_WAITOK); 204 lwkt_token_init(pcbinfo->infotoken, "infotoken"); 205 } else { 206 pcbinfo->infotoken = NULL; 207 } 208 } 209 210 void 211 in_pcbportinfo_set(struct inpcbinfo *pcbinfo, struct inpcbportinfo *portinfo, 212 int portinfo_cnt) 213 { 214 215 KASSERT(portinfo_cnt > 0, ("invalid portinfo_cnt %d", portinfo_cnt)); 216 pcbinfo->portinfo = portinfo; 217 pcbinfo->portinfo_cnt = portinfo_cnt; 218 } 219 220 struct baddynamicports baddynamicports; 221 222 /* 223 * Check if the specified port is invalid for dynamic allocation. 224 */ 225 int 226 in_baddynamic(u_int16_t port, u_int16_t proto) 227 { 228 switch (proto) { 229 case IPPROTO_TCP: 230 return (DP_ISSET(baddynamicports.tcp, port)); 231 case IPPROTO_UDP: 232 return (DP_ISSET(baddynamicports.udp, port)); 233 default: 234 return (0); 235 } 236 } 237 238 void 239 in_pcbonlist(struct inpcb *inp) 240 { 241 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 242 243 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 244 ("not in the correct netisr")); 245 KASSERT((inp->inp_flags & INP_ONLIST) == 0, ("already on pcblist")); 246 inp->inp_flags |= INP_ONLIST; 247 248 GET_PCBINFO_TOKEN(pcbinfo); 249 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list); 250 pcbinfo->ipi_count++; 251 REL_PCBINFO_TOKEN(pcbinfo); 252 } 253 254 void 255 in_pcbofflist(struct inpcb *inp) 256 { 257 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 258 259 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 260 ("not in the correct netisr")); 261 KASSERT(inp->inp_flags & INP_ONLIST, ("not on pcblist")); 262 inp->inp_flags &= ~INP_ONLIST; 263 264 GET_PCBINFO_TOKEN(pcbinfo); 265 LIST_REMOVE(inp, inp_list); 266 KASSERT(pcbinfo->ipi_count > 0, 267 ("invalid inpcb count %d", pcbinfo->ipi_count)); 268 pcbinfo->ipi_count--; 269 REL_PCBINFO_TOKEN(pcbinfo); 270 } 271 272 /* 273 * Allocate a PCB and associate it with the socket. 274 */ 275 int 276 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 277 { 278 struct inpcb *inp; 279 280 inp = kmalloc(pcbinfo->ipi_size, M_PCB, M_WAITOK|M_ZERO|M_NULLOK); 281 if (inp == NULL) 282 return (ENOMEM); 283 inp->inp_lgrpindex = -1; 284 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 285 inp->inp_pcbinfo = pcbinfo; 286 inp->inp_socket = so; 287 #ifdef INET6 288 if (INP_CHECK_SOCKAF(so, AF_INET6)) { 289 if (ip6_auto_flowlabel) 290 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 291 inp->inp_af = AF_INET6; 292 } else 293 #endif 294 inp->inp_af = AF_INET; 295 soreference(so); 296 so->so_pcb = inp; 297 298 in_pcbonlist(inp); 299 return (0); 300 } 301 302 /* 303 * Unlink a pcb with the intention of moving it to another cpu with a 304 * different pcbinfo. While unlinked nothing should attempt to dereference 305 * inp_pcbinfo, NULL it out so we assert if it does. 306 */ 307 void 308 in_pcbunlink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags) 309 { 310 KASSERT(inp->inp_pcbinfo == pcbinfo, ("pcbinfo mismatch")); 311 KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0, 312 ("already linked")); 313 314 in_pcbofflist(inp); 315 inp->inp_pcbinfo = NULL; 316 } 317 318 void 319 in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo) 320 { 321 in_pcbunlink_flags(inp, pcbinfo, INP_WILDCARD); 322 } 323 324 /* 325 * Relink a pcb into a new pcbinfo. 326 */ 327 void 328 in_pcblink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags) 329 { 330 KASSERT(inp->inp_pcbinfo == NULL, ("has pcbinfo")); 331 KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0, 332 ("already linked")); 333 334 inp->inp_pcbinfo = pcbinfo; 335 in_pcbonlist(inp); 336 } 337 338 void 339 in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo) 340 { 341 return in_pcblink_flags(inp, pcbinfo, INP_WILDCARD); 342 } 343 344 static boolean_t 345 in_pcbporthash_update(struct inpcbportinfo *portinfo, 346 struct inpcb *inp, u_short lport, struct ucred *cred, int wild) 347 { 348 struct inpcbporthead *porthash; 349 350 /* 351 * This has to be atomic. If the porthash is shared across multiple 352 * protocol threads, e.g. tcp and udp, then the token must be held. 353 */ 354 porthash = in_pcbporthash_head(portinfo, lport); 355 GET_PORTHASH_TOKEN(porthash); 356 357 if (in_pcblookup_local(porthash, inp->inp_laddr, lport, wild, cred)) { 358 REL_PORTHASH_TOKEN(porthash); 359 return FALSE; 360 } 361 inp->inp_lport = lport; 362 in_pcbinsporthash(porthash, inp); 363 364 REL_PORTHASH_TOKEN(porthash); 365 return TRUE; 366 } 367 368 static int 369 in_pcbsetlport(struct inpcb *inp, int wild, struct ucred *cred) 370 { 371 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 372 struct inpcbportinfo *portinfo; 373 u_short first, last, lport, step, first0, last0; 374 int count, error; 375 int portinfo_first, portinfo_idx; 376 uint32_t cut; 377 378 inp->inp_flags |= INP_ANONPORT; 379 380 step = pcbinfo->portinfo_cnt; 381 portinfo_first = mycpuid % pcbinfo->portinfo_cnt; 382 portinfo_idx = portinfo_first; 383 384 if (inp->inp_flags & INP_HIGHPORT) { 385 first0 = ipport_hifirstauto; /* sysctl */ 386 last0 = ipport_hilastauto; 387 } else if (inp->inp_flags & INP_LOWPORT) { 388 if (cred && 389 (error = 390 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) { 391 inp->inp_laddr.s_addr = INADDR_ANY; 392 return error; 393 } 394 first0 = ipport_lowfirstauto; /* 1023 */ 395 last0 = ipport_lowlastauto; /* 600 */ 396 } else { 397 first0 = ipport_firstauto; /* sysctl */ 398 last0 = ipport_lastauto; 399 } 400 if (first0 > last0) { 401 lport = last0; 402 last0 = first0; 403 first0 = lport; 404 } 405 KKASSERT(last0 >= first0); 406 407 cut = karc4random(); 408 loop: 409 portinfo = &pcbinfo->portinfo[portinfo_idx]; 410 first = first0; 411 last = last0; 412 413 /* 414 * Simple check to ensure all ports are not used up causing 415 * a deadlock here. 416 */ 417 in_pcbportrange(&last, &first, portinfo->offset, step); 418 lport = last - first; 419 count = lport / step; 420 421 lport = rounddown(cut % lport, step) + first; 422 KKASSERT(lport % step == portinfo->offset); 423 424 for (;;) { 425 if (count-- < 0) { /* completely used? */ 426 error = EADDRNOTAVAIL; 427 break; 428 } 429 430 if (__predict_false(lport < first || lport > last)) { 431 lport = first; 432 KKASSERT(lport % step == portinfo->offset); 433 } 434 435 if (in_pcbporthash_update(portinfo, inp, htons(lport), 436 cred, wild)) { 437 error = 0; 438 break; 439 } 440 441 lport += step; 442 KKASSERT(lport % step == portinfo->offset); 443 } 444 445 if (error) { 446 /* Try next portinfo */ 447 portinfo_idx++; 448 portinfo_idx %= pcbinfo->portinfo_cnt; 449 if (portinfo_idx != portinfo_first) 450 goto loop; 451 inp->inp_laddr.s_addr = INADDR_ANY; 452 } 453 return error; 454 } 455 456 int 457 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td) 458 { 459 struct socket *so = inp->inp_socket; 460 struct sockaddr_in jsin; 461 struct ucred *cred = NULL; 462 int wild = 0; 463 464 if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */ 465 return (EADDRNOTAVAIL); 466 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 467 return (EINVAL); /* already bound */ 468 469 if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT))) 470 wild = 1; /* neither SO_REUSEADDR nor SO_REUSEPORT is set */ 471 if (td->td_proc) 472 cred = td->td_proc->p_ucred; 473 474 if (nam != NULL) { 475 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 476 struct inpcbinfo *pcbinfo; 477 struct inpcbportinfo *portinfo; 478 struct inpcbporthead *porthash; 479 struct inpcb *t; 480 u_short lport, lport_ho; 481 int reuseport = (so->so_options & SO_REUSEPORT); 482 int error; 483 484 if (nam->sa_len != sizeof *sin) 485 return (EINVAL); 486 #ifdef notdef 487 /* 488 * We should check the family, but old programs 489 * incorrectly fail to initialize it. 490 */ 491 if (sin->sin_family != AF_INET) 492 return (EAFNOSUPPORT); 493 #endif 494 if (!prison_replace_wildcards(td, nam)) 495 return (EINVAL); 496 497 lport = sin->sin_port; 498 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 499 /* 500 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 501 * allow complete duplication of binding if 502 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 503 * and a multicast address is bound on both 504 * new and duplicated sockets. 505 */ 506 if (so->so_options & SO_REUSEADDR) 507 reuseport = SO_REUSEADDR | SO_REUSEPORT; 508 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 509 sin->sin_port = 0; /* yech... */ 510 bzero(&sin->sin_zero, sizeof sin->sin_zero); 511 if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL) 512 return (EADDRNOTAVAIL); 513 } 514 515 inp->inp_laddr = sin->sin_addr; 516 517 jsin.sin_family = AF_INET; 518 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr; 519 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) { 520 inp->inp_laddr.s_addr = INADDR_ANY; 521 return (EINVAL); 522 } 523 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr; 524 525 if (lport == 0) { 526 /* Auto-select local port */ 527 return in_pcbsetlport(inp, wild, cred); 528 } 529 lport_ho = ntohs(lport); 530 531 /* GROSS */ 532 if (lport_ho < IPPORT_RESERVED && cred && 533 (error = 534 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) { 535 inp->inp_laddr.s_addr = INADDR_ANY; 536 return (error); 537 } 538 539 /* 540 * Locate the proper portinfo based on lport 541 */ 542 pcbinfo = inp->inp_pcbinfo; 543 portinfo = 544 &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt]; 545 KKASSERT((lport_ho % pcbinfo->portinfo_cnt) == 546 portinfo->offset); 547 548 /* 549 * This has to be atomic. If the porthash is shared across 550 * multiple protocol threads, e.g. tcp and udp then the token 551 * must be held. 552 */ 553 porthash = in_pcbporthash_head(portinfo, lport); 554 GET_PORTHASH_TOKEN(porthash); 555 556 if (so->so_cred->cr_uid != 0 && 557 !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 558 t = in_pcblookup_local(porthash, sin->sin_addr, lport, 559 INPLOOKUP_WILDCARD, cred); 560 if (t && 561 (so->so_cred->cr_uid != 562 t->inp_socket->so_cred->cr_uid)) { 563 inp->inp_laddr.s_addr = INADDR_ANY; 564 error = EADDRINUSE; 565 goto done; 566 } 567 } 568 if (cred && !prison_replace_wildcards(td, nam)) { 569 inp->inp_laddr.s_addr = INADDR_ANY; 570 error = EADDRNOTAVAIL; 571 goto done; 572 } 573 574 /* 575 * When binding to a local port if the best match is against 576 * an accepted socket we generally want to allow the binding. 577 * This means that there is no longer any specific socket 578 * bound or bound for listening. 579 */ 580 t = in_pcblookup_local(porthash, sin->sin_addr, lport, 581 wild, cred); 582 if (t && 583 (reuseport & t->inp_socket->so_options) == 0 && 584 (t->inp_socket->so_state & SS_ACCEPTMECH) == 0) { 585 inp->inp_laddr.s_addr = INADDR_ANY; 586 error = EADDRINUSE; 587 goto done; 588 } 589 inp->inp_lport = lport; 590 in_pcbinsporthash(porthash, inp); 591 error = 0; 592 done: 593 REL_PORTHASH_TOKEN(porthash); 594 return (error); 595 } else { 596 jsin.sin_family = AF_INET; 597 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr; 598 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) { 599 inp->inp_laddr.s_addr = INADDR_ANY; 600 return (EINVAL); 601 } 602 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr; 603 604 return in_pcbsetlport(inp, wild, cred); 605 } 606 } 607 608 /* 609 * Lookup a PCB based on the local and remote address and port. 610 * 611 * This function is only used when scanning for a free port. 612 */ 613 static struct inpcb * 614 in_pcblookup_localremote(struct inpcbporthead *porthash, struct in_addr laddr, 615 u_short lport, struct in_addr faddr, u_short fport, 616 struct ucred *cred) 617 { 618 struct inpcb *inp; 619 struct inpcbport *phd; 620 struct inpcb *match = NULL; 621 struct prison *pscan; 622 struct prison *pr; 623 624 /* 625 * If the porthashbase is shared across several cpus, it must 626 * have been locked. 627 */ 628 ASSERT_PORTHASH_TOKEN_HELD(porthash); 629 630 /* 631 * Best fit PCB lookup. 632 * 633 * First see if this local port is in use by looking on the 634 * port hash list. 635 */ 636 LIST_FOREACH(phd, porthash, phd_hash) { 637 if (phd->phd_port == lport) 638 break; 639 } 640 if (phd != NULL) { 641 pr = cred ? cred->cr_prison : NULL; 642 643 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 644 #ifdef INET6 645 if (!INP_ISIPV4(inp)) 646 continue; 647 #endif 648 if (inp->inp_laddr.s_addr == INADDR_ANY) { 649 if (inp->inp_socket && inp->inp_socket->so_cred) 650 pscan = inp->inp_socket->so_cred->cr_prison; 651 else 652 pscan = NULL; 653 if (pr != pscan) 654 continue; 655 } else { 656 if (inp->inp_laddr.s_addr != laddr.s_addr) 657 continue; 658 } 659 660 if (inp->inp_faddr.s_addr != INADDR_ANY && 661 inp->inp_faddr.s_addr != faddr.s_addr) 662 continue; 663 664 if (inp->inp_fport != 0 && inp->inp_fport != fport) 665 continue; 666 667 match = inp; 668 break; 669 } 670 } 671 return (match); 672 } 673 674 static boolean_t 675 in_pcbporthash_update4(struct inpcbportinfo *portinfo, struct inpcb *inp, 676 u_short lport, const struct sockaddr_in *sin, 677 struct ucred *cred) 678 { 679 struct inpcbporthead *porthash; 680 681 /* 682 * This has to be atomic. If the porthash is shared across multiple 683 * protocol threads, e.g. tcp and udp, then the token must be held. 684 */ 685 porthash = in_pcbporthash_head(portinfo, lport); 686 GET_PORTHASH_TOKEN(porthash); 687 688 if (in_pcblookup_localremote(porthash, inp->inp_laddr, lport, 689 sin->sin_addr, sin->sin_port, cred)) { 690 REL_PORTHASH_TOKEN(porthash); 691 return FALSE; 692 } 693 inp->inp_lport = lport; 694 in_pcbinsporthash(porthash, inp); 695 696 REL_PORTHASH_TOKEN(porthash); 697 return TRUE; 698 } 699 700 int 701 in_pcbbind_remote(struct inpcb *inp, const struct sockaddr *remote, 702 struct thread *td) 703 { 704 struct proc *p = td->td_proc; 705 const struct sockaddr_in *sin = (const struct sockaddr_in *)remote; 706 struct sockaddr_in jsin; 707 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 708 struct ucred *cred = NULL; 709 u_short first, last, lport; 710 int count, hash_count; 711 int error, selfconn = 0; 712 int cpuid = mycpuid; 713 uint32_t hash_base = 0, hash; 714 715 ASSERT_NETISR_NCPUS(cpuid); 716 717 if (TAILQ_EMPTY(&in_ifaddrheads[cpuid])) /* XXX broken! */ 718 return (EADDRNOTAVAIL); 719 720 KKASSERT(inp->inp_laddr.s_addr != INADDR_ANY); 721 if (inp->inp_lport != 0) 722 return (EINVAL); /* already bound */ 723 724 KKASSERT(p); 725 cred = p->p_ucred; 726 727 jsin.sin_family = AF_INET; 728 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr; 729 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) { 730 inp->inp_laddr.s_addr = INADDR_ANY; 731 return (EINVAL); 732 } 733 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr; 734 735 hash_count = ip_porthash_trycount; 736 if (hash_count > 0) { 737 hash_base = toeplitz_piecemeal_addr(sin->sin_addr.s_addr) ^ 738 toeplitz_piecemeal_addr(inp->inp_laddr.s_addr) ^ 739 toeplitz_piecemeal_port(sin->sin_port); 740 } else { 741 hash_count = 0; 742 } 743 744 inp->inp_flags |= INP_ANONPORT; 745 746 if (inp->inp_flags & INP_HIGHPORT) { 747 first = ipport_hifirstauto; /* sysctl */ 748 last = ipport_hilastauto; 749 } else if (inp->inp_flags & INP_LOWPORT) { 750 if (cred && 751 (error = 752 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) { 753 inp->inp_laddr.s_addr = INADDR_ANY; 754 return (error); 755 } 756 first = ipport_lowfirstauto; /* 1023 */ 757 last = ipport_lowlastauto; /* 600 */ 758 } else { 759 first = ipport_firstauto; /* sysctl */ 760 last = ipport_lastauto; 761 } 762 if (first > last) { 763 lport = last; 764 last = first; 765 first = lport; 766 } 767 KKASSERT(last >= first); 768 769 count = last - first; 770 lport = (karc4random() % count) + first; 771 count += hash_count; 772 773 /* 774 * Simple check to ensure all ports are not used up causing 775 * a deadlock here. 776 */ 777 for (;;) { 778 u_short lport_no; 779 780 if (count-- < 0) { /* completely used? */ 781 error = EADDRNOTAVAIL; 782 break; 783 } 784 785 if (__predict_false(lport < first || lport > last)) 786 lport = first; 787 lport_no = htons(lport); 788 789 /* This could happen on loopback interface */ 790 if (__predict_false(sin->sin_port == lport_no && 791 sin->sin_addr.s_addr == inp->inp_laddr.s_addr)) { 792 if (!selfconn) { 793 ++count; /* don't count this try */ 794 selfconn = 1; 795 } 796 goto next; 797 } 798 799 if (hash_count) { 800 --hash_count; 801 hash = hash_base ^ 802 toeplitz_piecemeal_port(lport_no); 803 if (netisr_hashcpu(hash) != cpuid && hash_count) 804 goto next; 805 } 806 807 if (in_pcbporthash_update4( 808 &pcbinfo->portinfo[lport % pcbinfo->portinfo_cnt], 809 inp, lport_no, sin, cred)) { 810 error = 0; 811 break; 812 } 813 next: 814 ++lport; 815 } 816 817 if (error) 818 inp->inp_laddr.s_addr = INADDR_ANY; 819 return (error); 820 } 821 822 /* 823 * Figure out the local interface address to pair against the requested 824 * target address, as well as validate the target address. 825 */ 826 int 827 in_pcbladdr_find(struct inpcb *inp, struct sockaddr *nam, 828 struct sockaddr_in **plocal_sin, struct thread *td, int find) 829 { 830 struct in_ifaddr_container *iac; 831 struct in_ifaddr *ia; 832 struct ucred *cred = NULL; 833 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 834 struct sockaddr *jsin; 835 struct prison *pr; 836 struct route *ro; 837 int alloc_route = 0; 838 839 if (nam->sa_len != sizeof *sin) 840 return (EINVAL); 841 if (sin->sin_family != AF_INET) 842 return (EAFNOSUPPORT); 843 if (sin->sin_port == 0) 844 return (EADDRNOTAVAIL); 845 846 /* 847 * Are we in a jail? 848 */ 849 pr = NULL; 850 if (td && td->td_proc && td->td_proc->p_ucred) 851 cred = td->td_proc->p_ucred; 852 if (cred) 853 pr = cred->cr_prison; 854 855 /* 856 * If the destination address is INADDR_ANY then use the primary 857 * local address. 858 * 859 * If the supplied address is INADDR_BROADCAST, and the primary 860 * interface supports broadcast, choose the broadcast address for 861 * that interface. 862 * 863 * If jailed, locate an interface address acceptable to the jail. 864 */ 865 if (sin->sin_addr.s_addr == INADDR_ANY) { 866 TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) { 867 ia = iac->ia; 868 if (pr == NULL || 869 jailed_ip(pr, sintosa(&ia->ia_addr))) { 870 sin->sin_addr = IA_SIN(ia)->sin_addr; 871 break; 872 } 873 } 874 } else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST) { 875 TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) { 876 ia = iac->ia; 877 if ((pr == NULL || 878 jailed_ip(pr, sintosa(&ia->ia_addr))) && 879 (iac->ia->ia_ifp->if_flags & IFF_BROADCAST)) { 880 sin->sin_addr = 881 satosin(&ia->ia_broadaddr)->sin_addr; 882 break; 883 } 884 } 885 } 886 887 /* 888 * If asked to do a search, use the cached route or do a route table 889 * lookup to try to find an acceptable local interface IP. 890 */ 891 if (find == 0) 892 return 0; 893 894 ia = NULL; 895 896 /* 897 * If we have a cached route, check to see if it is acceptable. 898 * If not, free it. 899 */ 900 ro = &inp->inp_route; 901 if (ro->ro_rt && 902 (!(ro->ro_rt->rt_flags & RTF_UP) || 903 ro->ro_dst.sa_family != AF_INET || 904 satosin(&ro->ro_dst)->sin_addr.s_addr != 905 sin->sin_addr.s_addr || 906 inp->inp_socket->so_options & SO_DONTROUTE)) { 907 RTFREE(ro->ro_rt); 908 ro->ro_rt = NULL; 909 } 910 911 /* 912 * If we do not have a route, construct one and do a lookup, 913 * unless we are forbidden to do so. 914 * 915 * Note that we should check the address family of the cached 916 * destination, in case of sharing the cache with IPv6. 917 */ 918 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/ 919 (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) { 920 bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); 921 ro->ro_dst.sa_family = AF_INET; 922 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 923 ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sin->sin_addr; 924 rtalloc(ro); 925 alloc_route = 1; 926 } 927 928 /* 929 * If we found a route, use the address corresponding to the 930 * outgoing interface. 931 * 932 * If jailed, try to find a compatible address on the outgoing 933 * interface. 934 */ 935 if (ro->ro_rt) { 936 ia = ifatoia(ro->ro_rt->rt_ifa); 937 if (pr == NULL) 938 goto skip; 939 if (jailed_ip(pr, sintosa(&ia->ia_addr))) 940 goto skip; 941 TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) { 942 if (iac->ia->ia_ifp != ia->ia_ifp) 943 continue; 944 ia = iac->ia; 945 if (jailed_ip(pr, sintosa(&ia->ia_addr))) 946 goto skip; 947 } 948 ia = NULL; 949 } 950 skip: 951 952 /* 953 * If the route didn't work or there was no route, 954 * fall-back to the first address in in_ifaddrheads[]. 955 * 956 * If jailed and this address is not available for 957 * the jail, leave ia set to NULL. 958 */ 959 if (ia == NULL) { 960 u_short fport = sin->sin_port; 961 962 sin->sin_port = 0; 963 ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin))); 964 if (ia && pr && !jailed_ip(pr, sintosa(&ia->ia_addr))) 965 ia = NULL; 966 967 if (ia == NULL) 968 ia = ifatoia(ifa_ifwithnet(sintosa(sin))); 969 if (ia && pr && !jailed_ip(pr, sintosa(&ia->ia_addr))) 970 ia = NULL; 971 972 sin->sin_port = fport; 973 if (ia == NULL && !TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) 974 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia; 975 976 if (ia && pr && !jailed_ip(pr, sintosa(&ia->ia_addr))) 977 ia = NULL; 978 979 if (pr == NULL && ia == NULL) 980 goto fail; 981 } 982 983 /* 984 * If the destination address is multicast and an outgoing 985 * interface has been set as a multicast option, use the 986 * address of that interface as our source address. 987 */ 988 if (pr == NULL && IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && 989 inp->inp_moptions != NULL) { 990 struct ip_moptions *imo; 991 struct ifnet *ifp; 992 993 imo = inp->inp_moptions; 994 if ((ifp = imo->imo_multicast_ifp) != NULL) { 995 struct in_ifaddr_container *iac; 996 997 ia = NULL; 998 TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) { 999 if (iac->ia->ia_ifp == ifp) { 1000 ia = iac->ia; 1001 break; 1002 } 1003 } 1004 if (ia == NULL) 1005 goto fail; 1006 } 1007 } 1008 1009 /* 1010 * If we still don't have a local address, and are jailed, 1011 * use the jail's first non-localhost IP. If there isn't 1012 * one, use the jail's first localhost IP. 1013 * 1014 * Don't do pcblookup call here; return interface in plocal_sin 1015 * and exit to caller, that will do the lookup. 1016 */ 1017 if (ia == NULL && pr) { 1018 jsin = prison_get_nonlocal(cred->cr_prison, AF_INET, NULL); 1019 if (jsin == NULL) 1020 jsin = prison_get_local(cred->cr_prison, AF_INET, NULL); 1021 if (jsin) 1022 *plocal_sin = satosin(jsin); 1023 else 1024 goto fail; 1025 } else if (ia) { 1026 *plocal_sin = &ia->ia_addr; 1027 } else { 1028 goto fail; 1029 } 1030 return (0); 1031 fail: 1032 if (alloc_route) 1033 in_pcbresetroute(inp); 1034 return (EADDRNOTAVAIL); 1035 } 1036 1037 int 1038 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, 1039 struct sockaddr_in **plocal_sin, struct thread *td) 1040 { 1041 return in_pcbladdr_find(inp, nam, plocal_sin, td, 1042 (inp->inp_laddr.s_addr == INADDR_ANY)); 1043 } 1044 1045 /* 1046 * Outer subroutine: 1047 * Connect from a socket to a specified address. 1048 * Both address and port must be specified in argument sin. 1049 * If don't have a local address for this socket yet, 1050 * then pick one. 1051 */ 1052 int 1053 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td) 1054 { 1055 struct sockaddr_in *if_sin; 1056 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1057 int error; 1058 1059 if_sin = NULL; /* avoid gcc warnings */ 1060 1061 /* Call inner routine to assign local interface address. */ 1062 if ((error = in_pcbladdr(inp, nam, &if_sin, td)) != 0) 1063 return (error); 1064 1065 if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, 1066 inp->inp_laddr.s_addr ? 1067 inp->inp_laddr : if_sin->sin_addr, 1068 inp->inp_lport, FALSE, NULL) != NULL) { 1069 return (EADDRINUSE); 1070 } 1071 if (inp->inp_laddr.s_addr == INADDR_ANY) { 1072 if (inp->inp_lport == 0) { 1073 error = in_pcbbind(inp, NULL, td); 1074 if (error) 1075 return (error); 1076 } 1077 inp->inp_laddr = if_sin->sin_addr; 1078 } 1079 inp->inp_faddr = sin->sin_addr; 1080 inp->inp_fport = sin->sin_port; 1081 in_pcbinsconnhash(inp); 1082 return (0); 1083 } 1084 1085 void 1086 in_pcbdisconnect(struct inpcb *inp) 1087 { 1088 1089 in_pcbremconnhash(inp); 1090 inp->inp_faddr.s_addr = INADDR_ANY; 1091 inp->inp_fport = 0; 1092 } 1093 1094 void 1095 in_pcbdetach(struct inpcb *inp) 1096 { 1097 struct socket *so = inp->inp_socket; 1098 struct inpcbinfo *ipi = inp->inp_pcbinfo; 1099 1100 inp->inp_gencnt = ++ipi->ipi_gencnt; 1101 KKASSERT((so->so_state & SS_ASSERTINPROG) == 0); 1102 in_pcbremlists(inp); 1103 so->so_pcb = NULL; 1104 sofree(so); /* remove pcb ref */ 1105 if (inp->inp_options) 1106 m_free(inp->inp_options); 1107 if (inp->inp_route.ro_rt) 1108 rtfree(inp->inp_route.ro_rt); 1109 ip_freemoptions(inp->inp_moptions); 1110 kfree(inp, M_PCB); 1111 } 1112 1113 /* 1114 * The socket may have an invalid PCB, i.e. NULL. For example, a TCP 1115 * socket received RST. 1116 */ 1117 static int 1118 in_setsockaddr(struct socket *so, struct sockaddr **nam) 1119 { 1120 struct inpcb *inp; 1121 struct sockaddr_in *sin; 1122 1123 KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr")); 1124 inp = so->so_pcb; 1125 if (!inp) 1126 return (ECONNRESET); 1127 1128 sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); 1129 sin->sin_family = AF_INET; 1130 sin->sin_len = sizeof *sin; 1131 sin->sin_port = inp->inp_lport; 1132 sin->sin_addr = inp->inp_laddr; 1133 1134 *nam = (struct sockaddr *)sin; 1135 return (0); 1136 } 1137 1138 void 1139 in_setsockaddr_dispatch(netmsg_t msg) 1140 { 1141 int error; 1142 1143 error = in_setsockaddr(msg->base.nm_so, msg->peeraddr.nm_nam); 1144 lwkt_replymsg(&msg->lmsg, error); 1145 } 1146 1147 /* 1148 * The socket may have an invalid PCB, i.e. NULL. For example, a TCP 1149 * socket received RST. 1150 */ 1151 int 1152 in_setpeeraddr(struct socket *so, struct sockaddr **nam) 1153 { 1154 struct inpcb *inp; 1155 struct sockaddr_in *sin; 1156 1157 KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr")); 1158 inp = so->so_pcb; 1159 if (!inp) 1160 return (ECONNRESET); 1161 1162 sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); 1163 sin->sin_family = AF_INET; 1164 sin->sin_len = sizeof *sin; 1165 sin->sin_port = inp->inp_fport; 1166 sin->sin_addr = inp->inp_faddr; 1167 1168 *nam = (struct sockaddr *)sin; 1169 return (0); 1170 } 1171 1172 void 1173 in_setpeeraddr_dispatch(netmsg_t msg) 1174 { 1175 int error; 1176 1177 error = in_setpeeraddr(msg->base.nm_so, msg->peeraddr.nm_nam); 1178 lwkt_replymsg(&msg->lmsg, error); 1179 } 1180 1181 void 1182 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int err, 1183 inp_notify_t notify) 1184 { 1185 struct inpcb *inp, *marker; 1186 1187 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1188 ("not in the correct netisr")); 1189 marker = in_pcbmarker(); 1190 1191 /* 1192 * NOTE: 1193 * - If INP_PLACEMARKER is set we must ignore the rest of the 1194 * structure and skip it. 1195 * - It is safe to nuke inpcbs here, since we are in their own 1196 * netisr. 1197 */ 1198 GET_PCBINFO_TOKEN(pcbinfo); 1199 1200 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 1201 while ((inp = LIST_NEXT(marker, inp_list)) != NULL) { 1202 LIST_REMOVE(marker, inp_list); 1203 LIST_INSERT_AFTER(inp, marker, inp_list); 1204 1205 if (inp->inp_flags & INP_PLACEMARKER) 1206 continue; 1207 #ifdef INET6 1208 if (!INP_ISIPV4(inp)) 1209 continue; 1210 #endif 1211 if (inp->inp_faddr.s_addr != faddr.s_addr || 1212 inp->inp_socket == NULL) 1213 continue; 1214 (*notify)(inp, err); /* can remove inp from list! */ 1215 } 1216 LIST_REMOVE(marker, inp_list); 1217 1218 REL_PCBINFO_TOKEN(pcbinfo); 1219 } 1220 1221 void 1222 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 1223 { 1224 struct inpcb *inp, *marker; 1225 1226 /* 1227 * We only need to make sure that we are in netisr0, where all 1228 * multicast operation happen. We could check inpcbinfo which 1229 * does not belong to netisr0 by holding the inpcbinfo's token. 1230 * In this case, the pcbinfo must be able to be shared, i.e. 1231 * pcbinfo->infotoken is not NULL. 1232 */ 1233 ASSERT_NETISR0; 1234 KASSERT(pcbinfo->cpu == 0 || pcbinfo->infotoken != NULL, 1235 ("pcbinfo could not be shared")); 1236 1237 /* 1238 * Get a marker for the current netisr (netisr0). 1239 * 1240 * It is possible that the multicast address deletion blocks, 1241 * which could cause temporary token releasing. So we use 1242 * inpcb marker here to get a coherent view of the inpcb list. 1243 * 1244 * While, on the other hand, moptions are only added and deleted 1245 * in netisr0, so we would not see staled moption or miss moption 1246 * even if the token was released due to the blocking multicast 1247 * address deletion. 1248 */ 1249 marker = in_pcbmarker(); 1250 1251 GET_PCBINFO_TOKEN(pcbinfo); 1252 1253 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 1254 while ((inp = LIST_NEXT(marker, inp_list)) != NULL) { 1255 struct ip_moptions *imo; 1256 1257 LIST_REMOVE(marker, inp_list); 1258 LIST_INSERT_AFTER(inp, marker, inp_list); 1259 1260 if (inp->inp_flags & INP_PLACEMARKER) 1261 continue; 1262 imo = inp->inp_moptions; 1263 if (INP_ISIPV4(inp) && imo != NULL) { 1264 int i, gap; 1265 1266 /* 1267 * Unselect the outgoing interface if it is being 1268 * detached. 1269 */ 1270 if (imo->imo_multicast_ifp == ifp) 1271 imo->imo_multicast_ifp = NULL; 1272 1273 /* 1274 * Drop multicast group membership if we joined 1275 * through the interface being detached. 1276 */ 1277 for (i = 0, gap = 0; i < imo->imo_num_memberships; 1278 i++) { 1279 if (imo->imo_membership[i]->inm_ifp == ifp) { 1280 /* 1281 * NOTE: 1282 * This could block and the pcbinfo 1283 * token could be passively released. 1284 */ 1285 in_delmulti(imo->imo_membership[i]); 1286 gap++; 1287 } else if (gap != 0) 1288 imo->imo_membership[i - gap] = 1289 imo->imo_membership[i]; 1290 } 1291 imo->imo_num_memberships -= gap; 1292 } 1293 } 1294 LIST_REMOVE(marker, inp_list); 1295 1296 REL_PCBINFO_TOKEN(pcbinfo); 1297 } 1298 1299 /* 1300 * Check for alternatives when higher level complains 1301 * about service problems. For now, invalidate cached 1302 * routing information. If the route was created dynamically 1303 * (by a redirect), time to try a default gateway again. 1304 */ 1305 void 1306 in_losing(struct inpcb *inp) 1307 { 1308 struct rtentry *rt; 1309 struct rt_addrinfo rtinfo; 1310 1311 if ((rt = inp->inp_route.ro_rt)) { 1312 bzero(&rtinfo, sizeof(struct rt_addrinfo)); 1313 rtinfo.rti_info[RTAX_DST] = rt_key(rt); 1314 rtinfo.rti_info[RTAX_GATEWAY] = rt->rt_gateway; 1315 rtinfo.rti_info[RTAX_NETMASK] = rt_mask(rt); 1316 rtinfo.rti_flags = rt->rt_flags; 1317 rt_missmsg(RTM_LOSING, &rtinfo, rt->rt_flags, 0); 1318 if (rt->rt_flags & RTF_DYNAMIC) { 1319 rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, 1320 rt_mask(rt), rt->rt_flags, NULL); 1321 } 1322 inp->inp_route.ro_rt = NULL; 1323 rtfree(rt); 1324 /* 1325 * A new route can be allocated 1326 * the next time output is attempted. 1327 */ 1328 } 1329 } 1330 1331 /* 1332 * After a routing change, flush old routing 1333 * and allocate a (hopefully) better one. 1334 */ 1335 void 1336 in_rtchange(struct inpcb *inp, int err) 1337 { 1338 if (inp->inp_route.ro_rt) { 1339 rtfree(inp->inp_route.ro_rt); 1340 inp->inp_route.ro_rt = NULL; 1341 /* 1342 * A new route can be allocated the next time 1343 * output is attempted. 1344 */ 1345 } 1346 } 1347 1348 /* 1349 * Lookup a PCB based on the local address and port. 1350 * 1351 * This function is only used when scanning for a free port. 1352 */ 1353 static struct inpcb * 1354 in_pcblookup_local(struct inpcbporthead *porthash, struct in_addr laddr, 1355 u_int lport_arg, int wild_okay, struct ucred *cred) 1356 { 1357 struct prison *pscan; 1358 struct prison *pr; 1359 struct inpcb *inp; 1360 int matchwild = 3, wildcard; 1361 u_short lport = lport_arg; 1362 struct inpcbport *phd; 1363 struct inpcb *match = NULL; 1364 1365 /* 1366 * If the porthashbase is shared across several cpus, it must 1367 * have been locked. 1368 */ 1369 ASSERT_PORTHASH_TOKEN_HELD(porthash); 1370 1371 /* 1372 * Best fit PCB lookup. 1373 * 1374 * First see if this local port is in use by looking on the 1375 * port hash list. 1376 */ 1377 LIST_FOREACH(phd, porthash, phd_hash) { 1378 if (phd->phd_port == lport) 1379 break; 1380 } 1381 if (phd != NULL) { 1382 pr = cred ? cred->cr_prison : NULL; 1383 1384 /* 1385 * Port is in use by one or more PCBs. Look for best 1386 * fit. 1387 * 1388 * If in a prison we may wish to allow the jail to override 1389 * a wildcard listen on the host. Since the jail forces its 1390 * own wildcard listens to a specific set of jail IPs, this 1391 * override allows most services on the host to remain as 1392 * they were and still be 'jail friendly'. 1393 */ 1394 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 1395 wildcard = 0; 1396 #ifdef INET6 1397 if (!INP_ISIPV4(inp)) 1398 continue; 1399 #endif 1400 if (inp->inp_faddr.s_addr != INADDR_ANY) 1401 wildcard++; 1402 1403 /* 1404 * Prison are independent of each other in terms 1405 * of allowing bindings. This can result in multiple 1406 * overloaded bindings which in_pcblookup_pkthash() 1407 * will have to sort out. 1408 * 1409 * Allow wildcarded entries to co-exist with specific 1410 * entries. Specific entries override wildcarded 1411 * entries. 1412 */ 1413 if (inp->inp_socket && inp->inp_socket->so_cred) 1414 pscan = inp->inp_socket->so_cred->cr_prison; 1415 else 1416 pscan = NULL; 1417 if (pr != pscan) 1418 continue; 1419 if (inp->inp_laddr.s_addr == INADDR_ANY) { 1420 if (laddr.s_addr != INADDR_ANY) 1421 wildcard++; 1422 } else { 1423 if (laddr.s_addr == INADDR_ANY) 1424 wildcard++; 1425 else if (inp->inp_laddr.s_addr != laddr.s_addr) 1426 continue; 1427 } 1428 if (wildcard && !wild_okay) 1429 continue; 1430 if (wildcard < matchwild) { 1431 match = inp; 1432 matchwild = wildcard; 1433 if (matchwild == 0) 1434 break; 1435 } 1436 } 1437 } 1438 return (match); 1439 } 1440 1441 struct inpcb * 1442 in_pcblocalgroup_last(const struct inpcbinfo *pcbinfo, 1443 const struct inpcb *inp) 1444 { 1445 const struct inp_localgrphead *hdr; 1446 const struct inp_localgroup *grp; 1447 int i; 1448 1449 if (pcbinfo->localgrphashbase == NULL) 1450 return NULL; 1451 1452 GET_PCBINFO_TOKEN(pcbinfo); 1453 1454 hdr = &pcbinfo->localgrphashbase[ 1455 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 1456 1457 LIST_FOREACH(grp, hdr, il_list) { 1458 if (grp->il_af == inp->inp_af && 1459 grp->il_lport == inp->inp_lport && 1460 memcmp(&grp->il_dependladdr, 1461 &inp->inp_inc.inc_ie.ie_dependladdr, 1462 sizeof(grp->il_dependladdr)) == 0) { 1463 break; 1464 } 1465 } 1466 if (grp == NULL || grp->il_inpcnt == 1) { 1467 REL_PCBINFO_TOKEN(pcbinfo); 1468 return NULL; 1469 } 1470 1471 KASSERT(grp->il_inpcnt >= 2, 1472 ("invalid localgroup inp count %d", grp->il_inpcnt)); 1473 for (i = 0; i < grp->il_inpcnt; ++i) { 1474 if (grp->il_inp[i] == inp) { 1475 int last = grp->il_inpcnt - 1; 1476 1477 if (i == last) 1478 last = grp->il_inpcnt - 2; 1479 REL_PCBINFO_TOKEN(pcbinfo); 1480 return grp->il_inp[last]; 1481 } 1482 } 1483 REL_PCBINFO_TOKEN(pcbinfo); 1484 return NULL; 1485 } 1486 1487 static struct inpcb * 1488 inp_localgroup_lookup(const struct inpcbinfo *pcbinfo, 1489 struct in_addr laddr, uint16_t lport, uint32_t pkt_hash) 1490 { 1491 struct inpcb *local_wild; 1492 struct inpcb *jinp; 1493 struct inpcb *jinp_wild; 1494 struct inpcb *inp; 1495 const struct inp_localgrphead *hdr; 1496 const struct inp_localgroup *grp; 1497 struct sockaddr_in jsin; 1498 struct prison *pr; 1499 struct ucred *cred; 1500 int idx; 1501 int net_listen_ov_local; 1502 int net_listen_ov_wild; 1503 1504 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 1505 1506 hdr = &pcbinfo->localgrphashbase[ 1507 INP_PCBLOCALGRPHASH(lport, pcbinfo->localgrphashmask)]; 1508 1509 /* 1510 * Order of socket selection: 1511 * 1. non-wild. 1512 * 2. wild. 1513 * 1514 * NOTE: Local group does not contain jailed sockets 1515 */ 1516 jsin.sin_family = AF_INET; 1517 jsin.sin_addr.s_addr = laddr.s_addr; 1518 1519 jinp = NULL; 1520 jinp_wild = NULL; 1521 local_wild = NULL; 1522 net_listen_ov_local = 0; 1523 net_listen_ov_wild = 0; 1524 1525 LIST_FOREACH(grp, hdr, il_list) { 1526 #ifdef INET6 1527 if (grp->il_af != AF_INET) 1528 continue; 1529 #endif 1530 if (grp->il_lport != lport) 1531 continue; 1532 1533 /* 1534 * look for a match 1535 */ 1536 idx = netisr_hashlsb(pkt_hash) % grp->il_inpcnt; 1537 inp = grp->il_inp[idx]; 1538 1539 /* 1540 * Modulo-N is used here, which greatly reduces 1541 * completion queue token contention, thus more 1542 * cpu time is saved. 1543 */ 1544 if (grp->il_jailed) { 1545 if (inp->inp_socket == NULL) 1546 continue; 1547 cred = inp->inp_socket->so_cred; 1548 if (cred == NULL) 1549 continue; 1550 pr = cred->cr_prison; 1551 if (pr == NULL) 1552 continue; 1553 if (!jailed_ip(pr, (struct sockaddr *)&jsin)) 1554 continue; 1555 if (grp->il_laddr.s_addr == laddr.s_addr) { 1556 jinp = inp; 1557 if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_NET_LISTEN_OVERRIDE)) 1558 net_listen_ov_local = 1; 1559 1560 } else if (grp->il_laddr.s_addr == INADDR_ANY && 1561 jinp_wild == NULL) { 1562 jinp_wild = inp; 1563 if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_NET_LISTEN_OVERRIDE)) 1564 net_listen_ov_wild = 1; 1565 } 1566 } else { 1567 if (grp->il_laddr.s_addr == laddr.s_addr) { 1568 return inp; 1569 } else if (grp->il_laddr.s_addr == INADDR_ANY) { 1570 local_wild = inp; 1571 } 1572 } 1573 } 1574 1575 if (net_listen_ov_local) 1576 return jinp; 1577 if (net_listen_ov_wild) 1578 return jinp_wild; 1579 if (local_wild) 1580 return (local_wild); 1581 if (jinp) 1582 return (jinp); 1583 return (jinp_wild); 1584 } 1585 1586 /* 1587 * Lookup PCB in hash list. 1588 * 1589 * This is used to match incoming packets to a pcb 1590 */ 1591 struct inpcb * 1592 in_pcblookup_pkthash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 1593 u_int fport_arg, struct in_addr laddr, u_int lport_arg, 1594 boolean_t wildcard, struct ifnet *ifp, const struct mbuf *m) 1595 { 1596 struct inpcbhead *head; 1597 struct inpcb *inp, *jinp=NULL; 1598 u_short fport = fport_arg, lport = lport_arg; 1599 1600 /* 1601 * First look for an exact match. 1602 */ 1603 head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport, 1604 laddr.s_addr, lport, 1605 pcbinfo->hashmask)]; 1606 LIST_FOREACH(inp, head, inp_hash) { 1607 #ifdef INET6 1608 if (!INP_ISIPV4(inp)) 1609 continue; 1610 #endif 1611 if (in_hosteq(inp->inp_faddr, faddr) && 1612 in_hosteq(inp->inp_laddr, laddr) && 1613 inp->inp_fport == fport && inp->inp_lport == lport) { 1614 /* 1615 * Found specific address, host overrides jailed 1616 * inpcb. 1617 */ 1618 if (inp->inp_socket == NULL || 1619 inp->inp_socket->so_cred->cr_prison == NULL) { 1620 return (inp); 1621 } 1622 if (jinp == NULL) 1623 jinp = inp; 1624 } 1625 } 1626 if (jinp != NULL) 1627 return (jinp); 1628 1629 /* 1630 * We generally get here for connections to wildcarded listeners. 1631 * Any wildcarded listeners in jails must be restricted to the 1632 * jailed IPs only. 1633 */ 1634 if (wildcard) { 1635 struct inpcb *local_wild = NULL; 1636 struct inpcb *jinp_wild = NULL; 1637 struct inpcontainer *ic; 1638 struct inpcontainerhead *chead; 1639 struct sockaddr_in jsin; 1640 struct ucred *cred; 1641 struct prison *pr; 1642 int net_listen_ov_local = 0; 1643 int net_listen_ov_wild = 0; 1644 1645 GET_PCBINFO_TOKEN(pcbinfo); 1646 1647 /* 1648 * Check local group first. When present, the localgroup 1649 * hash utilizes the same non-jailed-vs/jailed priortization 1650 * that the normal wildcardhash does. 1651 */ 1652 if (pcbinfo->localgrphashbase != NULL && 1653 m != NULL && (m->m_flags & M_HASH)) { 1654 inp = inp_localgroup_lookup(pcbinfo, laddr, lport, 1655 m->m_pkthdr.hash); 1656 if (inp != NULL) { 1657 REL_PCBINFO_TOKEN(pcbinfo); 1658 return inp; 1659 } 1660 } 1661 1662 /* 1663 * Order of socket selection: 1664 * 1665 * 1. non-jailed, non-wild. 1666 * 2. non-jailed, wild. (allow_listen_override on) 1667 * 3. jailed, non-wild. 1668 * 4. jailed, wild. 1669 * 5. non-jailed, wild. (allow_listen_override off) 1670 * 1671 * NOTE: jailed wildcards are still restricted to the jail 1672 * IPs. 1673 * 1674 * NOTE: (1) and (3) already handled above. 1675 */ 1676 jsin.sin_family = AF_INET; 1677 chead = &pcbinfo->wildcardhashbase[ 1678 INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)]; 1679 1680 LIST_FOREACH(ic, chead, ic_list) { 1681 inp = ic->ic_inp; 1682 if (inp->inp_flags & INP_PLACEMARKER) 1683 continue; 1684 1685 /* 1686 * Basic validation 1687 */ 1688 #ifdef INET6 1689 if (!INP_ISIPV4(inp)) 1690 continue; 1691 #endif 1692 if (inp->inp_lport != lport) 1693 continue; 1694 1695 /* 1696 * Calculate prison, setup jsin for jailed_ip() 1697 * check. 1698 */ 1699 jsin.sin_addr.s_addr = laddr.s_addr; 1700 pr = NULL; 1701 cred = NULL; 1702 if (inp->inp_socket) { 1703 cred = inp->inp_socket->so_cred; 1704 if (cred) 1705 pr = cred->cr_prison; 1706 } 1707 1708 /* 1709 * Assign jinp, jinp_wild, and local_wild as 1710 * appropriate, track whether the jail supports 1711 * listen overrides. 1712 */ 1713 if (pr) { 1714 if (!jailed_ip(pr, (struct sockaddr *)&jsin)) 1715 continue; 1716 if (inp->inp_laddr.s_addr == laddr.s_addr && 1717 jinp == NULL) { 1718 jinp = inp; 1719 if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_NET_LISTEN_OVERRIDE)) 1720 net_listen_ov_local = 1; 1721 } 1722 if (inp->inp_laddr.s_addr == INADDR_ANY && 1723 jinp_wild == NULL) { 1724 jinp_wild = inp; 1725 if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_NET_LISTEN_OVERRIDE)) 1726 net_listen_ov_wild = 1; 1727 } 1728 } else { 1729 if (inp->inp_laddr.s_addr == laddr.s_addr) { 1730 REL_PCBINFO_TOKEN(pcbinfo); 1731 return (inp); 1732 } 1733 if (inp->inp_laddr.s_addr == INADDR_ANY) 1734 local_wild = inp; 1735 } 1736 } 1737 1738 REL_PCBINFO_TOKEN(pcbinfo); 1739 1740 if (net_listen_ov_local) 1741 return jinp; 1742 if (net_listen_ov_wild) 1743 return jinp_wild; 1744 if (local_wild) 1745 return (local_wild); 1746 if (jinp) 1747 return (jinp); 1748 return (jinp_wild); 1749 } 1750 1751 /* 1752 * Not found. 1753 */ 1754 return (NULL); 1755 } 1756 1757 struct inpcb * 1758 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 1759 u_int fport_arg, struct in_addr laddr, u_int lport_arg, 1760 boolean_t wildcard, struct ifnet *ifp) 1761 { 1762 return in_pcblookup_pkthash(pcbinfo, faddr, fport_arg, 1763 laddr, lport_arg, wildcard, ifp, NULL); 1764 } 1765 1766 /* 1767 * Insert PCB into connection hash table. 1768 */ 1769 void 1770 in_pcbinsconnhash(struct inpcb *inp) 1771 { 1772 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1773 struct inpcbhead *bucket; 1774 u_int32_t hashkey_faddr, hashkey_laddr; 1775 1776 #ifdef INET6 1777 if (INP_ISIPV6(inp)) { 1778 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */; 1779 hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */; 1780 } else { 1781 #endif 1782 hashkey_faddr = inp->inp_faddr.s_addr; 1783 hashkey_laddr = inp->inp_laddr.s_addr; 1784 #ifdef INET6 1785 } 1786 #endif 1787 1788 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1789 ("not in the correct netisr")); 1790 ASSERT_INP_NOTINHASH(inp); 1791 inp->inp_flags |= INP_CONNECTED; 1792 1793 /* 1794 * Insert into the connection hash table. 1795 */ 1796 bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr, 1797 inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)]; 1798 LIST_INSERT_HEAD(bucket, inp, inp_hash); 1799 } 1800 1801 /* 1802 * Remove PCB from connection hash table. 1803 */ 1804 void 1805 in_pcbremconnhash(struct inpcb *inp) 1806 { 1807 struct inpcbinfo *pcbinfo __debugvar = inp->inp_pcbinfo; 1808 1809 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1810 ("not in the correct netisr")); 1811 KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected")); 1812 1813 LIST_REMOVE(inp, inp_hash); 1814 inp->inp_flags &= ~INP_CONNECTED; 1815 } 1816 1817 /* 1818 * Insert PCB into port hash table. 1819 */ 1820 void 1821 in_pcbinsporthash(struct inpcbporthead *pcbporthash, struct inpcb *inp) 1822 { 1823 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1824 struct inpcbport *phd; 1825 1826 /* 1827 * If the porthashbase is shared across several cpus, it must 1828 * have been locked. 1829 */ 1830 ASSERT_PORTHASH_TOKEN_HELD(pcbporthash); 1831 1832 /* 1833 * Insert into the port hash table. 1834 */ 1835 1836 /* Go through port list and look for a head for this lport. */ 1837 LIST_FOREACH(phd, pcbporthash, phd_hash) { 1838 if (phd->phd_port == inp->inp_lport) 1839 break; 1840 } 1841 1842 /* If none exists, use saved one and tack it on. */ 1843 if (phd == NULL) { 1844 KKASSERT(pcbinfo->portsave != NULL); 1845 phd = pcbinfo->portsave; 1846 pcbinfo->portsave = NULL; 1847 phd->phd_port = inp->inp_lport; 1848 LIST_INIT(&phd->phd_pcblist); 1849 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 1850 } 1851 1852 inp->inp_porthash = pcbporthash; 1853 inp->inp_phd = phd; 1854 LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 1855 1856 /* 1857 * Malloc one inpcbport for later use. It is safe to use 1858 * "wait" malloc here (port token would be released, if 1859 * malloc ever blocked), since all changes to the porthash 1860 * are done. 1861 */ 1862 if (pcbinfo->portsave == NULL) { 1863 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), 1864 M_PCB, M_INTWAIT | M_ZERO); 1865 } 1866 } 1867 1868 void 1869 in_pcbinsporthash_lport(struct inpcb *inp) 1870 { 1871 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1872 struct inpcbportinfo *portinfo; 1873 struct inpcbporthead *porthash; 1874 u_short lport_ho; 1875 1876 /* Locate the proper portinfo based on lport */ 1877 lport_ho = ntohs(inp->inp_lport); 1878 portinfo = &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt]; 1879 KKASSERT((lport_ho % pcbinfo->portinfo_cnt) == portinfo->offset); 1880 1881 porthash = in_pcbporthash_head(portinfo, inp->inp_lport); 1882 GET_PORTHASH_TOKEN(porthash); 1883 in_pcbinsporthash(porthash, inp); 1884 REL_PORTHASH_TOKEN(porthash); 1885 } 1886 1887 void 1888 in_pcbremporthash(struct inpcb *inp) 1889 { 1890 struct inpcbporthead *porthash; 1891 struct inpcbport *phd; 1892 1893 if (inp->inp_phd == NULL) 1894 return; 1895 KASSERT(inp->inp_lport != 0, ("inpcb has no lport")); 1896 1897 porthash = inp->inp_porthash; 1898 KASSERT(porthash != NULL, ("no porthash")); 1899 1900 GET_PORTHASH_TOKEN(porthash); 1901 1902 phd = inp->inp_phd; 1903 LIST_REMOVE(inp, inp_portlist); 1904 if (LIST_FIRST(&phd->phd_pcblist) == NULL) { 1905 LIST_REMOVE(phd, phd_hash); 1906 kfree(phd, M_PCB); 1907 } 1908 1909 REL_PORTHASH_TOKEN(porthash); 1910 1911 inp->inp_phd = NULL; 1912 /* NOTE: Don't whack inp_lport, which may be used later */ 1913 } 1914 1915 static struct inp_localgroup * 1916 inp_localgroup_alloc(u_char af, uint16_t port, 1917 const union in_dependaddr *addr, int size) 1918 { 1919 struct inp_localgroup *grp; 1920 1921 grp = kmalloc(__offsetof(struct inp_localgroup, il_inp[size]), 1922 M_TEMP, M_INTWAIT | M_ZERO); 1923 grp->il_af = af; 1924 grp->il_lport = port; 1925 grp->il_dependladdr = *addr; 1926 grp->il_inpsiz = size; 1927 1928 return grp; 1929 } 1930 1931 static void 1932 inp_localgroup_free(struct inp_localgroup *grp) 1933 { 1934 kfree(grp, M_TEMP); 1935 } 1936 1937 static void 1938 inp_localgroup_destroy(struct inp_localgroup *grp) 1939 { 1940 LIST_REMOVE(grp, il_list); 1941 inp_localgroup_free(grp); 1942 } 1943 1944 static void 1945 inp_localgroup_copy(struct inp_localgroup *grp, 1946 const struct inp_localgroup *old_grp) 1947 { 1948 int i; 1949 1950 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 1951 ("invalid new local group size %d and old local group count %d", 1952 grp->il_inpsiz, old_grp->il_inpcnt)); 1953 for (i = 0; i < old_grp->il_inpcnt; ++i) 1954 grp->il_inp[i] = old_grp->il_inp[i]; 1955 grp->il_inpcnt = old_grp->il_inpcnt; 1956 } 1957 1958 static void 1959 in_pcbinslocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 1960 { 1961 struct inp_localgrphead *hdr; 1962 struct inp_localgroup *grp, *grp_alloc = NULL; 1963 u_char isjailed; 1964 int i, idx; 1965 1966 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 1967 1968 if (pcbinfo->localgrphashbase == NULL) 1969 return; 1970 1971 /* 1972 * Further separate groups by whether the inp is jailed or not. 1973 * This allows the inp_localgroup_lookup() code to manage port 1974 * overloading between jails and non-jails. 1975 * 1976 * XXX all jails are collected into one group, which works fine 1977 * as we expect the jails to be listening on different addresses. 1978 * If this changes in the future we may have to break the groups 1979 * up by prison pointer as well. 1980 */ 1981 if (inp->inp_socket && inp->inp_socket->so_cred) 1982 isjailed = jailed(inp->inp_socket->so_cred); 1983 else 1984 isjailed = 0; 1985 1986 hdr = &pcbinfo->localgrphashbase[ 1987 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 1988 1989 again: 1990 LIST_FOREACH(grp, hdr, il_list) { 1991 if (grp->il_af == inp->inp_af && 1992 grp->il_lport == inp->inp_lport && 1993 grp->il_jailed == isjailed && 1994 memcmp(&grp->il_dependladdr, 1995 &inp->inp_inc.inc_ie.ie_dependladdr, 1996 sizeof(grp->il_dependladdr)) == 0) { 1997 break; 1998 } 1999 } 2000 if (grp == NULL) { 2001 /* 2002 * Create a new local group 2003 */ 2004 if (grp_alloc == NULL) { 2005 grp_alloc = inp_localgroup_alloc(inp->inp_af, 2006 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 2007 INP_LOCALGROUP_SIZMIN); 2008 /* 2009 * Local group allocation could block and the 2010 * local group w/ the same property might have 2011 * been added by others when we were blocked; 2012 * check again. 2013 */ 2014 goto again; 2015 } else { 2016 /* Local group has been allocated; link it */ 2017 grp = grp_alloc; 2018 grp->il_jailed = isjailed; 2019 grp_alloc = NULL; 2020 LIST_INSERT_HEAD(hdr, grp, il_list); 2021 } 2022 } else if (grp->il_inpcnt == grp->il_inpsiz) { 2023 #if 0 2024 /* 2025 * REMOVED - Ensure that all entries are placed in the 2026 * localgroup so jail operations can be 2027 * deterministic on a il_lport basis. 2028 */ 2029 if (grp->il_inpsiz >= INP_LOCALGROUP_SIZMAX) { 2030 static int limit_logged = 0; 2031 2032 if (!limit_logged) { 2033 limit_logged = 1; 2034 kprintf("local group port %d, " 2035 "limit reached\n", ntohs(grp->il_lport)); 2036 } 2037 if (grp_alloc != NULL) { 2038 /* 2039 * This would happen if the local group 2040 * w/ the same property was expanded when 2041 * our local group allocation blocked. 2042 */ 2043 inp_localgroup_free(grp_alloc); 2044 } 2045 return; 2046 } 2047 #endif 2048 2049 /* 2050 * Expand this local group 2051 */ 2052 if (grp_alloc == NULL || 2053 grp->il_inpcnt >= grp_alloc->il_inpsiz) { 2054 if (grp_alloc != NULL) 2055 inp_localgroup_free(grp_alloc); 2056 grp_alloc = inp_localgroup_alloc(grp->il_af, 2057 grp->il_lport, &grp->il_dependladdr, 2058 grp->il_inpsiz * 2); 2059 /* 2060 * Local group allocation could block and the 2061 * local group w/ the same property might have 2062 * been expanded by others when we were blocked; 2063 * check again. 2064 */ 2065 goto again; 2066 } 2067 2068 /* 2069 * Save the old local group, link the new one, and then 2070 * destroy the old local group 2071 */ 2072 inp_localgroup_copy(grp_alloc, grp); 2073 LIST_INSERT_HEAD(hdr, grp_alloc, il_list); 2074 inp_localgroup_destroy(grp); 2075 2076 grp = grp_alloc; 2077 grp->il_jailed = isjailed; 2078 grp_alloc = NULL; 2079 } else { 2080 /* 2081 * Found the local group 2082 */ 2083 if (grp_alloc != NULL) { 2084 /* 2085 * This would happen if the local group w/ the 2086 * same property was added or expanded when our 2087 * local group allocation blocked. 2088 */ 2089 inp_localgroup_free(grp_alloc); 2090 grp_alloc = NULL; 2091 } 2092 } 2093 2094 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 2095 ("invalid local group size %d and count %d", 2096 grp->il_inpsiz, grp->il_inpcnt)); 2097 2098 /* 2099 * Keep the local group sorted by the inpcb local group index 2100 * in ascending order. 2101 * 2102 * This eases the multi-process userland application which uses 2103 * SO_REUSEPORT sockets and binds process to the owner cpu of 2104 * the SO_REUSEPORT socket: 2105 * If we didn't sort the local group by the inpcb local group 2106 * index and one of the process owning an inpcb in this local 2107 * group restarted, e.g. crashed and restarted by watchdog, 2108 * other processes owning a inpcb in this local group would have 2109 * to detect that event, refetch its socket's owner cpu, and 2110 * re-bind. 2111 */ 2112 idx = grp->il_inpcnt; 2113 for (i = 0; i < idx; ++i) { 2114 struct inpcb *oinp = grp->il_inp[i]; 2115 2116 if (oinp->inp_lgrpindex > i) { 2117 if (inp->inp_lgrpindex < 0) { 2118 inp->inp_lgrpindex = i; 2119 } else if (inp->inp_lgrpindex != i) { 2120 if (bootverbose) { 2121 kprintf("inp %p: grpidx %d, " 2122 "assigned to %d, cpu%d\n", 2123 inp, inp->inp_lgrpindex, i, 2124 mycpuid); 2125 } 2126 } 2127 grp->il_inp[i] = inp; 2128 2129 /* Pull down inpcbs */ 2130 for (; i < grp->il_inpcnt; ++i) { 2131 struct inpcb *oinp1 = grp->il_inp[i + 1]; 2132 2133 grp->il_inp[i + 1] = oinp; 2134 oinp = oinp1; 2135 } 2136 grp->il_inpcnt++; 2137 return; 2138 } 2139 } 2140 2141 if (inp->inp_lgrpindex < 0) { 2142 inp->inp_lgrpindex = idx; 2143 } else if (inp->inp_lgrpindex != idx) { 2144 if (bootverbose) { 2145 kprintf("inp %p: grpidx %d, assigned to %d, cpu%d\n", 2146 inp, inp->inp_lgrpindex, idx, mycpuid); 2147 } 2148 } 2149 grp->il_inp[idx] = inp; 2150 grp->il_inpcnt++; 2151 } 2152 2153 void 2154 in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 2155 { 2156 struct inpcontainer *ic; 2157 struct inpcontainerhead *bucket; 2158 2159 GET_PCBINFO_TOKEN(pcbinfo); 2160 2161 in_pcbinslocalgrphash_oncpu(inp, pcbinfo); 2162 2163 bucket = &pcbinfo->wildcardhashbase[ 2164 INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)]; 2165 2166 ic = kmalloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT); 2167 ic->ic_inp = inp; 2168 LIST_INSERT_HEAD(bucket, ic, ic_list); 2169 2170 REL_PCBINFO_TOKEN(pcbinfo); 2171 } 2172 2173 /* 2174 * Insert PCB into wildcard hash table. 2175 */ 2176 void 2177 in_pcbinswildcardhash(struct inpcb *inp) 2178 { 2179 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2180 2181 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 2182 ("not in correct netisr")); 2183 ASSERT_INP_NOTINHASH(inp); 2184 inp->inp_flags |= INP_WILDCARD; 2185 2186 in_pcbinswildcardhash_oncpu(inp, pcbinfo); 2187 } 2188 2189 static void 2190 in_pcbremlocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 2191 { 2192 struct inp_localgrphead *hdr; 2193 struct inp_localgroup *grp; 2194 2195 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 2196 2197 if (pcbinfo->localgrphashbase == NULL) 2198 return; 2199 2200 hdr = &pcbinfo->localgrphashbase[ 2201 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 2202 2203 LIST_FOREACH(grp, hdr, il_list) { 2204 int i; 2205 2206 for (i = 0; i < grp->il_inpcnt; ++i) { 2207 if (grp->il_inp[i] != inp) 2208 continue; 2209 2210 if (grp->il_inpcnt == 1) { 2211 /* Destroy this local group */ 2212 inp_localgroup_destroy(grp); 2213 } else { 2214 /* Pull up inpcbs */ 2215 for (; i + 1 < grp->il_inpcnt; ++i) 2216 grp->il_inp[i] = grp->il_inp[i + 1]; 2217 grp->il_inpcnt--; 2218 } 2219 return; 2220 } 2221 } 2222 } 2223 2224 void 2225 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 2226 { 2227 struct inpcontainer *ic; 2228 struct inpcontainerhead *head; 2229 2230 GET_PCBINFO_TOKEN(pcbinfo); 2231 2232 in_pcbremlocalgrphash_oncpu(inp, pcbinfo); 2233 2234 /* find bucket */ 2235 head = &pcbinfo->wildcardhashbase[ 2236 INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)]; 2237 2238 LIST_FOREACH(ic, head, ic_list) { 2239 if (ic->ic_inp == inp) 2240 goto found; 2241 } 2242 REL_PCBINFO_TOKEN(pcbinfo); 2243 return; /* not found! */ 2244 2245 found: 2246 LIST_REMOVE(ic, ic_list); /* remove container from bucket chain */ 2247 REL_PCBINFO_TOKEN(pcbinfo); 2248 kfree(ic, M_TEMP); /* deallocate container */ 2249 } 2250 2251 /* 2252 * Remove PCB from wildcard hash table. 2253 */ 2254 void 2255 in_pcbremwildcardhash(struct inpcb *inp) 2256 { 2257 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2258 2259 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 2260 ("not in correct netisr")); 2261 KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard")); 2262 2263 in_pcbremwildcardhash_oncpu(inp, pcbinfo); 2264 inp->inp_lgrpindex = -1; 2265 inp->inp_flags &= ~INP_WILDCARD; 2266 } 2267 2268 /* 2269 * Remove PCB from various lists. 2270 */ 2271 void 2272 in_pcbremlists(struct inpcb *inp) 2273 { 2274 in_pcbremporthash(inp); 2275 if (inp->inp_flags & INP_WILDCARD) { 2276 in_pcbremwildcardhash(inp); 2277 } else if (inp->inp_flags & INP_CONNECTED) { 2278 in_pcbremconnhash(inp); 2279 } 2280 2281 if (inp->inp_flags & INP_ONLIST) 2282 in_pcbofflist(inp); 2283 } 2284 2285 int 2286 prison_xinpcb(struct thread *td, struct inpcb *inp) 2287 { 2288 struct ucred *cr; 2289 2290 if (td->td_proc == NULL) 2291 return (0); 2292 cr = td->td_proc->p_ucred; 2293 if (cr->cr_prison == NULL) 2294 return (0); 2295 if (inp->inp_socket && inp->inp_socket->so_cred && 2296 inp->inp_socket->so_cred->cr_prison && 2297 cr->cr_prison == inp->inp_socket->so_cred->cr_prison) 2298 return (0); 2299 return (1); 2300 } 2301 2302 int 2303 in_pcblist_range(SYSCTL_HANDLER_ARGS) 2304 { 2305 struct inpcbinfo *pcbinfo_arr = arg1; 2306 int pcbinfo_arrlen = arg2; 2307 struct inpcb *marker; 2308 int cpu, origcpu; 2309 int error, n; 2310 2311 KASSERT(pcbinfo_arrlen <= netisr_ncpus && pcbinfo_arrlen >= 1, 2312 ("invalid pcbinfo count %d", pcbinfo_arrlen)); 2313 2314 /* 2315 * The process of preparing the TCB list is too time-consuming and 2316 * resource-intensive to repeat twice on every request. 2317 */ 2318 n = 0; 2319 if (req->oldptr == NULL) { 2320 for (cpu = 0; cpu < pcbinfo_arrlen; ++cpu) 2321 n += pcbinfo_arr[cpu].ipi_count; 2322 req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb); 2323 return 0; 2324 } 2325 2326 if (req->newptr != NULL) 2327 return EPERM; 2328 2329 marker = kmalloc(sizeof(struct inpcb), M_TEMP, M_WAITOK|M_ZERO); 2330 marker->inp_flags |= INP_PLACEMARKER; 2331 2332 /* 2333 * OK, now we're committed to doing something. Re-fetch ipi_count 2334 * after obtaining the generation count. 2335 */ 2336 error = 0; 2337 origcpu = mycpuid; 2338 for (cpu = 0; cpu < pcbinfo_arrlen && error == 0; ++cpu) { 2339 struct inpcbinfo *pcbinfo = &pcbinfo_arr[cpu]; 2340 struct inpcb *inp; 2341 struct xinpcb xi; 2342 int i; 2343 2344 lwkt_migratecpu(cpu); 2345 2346 GET_PCBINFO_TOKEN(pcbinfo); 2347 2348 n = pcbinfo->ipi_count; 2349 2350 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 2351 i = 0; 2352 while ((inp = LIST_NEXT(marker, inp_list)) != NULL && i < n) { 2353 LIST_REMOVE(marker, inp_list); 2354 LIST_INSERT_AFTER(inp, marker, inp_list); 2355 2356 if (inp->inp_flags & INP_PLACEMARKER) 2357 continue; 2358 if (prison_xinpcb(req->td, inp)) 2359 continue; 2360 2361 bzero(&xi, sizeof xi); 2362 xi.xi_len = sizeof xi; 2363 bcopy(inp, &xi.xi_inp, sizeof *inp); 2364 if (inp->inp_socket) 2365 sotoxsocket(inp->inp_socket, &xi.xi_socket); 2366 if ((error = SYSCTL_OUT(req, &xi, sizeof xi)) != 0) 2367 break; 2368 ++i; 2369 } 2370 LIST_REMOVE(marker, inp_list); 2371 2372 REL_PCBINFO_TOKEN(pcbinfo); 2373 2374 if (error == 0 && i < n) { 2375 bzero(&xi, sizeof xi); 2376 xi.xi_len = sizeof xi; 2377 while (i < n) { 2378 error = SYSCTL_OUT(req, &xi, sizeof xi); 2379 if (error) 2380 break; 2381 ++i; 2382 } 2383 } 2384 } 2385 2386 lwkt_migratecpu(origcpu); 2387 kfree(marker, M_TEMP); 2388 return error; 2389 } 2390 2391 int 2392 in_pcblist_ncpus(SYSCTL_HANDLER_ARGS) 2393 { 2394 2395 return (in_pcblist_range(oidp, arg1, netisr_ncpus, req)); 2396 } 2397 2398 void 2399 in_savefaddr(struct socket *so, const struct sockaddr *faddr) 2400 { 2401 struct sockaddr_in *sin; 2402 2403 KASSERT(faddr->sa_family == AF_INET, 2404 ("not AF_INET faddr %d", faddr->sa_family)); 2405 2406 sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO); 2407 sin->sin_family = AF_INET; 2408 sin->sin_len = sizeof(*sin); 2409 sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port; 2410 sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr; 2411 2412 so->so_faddr = (struct sockaddr *)sin; 2413 } 2414 2415 void 2416 in_pcbportinfo_init(struct inpcbportinfo *portinfo, int hashsize, 2417 u_short offset) 2418 { 2419 memset(portinfo, 0, sizeof(*portinfo)); 2420 2421 portinfo->offset = offset; 2422 portinfo->porthashbase = phashinit(hashsize, M_PCB, 2423 &portinfo->porthashcnt); 2424 } 2425 2426 void 2427 in_pcbportrange(u_short *hi0, u_short *lo0, u_short ofs, u_short step) 2428 { 2429 int hi, lo; 2430 2431 if (step == 1) 2432 return; 2433 2434 hi = *hi0; 2435 lo = *lo0; 2436 2437 hi = rounddown(hi, step); 2438 hi += ofs; 2439 if (hi > (int)*hi0) 2440 hi -= step; 2441 2442 lo = roundup(lo, step); 2443 lo -= (step - ofs); 2444 if (lo < (int)*lo0) 2445 lo += step; 2446 2447 *hi0 = hi; 2448 *lo0 = lo; 2449 } 2450 2451 void 2452 in_pcbglobalinit(void) 2453 { 2454 int cpu; 2455 2456 in_pcbmarkers = kmalloc(netisr_ncpus * sizeof(struct inpcb), M_PCB, 2457 M_WAITOK | M_ZERO); 2458 in_pcbcontainer_markers = 2459 kmalloc(netisr_ncpus * sizeof(struct inpcontainer), M_PCB, 2460 M_WAITOK | M_ZERO); 2461 2462 for (cpu = 0; cpu < netisr_ncpus; ++cpu) { 2463 struct inpcontainer *ic = &in_pcbcontainer_markers[cpu]; 2464 struct inpcb *marker = &in_pcbmarkers[cpu]; 2465 2466 marker->inp_flags |= INP_PLACEMARKER; 2467 ic->ic_inp = marker; 2468 } 2469 } 2470 2471 struct inpcb * 2472 in_pcbmarker(void) 2473 { 2474 2475 ASSERT_NETISR_NCPUS(mycpuid); 2476 return &in_pcbmarkers[mycpuid]; 2477 } 2478 2479 struct inpcontainer * 2480 in_pcbcontainer_marker(void) 2481 { 2482 2483 ASSERT_NETISR_NCPUS(mycpuid); 2484 return &in_pcbcontainer_markers[mycpuid]; 2485 } 2486 2487 void 2488 in_pcbresetroute(struct inpcb *inp) 2489 { 2490 struct route *ro = &inp->inp_route; 2491 2492 if (ro->ro_rt != NULL) 2493 RTFREE(ro->ro_rt); 2494 bzero(ro, sizeof(*ro)); 2495 } 2496