1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #if !defined(KLD_MODULE) 34 #include "opt_inet.h" 35 #include "opt_ipfw.h" 36 #include "opt_mac.h" 37 #include "opt_sctp.h" 38 #ifndef INET 39 #error "IPDIVERT requires INET." 40 #endif 41 #ifndef IPFIREWALL 42 #error "IPDIVERT requires IPFIREWALL" 43 #endif 44 #endif 45 46 #include <sys/param.h> 47 #include <sys/kernel.h> 48 #include <sys/lock.h> 49 #include <sys/malloc.h> 50 #include <sys/mbuf.h> 51 #include <sys/module.h> 52 #include <sys/kernel.h> 53 #include <sys/priv.h> 54 #include <sys/proc.h> 55 #include <sys/protosw.h> 56 #include <sys/rwlock.h> 57 #include <sys/signalvar.h> 58 #include <sys/socket.h> 59 #include <sys/socketvar.h> 60 #include <sys/sx.h> 61 #include <sys/sysctl.h> 62 #include <sys/systm.h> 63 #include <sys/vimage.h> 64 65 #include <vm/uma.h> 66 67 #include <net/if.h> 68 #include <net/netisr.h> 69 #include <net/route.h> 70 71 #include <netinet/in.h> 72 #include <netinet/in_pcb.h> 73 #include <netinet/in_systm.h> 74 #include <netinet/in_var.h> 75 #include <netinet/ip.h> 76 #include <netinet/ip_divert.h> 77 #include <netinet/ip_var.h> 78 #include <netinet/ip_fw.h> 79 #include <netinet/vinet.h> 80 #ifdef SCTP 81 #include <netinet/sctp_crc32.h> 82 #endif 83 84 #include <security/mac/mac_framework.h> 85 86 /* 87 * Divert sockets 88 */ 89 90 /* 91 * Allocate enough space to hold a full IP packet 92 */ 93 #define DIVSNDQ (65536 + 100) 94 #define DIVRCVQ (65536 + 100) 95 96 /* 97 * Divert sockets work in conjunction with ipfw, see the divert(4) 98 * manpage for features. 99 * Internally, packets selected by ipfw in ip_input() or ip_output(), 100 * and never diverted before, are passed to the input queue of the 101 * divert socket with a given 'divert_port' number (as specified in 102 * the matching ipfw rule), and they are tagged with a 16 bit cookie 103 * (representing the rule number of the matching ipfw rule), which 104 * is passed to process reading from the socket. 105 * 106 * Packets written to the divert socket are again tagged with a cookie 107 * (usually the same as above) and a destination address. 108 * If the destination address is INADDR_ANY then the packet is 109 * treated as outgoing and sent to ip_output(), otherwise it is 110 * treated as incoming and sent to ip_input(). 111 * In both cases, the packet is tagged with the cookie. 112 * 113 * On reinjection, processing in ip_input() and ip_output() 114 * will be exactly the same as for the original packet, except that 115 * ipfw processing will start at the rule number after the one 116 * written in the cookie (so, tagging a packet with a cookie of 0 117 * will cause it to be effectively considered as a standard packet). 118 */ 119 120 /* Internal variables. */ 121 #ifdef VIMAGE_GLOBALS 122 static struct inpcbhead divcb; 123 static struct inpcbinfo divcbinfo; 124 #endif 125 126 static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ 127 static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ 128 129 /* 130 * Initialize divert connection block queue. 131 */ 132 static void 133 div_zone_change(void *tag) 134 { 135 INIT_VNET_INET(curvnet); 136 137 uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); 138 } 139 140 static int 141 div_inpcb_init(void *mem, int size, int flags) 142 { 143 struct inpcb *inp = mem; 144 145 INP_LOCK_INIT(inp, "inp", "divinp"); 146 return (0); 147 } 148 149 static void 150 div_inpcb_fini(void *mem, int size) 151 { 152 struct inpcb *inp = mem; 153 154 INP_LOCK_DESTROY(inp); 155 } 156 157 void 158 div_init(void) 159 { 160 INIT_VNET_INET(curvnet); 161 162 INP_INFO_LOCK_INIT(&V_divcbinfo, "div"); 163 LIST_INIT(&V_divcb); 164 V_divcbinfo.ipi_listhead = &V_divcb; 165 #ifdef VIMAGE 166 V_divcbinfo.ipi_vnet = curvnet; 167 #endif 168 /* 169 * XXX We don't use the hash list for divert IP, but it's easier 170 * to allocate a one entry hash list than it is to check all 171 * over the place for hashbase == NULL. 172 */ 173 V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask); 174 V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB, 175 &V_divcbinfo.ipi_porthashmask); 176 V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), 177 NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, 178 UMA_ZONE_NOFREE); 179 uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); 180 EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, 181 NULL, EVENTHANDLER_PRI_ANY); 182 } 183 184 /* 185 * IPPROTO_DIVERT is not in the real IP protocol number space; this 186 * function should never be called. Just in case, drop any packets. 187 */ 188 void 189 div_input(struct mbuf *m, int off) 190 { 191 INIT_VNET_INET(curvnet); 192 193 IPSTAT_INC(ips_noproto); 194 m_freem(m); 195 } 196 197 /* 198 * Divert a packet by passing it up to the divert socket at port 'port'. 199 * 200 * Setup generic address and protocol structures for div_input routine, 201 * then pass them along with mbuf chain. 202 */ 203 static void 204 divert_packet(struct mbuf *m, int incoming) 205 { 206 INIT_VNET_INET(curvnet); 207 struct ip *ip; 208 struct inpcb *inp; 209 struct socket *sa; 210 u_int16_t nport; 211 struct sockaddr_in divsrc; 212 struct m_tag *mtag; 213 214 mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); 215 if (mtag == NULL) { 216 printf("%s: no divert tag\n", __func__); 217 m_freem(m); 218 return; 219 } 220 /* Assure header */ 221 if (m->m_len < sizeof(struct ip) && 222 (m = m_pullup(m, sizeof(struct ip))) == 0) 223 return; 224 ip = mtod(m, struct ip *); 225 226 /* Delayed checksums are currently not compatible with divert. */ 227 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 228 ip->ip_len = ntohs(ip->ip_len); 229 in_delayed_cksum(m); 230 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 231 ip->ip_len = htons(ip->ip_len); 232 } 233 #ifdef SCTP 234 if (m->m_pkthdr.csum_flags & CSUM_SCTP) { 235 ip->ip_len = ntohs(ip->ip_len); 236 sctp_delayed_cksum(m); 237 m->m_pkthdr.csum_flags &= ~CSUM_SCTP; 238 ip->ip_len = htons(ip->ip_len); 239 } 240 #endif 241 /* 242 * Record receive interface address, if any. 243 * But only for incoming packets. 244 */ 245 bzero(&divsrc, sizeof(divsrc)); 246 divsrc.sin_len = sizeof(divsrc); 247 divsrc.sin_family = AF_INET; 248 divsrc.sin_port = divert_cookie(mtag); /* record matching rule */ 249 if (incoming) { 250 struct ifaddr *ifa; 251 struct ifnet *ifp; 252 253 /* Sanity check */ 254 M_ASSERTPKTHDR(m); 255 256 /* Find IP address for receive interface */ 257 ifp = m->m_pkthdr.rcvif; 258 IF_ADDR_LOCK(ifp); 259 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 260 if (ifa->ifa_addr->sa_family != AF_INET) 261 continue; 262 divsrc.sin_addr = 263 ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; 264 break; 265 } 266 IF_ADDR_UNLOCK(ifp); 267 } 268 /* 269 * Record the incoming interface name whenever we have one. 270 */ 271 if (m->m_pkthdr.rcvif) { 272 /* 273 * Hide the actual interface name in there in the 274 * sin_zero array. XXX This needs to be moved to a 275 * different sockaddr type for divert, e.g. 276 * sockaddr_div with multiple fields like 277 * sockaddr_dl. Presently we have only 7 bytes 278 * but that will do for now as most interfaces 279 * are 4 or less + 2 or less bytes for unit. 280 * There is probably a faster way of doing this, 281 * possibly taking it from the sockaddr_dl on the iface. 282 * This solves the problem of a P2P link and a LAN interface 283 * having the same address, which can result in the wrong 284 * interface being assigned to the packet when fed back 285 * into the divert socket. Theoretically if the daemon saves 286 * and re-uses the sockaddr_in as suggested in the man pages, 287 * this iface name will come along for the ride. 288 * (see div_output for the other half of this.) 289 */ 290 strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, 291 sizeof(divsrc.sin_zero)); 292 } 293 294 /* Put packet on socket queue, if any */ 295 sa = NULL; 296 nport = htons((u_int16_t)divert_info(mtag)); 297 INP_INFO_RLOCK(&V_divcbinfo); 298 LIST_FOREACH(inp, &V_divcb, inp_list) { 299 /* XXX why does only one socket match? */ 300 if (inp->inp_lport == nport) { 301 INP_RLOCK(inp); 302 sa = inp->inp_socket; 303 SOCKBUF_LOCK(&sa->so_rcv); 304 if (sbappendaddr_locked(&sa->so_rcv, 305 (struct sockaddr *)&divsrc, m, 306 (struct mbuf *)0) == 0) { 307 SOCKBUF_UNLOCK(&sa->so_rcv); 308 sa = NULL; /* force mbuf reclaim below */ 309 } else 310 sorwakeup_locked(sa); 311 INP_RUNLOCK(inp); 312 break; 313 } 314 } 315 INP_INFO_RUNLOCK(&V_divcbinfo); 316 if (sa == NULL) { 317 m_freem(m); 318 IPSTAT_INC(ips_noproto); 319 IPSTAT_DEC(ips_delivered); 320 } 321 } 322 323 /* 324 * Deliver packet back into the IP processing machinery. 325 * 326 * If no address specified, or address is 0.0.0.0, send to ip_output(); 327 * otherwise, send to ip_input() and mark as having been received on 328 * the interface with that address. 329 */ 330 static int 331 div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, 332 struct mbuf *control) 333 { 334 INIT_VNET_INET(curvnet); 335 struct m_tag *mtag; 336 struct divert_tag *dt; 337 int error = 0; 338 struct mbuf *options; 339 340 /* 341 * An mbuf may hasn't come from userland, but we pretend 342 * that it has. 343 */ 344 m->m_pkthdr.rcvif = NULL; 345 m->m_nextpkt = NULL; 346 M_SETFIB(m, so->so_fibnum); 347 348 if (control) 349 m_freem(control); /* XXX */ 350 351 if ((mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL)) == NULL) { 352 mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), 353 M_NOWAIT | M_ZERO); 354 if (mtag == NULL) { 355 error = ENOBUFS; 356 goto cantsend; 357 } 358 dt = (struct divert_tag *)(mtag+1); 359 m_tag_prepend(m, mtag); 360 } else 361 dt = (struct divert_tag *)(mtag+1); 362 363 /* Loopback avoidance and state recovery */ 364 if (sin) { 365 int i; 366 367 dt->cookie = sin->sin_port; 368 /* 369 * Find receive interface with the given name, stuffed 370 * (if it exists) in the sin_zero[] field. 371 * The name is user supplied data so don't trust its size 372 * or that it is zero terminated. 373 */ 374 for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) 375 ; 376 if ( i > 0 && i < sizeof(sin->sin_zero)) 377 m->m_pkthdr.rcvif = ifunit(sin->sin_zero); 378 } 379 380 /* Reinject packet into the system as incoming or outgoing */ 381 if (!sin || sin->sin_addr.s_addr == 0) { 382 struct ip *const ip = mtod(m, struct ip *); 383 struct inpcb *inp; 384 385 dt->info |= IP_FW_DIVERT_OUTPUT_FLAG; 386 INP_INFO_WLOCK(&V_divcbinfo); 387 inp = sotoinpcb(so); 388 INP_RLOCK(inp); 389 /* 390 * Don't allow both user specified and setsockopt options, 391 * and don't allow packet length sizes that will crash 392 */ 393 if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || 394 ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { 395 error = EINVAL; 396 INP_RUNLOCK(inp); 397 INP_INFO_WUNLOCK(&V_divcbinfo); 398 m_freem(m); 399 } else { 400 /* Convert fields to host order for ip_output() */ 401 ip->ip_len = ntohs(ip->ip_len); 402 ip->ip_off = ntohs(ip->ip_off); 403 404 /* Send packet to output processing */ 405 IPSTAT_INC(ips_rawout); /* XXX */ 406 407 #ifdef MAC 408 mac_inpcb_create_mbuf(inp, m); 409 #endif 410 /* 411 * Get ready to inject the packet into ip_output(). 412 * Just in case socket options were specified on the 413 * divert socket, we duplicate them. This is done 414 * to avoid having to hold the PCB locks over the call 415 * to ip_output(), as doing this results in a number of 416 * lock ordering complexities. 417 * 418 * Note that we set the multicast options argument for 419 * ip_output() to NULL since it should be invariant that 420 * they are not present. 421 */ 422 KASSERT(inp->inp_moptions == NULL, 423 ("multicast options set on a divert socket")); 424 options = NULL; 425 /* 426 * XXXCSJP: It is unclear to me whether or not it makes 427 * sense for divert sockets to have options. However, 428 * for now we will duplicate them with the INP locks 429 * held so we can use them in ip_output() without 430 * requring a reference to the pcb. 431 */ 432 if (inp->inp_options != NULL) { 433 options = m_dup(inp->inp_options, M_DONTWAIT); 434 if (options == NULL) 435 error = ENOBUFS; 436 } 437 INP_RUNLOCK(inp); 438 INP_INFO_WUNLOCK(&V_divcbinfo); 439 if (error == ENOBUFS) { 440 m_freem(m); 441 return (error); 442 } 443 error = ip_output(m, options, NULL, 444 ((so->so_options & SO_DONTROUTE) ? 445 IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | 446 IP_RAWOUTPUT, NULL, NULL); 447 if (options != NULL) 448 m_freem(options); 449 } 450 } else { 451 dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG; 452 if (m->m_pkthdr.rcvif == NULL) { 453 /* 454 * No luck with the name, check by IP address. 455 * Clear the port and the ifname to make sure 456 * there are no distractions for ifa_ifwithaddr. 457 */ 458 struct ifaddr *ifa; 459 460 bzero(sin->sin_zero, sizeof(sin->sin_zero)); 461 sin->sin_port = 0; 462 ifa = ifa_ifwithaddr((struct sockaddr *) sin); 463 if (ifa == NULL) { 464 error = EADDRNOTAVAIL; 465 goto cantsend; 466 } 467 m->m_pkthdr.rcvif = ifa->ifa_ifp; 468 } 469 #ifdef MAC 470 SOCK_LOCK(so); 471 mac_socket_create_mbuf(so, m); 472 SOCK_UNLOCK(so); 473 #endif 474 /* Send packet to input processing via netisr */ 475 netisr_queue(NETISR_IP, m); 476 } 477 478 return error; 479 480 cantsend: 481 m_freem(m); 482 return error; 483 } 484 485 static int 486 div_attach(struct socket *so, int proto, struct thread *td) 487 { 488 INIT_VNET_INET(so->so_vnet); 489 struct inpcb *inp; 490 int error; 491 492 inp = sotoinpcb(so); 493 KASSERT(inp == NULL, ("div_attach: inp != NULL")); 494 if (td != NULL) { 495 error = priv_check(td, PRIV_NETINET_DIVERT); 496 if (error) 497 return (error); 498 } 499 error = soreserve(so, div_sendspace, div_recvspace); 500 if (error) 501 return error; 502 INP_INFO_WLOCK(&V_divcbinfo); 503 error = in_pcballoc(so, &V_divcbinfo); 504 if (error) { 505 INP_INFO_WUNLOCK(&V_divcbinfo); 506 return error; 507 } 508 inp = (struct inpcb *)so->so_pcb; 509 INP_INFO_WUNLOCK(&V_divcbinfo); 510 inp->inp_ip_p = proto; 511 inp->inp_vflag |= INP_IPV4; 512 inp->inp_flags |= INP_HDRINCL; 513 INP_WUNLOCK(inp); 514 return 0; 515 } 516 517 static void 518 div_detach(struct socket *so) 519 { 520 INIT_VNET_INET(so->so_vnet); 521 struct inpcb *inp; 522 523 inp = sotoinpcb(so); 524 KASSERT(inp != NULL, ("div_detach: inp == NULL")); 525 INP_INFO_WLOCK(&V_divcbinfo); 526 INP_WLOCK(inp); 527 in_pcbdetach(inp); 528 in_pcbfree(inp); 529 INP_INFO_WUNLOCK(&V_divcbinfo); 530 } 531 532 static int 533 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 534 { 535 INIT_VNET_INET(so->so_vnet); 536 struct inpcb *inp; 537 int error; 538 539 inp = sotoinpcb(so); 540 KASSERT(inp != NULL, ("div_bind: inp == NULL")); 541 /* in_pcbbind assumes that nam is a sockaddr_in 542 * and in_pcbbind requires a valid address. Since divert 543 * sockets don't we need to make sure the address is 544 * filled in properly. 545 * XXX -- divert should not be abusing in_pcbind 546 * and should probably have its own family. 547 */ 548 if (nam->sa_family != AF_INET) 549 return EAFNOSUPPORT; 550 ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; 551 INP_INFO_WLOCK(&V_divcbinfo); 552 INP_WLOCK(inp); 553 error = in_pcbbind(inp, nam, td->td_ucred); 554 INP_WUNLOCK(inp); 555 INP_INFO_WUNLOCK(&V_divcbinfo); 556 return error; 557 } 558 559 static int 560 div_shutdown(struct socket *so) 561 { 562 struct inpcb *inp; 563 564 inp = sotoinpcb(so); 565 KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); 566 INP_WLOCK(inp); 567 socantsendmore(so); 568 INP_WUNLOCK(inp); 569 return 0; 570 } 571 572 static int 573 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 574 struct mbuf *control, struct thread *td) 575 { 576 INIT_VNET_INET(so->so_vnet); 577 578 /* Packet must have a header (but that's about it) */ 579 if (m->m_len < sizeof (struct ip) && 580 (m = m_pullup(m, sizeof (struct ip))) == 0) { 581 IPSTAT_INC(ips_toosmall); 582 m_freem(m); 583 return EINVAL; 584 } 585 586 /* Send packet */ 587 return div_output(so, m, (struct sockaddr_in *)nam, control); 588 } 589 590 void 591 div_ctlinput(int cmd, struct sockaddr *sa, void *vip) 592 { 593 struct in_addr faddr; 594 595 faddr = ((struct sockaddr_in *)sa)->sin_addr; 596 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 597 return; 598 if (PRC_IS_REDIRECT(cmd)) 599 return; 600 } 601 602 static int 603 div_pcblist(SYSCTL_HANDLER_ARGS) 604 { 605 INIT_VNET_INET(curvnet); 606 int error, i, n; 607 struct inpcb *inp, **inp_list; 608 inp_gen_t gencnt; 609 struct xinpgen xig; 610 611 /* 612 * The process of preparing the TCB list is too time-consuming and 613 * resource-intensive to repeat twice on every request. 614 */ 615 if (req->oldptr == 0) { 616 n = V_divcbinfo.ipi_count; 617 req->oldidx = 2 * (sizeof xig) 618 + (n + n/8) * sizeof(struct xinpcb); 619 return 0; 620 } 621 622 if (req->newptr != 0) 623 return EPERM; 624 625 /* 626 * OK, now we're committed to doing something. 627 */ 628 INP_INFO_RLOCK(&V_divcbinfo); 629 gencnt = V_divcbinfo.ipi_gencnt; 630 n = V_divcbinfo.ipi_count; 631 INP_INFO_RUNLOCK(&V_divcbinfo); 632 633 error = sysctl_wire_old_buffer(req, 634 2 * sizeof(xig) + n*sizeof(struct xinpcb)); 635 if (error != 0) 636 return (error); 637 638 xig.xig_len = sizeof xig; 639 xig.xig_count = n; 640 xig.xig_gen = gencnt; 641 xig.xig_sogen = so_gencnt; 642 error = SYSCTL_OUT(req, &xig, sizeof xig); 643 if (error) 644 return error; 645 646 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 647 if (inp_list == 0) 648 return ENOMEM; 649 650 INP_INFO_RLOCK(&V_divcbinfo); 651 for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; 652 inp = LIST_NEXT(inp, inp_list)) { 653 INP_RLOCK(inp); 654 if (inp->inp_gencnt <= gencnt && 655 cr_canseeinpcb(req->td->td_ucred, inp) == 0) 656 inp_list[i++] = inp; 657 INP_RUNLOCK(inp); 658 } 659 INP_INFO_RUNLOCK(&V_divcbinfo); 660 n = i; 661 662 error = 0; 663 for (i = 0; i < n; i++) { 664 inp = inp_list[i]; 665 INP_RLOCK(inp); 666 if (inp->inp_gencnt <= gencnt) { 667 struct xinpcb xi; 668 bzero(&xi, sizeof(xi)); 669 xi.xi_len = sizeof xi; 670 /* XXX should avoid extra copy */ 671 bcopy(inp, &xi.xi_inp, sizeof *inp); 672 if (inp->inp_socket) 673 sotoxsocket(inp->inp_socket, &xi.xi_socket); 674 INP_RUNLOCK(inp); 675 error = SYSCTL_OUT(req, &xi, sizeof xi); 676 } else 677 INP_RUNLOCK(inp); 678 } 679 if (!error) { 680 /* 681 * Give the user an updated idea of our state. 682 * If the generation differs from what we told 683 * her before, she knows that something happened 684 * while we were processing this request, and it 685 * might be necessary to retry. 686 */ 687 INP_INFO_RLOCK(&V_divcbinfo); 688 xig.xig_gen = V_divcbinfo.ipi_gencnt; 689 xig.xig_sogen = so_gencnt; 690 xig.xig_count = V_divcbinfo.ipi_count; 691 INP_INFO_RUNLOCK(&V_divcbinfo); 692 error = SYSCTL_OUT(req, &xig, sizeof xig); 693 } 694 free(inp_list, M_TEMP); 695 return error; 696 } 697 698 #ifdef SYSCTL_NODE 699 SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); 700 SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, 701 div_pcblist, "S,xinpcb", "List of active divert sockets"); 702 #endif 703 704 struct pr_usrreqs div_usrreqs = { 705 .pru_attach = div_attach, 706 .pru_bind = div_bind, 707 .pru_control = in_control, 708 .pru_detach = div_detach, 709 .pru_peeraddr = in_getpeeraddr, 710 .pru_send = div_send, 711 .pru_shutdown = div_shutdown, 712 .pru_sockaddr = in_getsockaddr, 713 .pru_sosetlabel = in_pcbsosetlabel 714 }; 715 716 struct protosw div_protosw = { 717 .pr_type = SOCK_RAW, 718 .pr_protocol = IPPROTO_DIVERT, 719 .pr_flags = PR_ATOMIC|PR_ADDR, 720 .pr_input = div_input, 721 .pr_ctlinput = div_ctlinput, 722 .pr_ctloutput = ip_ctloutput, 723 .pr_init = div_init, 724 .pr_usrreqs = &div_usrreqs 725 }; 726 727 static int 728 div_modevent(module_t mod, int type, void *unused) 729 { 730 INIT_VNET_INET(curvnet); /* XXX move to iattach - revisit!!! */ 731 int err = 0; 732 int n; 733 734 switch (type) { 735 case MOD_LOAD: 736 /* 737 * Protocol will be initialized by pf_proto_register(). 738 * We don't have to register ip_protox because we are not 739 * a true IP protocol that goes over the wire. 740 */ 741 err = pf_proto_register(PF_INET, &div_protosw); 742 ip_divert_ptr = divert_packet; 743 break; 744 case MOD_QUIESCE: 745 /* 746 * IPDIVERT may normally not be unloaded because of the 747 * potential race conditions. Tell kldunload we can't be 748 * unloaded unless the unload is forced. 749 */ 750 err = EPERM; 751 break; 752 case MOD_UNLOAD: 753 /* 754 * Forced unload. 755 * 756 * Module ipdivert can only be unloaded if no sockets are 757 * connected. Maybe this can be changed later to forcefully 758 * disconnect any open sockets. 759 * 760 * XXXRW: Note that there is a slight race here, as a new 761 * socket open request could be spinning on the lock and then 762 * we destroy the lock. 763 */ 764 INP_INFO_WLOCK(&V_divcbinfo); 765 n = V_divcbinfo.ipi_count; 766 if (n != 0) { 767 err = EBUSY; 768 INP_INFO_WUNLOCK(&V_divcbinfo); 769 break; 770 } 771 ip_divert_ptr = NULL; 772 err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); 773 INP_INFO_WUNLOCK(&V_divcbinfo); 774 INP_INFO_LOCK_DESTROY(&V_divcbinfo); 775 uma_zdestroy(V_divcbinfo.ipi_zone); 776 break; 777 default: 778 err = EOPNOTSUPP; 779 break; 780 } 781 return err; 782 } 783 784 static moduledata_t ipdivertmod = { 785 "ipdivert", 786 div_modevent, 787 0 788 }; 789 790 DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); 791 MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); 792 MODULE_VERSION(ipdivert, 1); 793