1 /* $OpenBSD: ip_output.c,v 1.390 2023/07/07 08:05:02 bluhm Exp $ */ 2 /* $NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 33 */ 34 35 #include "pf.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/mbuf.h> 40 #include <sys/protosw.h> 41 #include <sys/socket.h> 42 #include <sys/socketvar.h> 43 #include <sys/proc.h> 44 #include <sys/kernel.h> 45 46 #include <net/if.h> 47 #include <net/if_var.h> 48 #include <net/if_enc.h> 49 #include <net/route.h> 50 51 #include <netinet/in.h> 52 #include <netinet/ip.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/in_var.h> 55 #include <netinet/ip_var.h> 56 #include <netinet/ip_icmp.h> 57 #include <netinet/tcp.h> 58 #include <netinet/udp.h> 59 #include <netinet/tcp_timer.h> 60 #include <netinet/tcp_var.h> 61 #include <netinet/udp_var.h> 62 63 #if NPF > 0 64 #include <net/pfvar.h> 65 #endif 66 67 #ifdef IPSEC 68 #ifdef ENCDEBUG 69 #define DPRINTF(fmt, args...) \ 70 do { \ 71 if (encdebug) \ 72 printf("%s: " fmt "\n", __func__, ## args); \ 73 } while (0) 74 #else 75 #define DPRINTF(fmt, args...) \ 76 do { } while (0) 77 #endif 78 #endif /* IPSEC */ 79 80 int ip_pcbopts(struct mbuf **, struct mbuf *); 81 int ip_multicast_if(struct ip_mreqn *, u_int, unsigned int *); 82 int ip_setmoptions(int, struct ip_moptions **, struct mbuf *, u_int); 83 void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *); 84 static u_int16_t in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t); 85 void in_delayed_cksum(struct mbuf *); 86 87 int ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp, 88 struct tdb **, int ipsecflowinfo); 89 void ip_output_ipsec_pmtu_update(struct tdb *, struct route *, struct in_addr, 90 int, int); 91 int ip_output_ipsec_send(struct tdb *, struct mbuf *, struct route *, int); 92 93 /* 94 * IP output. The packet in mbuf chain m contains a skeletal IP 95 * header (with len, off, ttl, proto, tos, src, dst). 96 * The mbuf chain containing the packet will be freed. 97 * The mbuf opt, if present, will not be freed. 98 */ 99 int 100 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, 101 struct ip_moptions *imo, struct inpcb *inp, u_int32_t ipsecflowinfo) 102 { 103 struct ip *ip; 104 struct ifnet *ifp = NULL; 105 struct mbuf_list ml; 106 int hlen = sizeof (struct ip); 107 int error = 0; 108 struct route iproute; 109 struct sockaddr_in *dst; 110 struct tdb *tdb = NULL; 111 u_long mtu; 112 #if NPF > 0 113 u_int orig_rtableid; 114 #endif 115 116 NET_ASSERT_LOCKED(); 117 118 #ifdef IPSEC 119 if (inp && (inp->inp_flags & INP_IPV6) != 0) 120 panic("ip_output: IPv6 pcb is passed"); 121 #endif /* IPSEC */ 122 123 #ifdef DIAGNOSTIC 124 if ((m->m_flags & M_PKTHDR) == 0) 125 panic("ip_output no HDR"); 126 #endif 127 if (opt) 128 m = ip_insertoptions(m, opt, &hlen); 129 130 ip = mtod(m, struct ip *); 131 132 /* 133 * Fill in IP header. 134 */ 135 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 136 ip->ip_v = IPVERSION; 137 ip->ip_off &= htons(IP_DF); 138 ip->ip_id = htons(ip_randomid()); 139 ip->ip_hl = hlen >> 2; 140 ipstat_inc(ips_localout); 141 } else { 142 hlen = ip->ip_hl << 2; 143 } 144 145 /* 146 * We should not send traffic to 0/8 say both Stevens and RFCs 147 * 5735 section 3 and 1122 sections 3.2.1.3 and 3.3.6. 148 */ 149 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == 0) { 150 error = ENETUNREACH; 151 goto bad; 152 } 153 154 #if NPF > 0 155 orig_rtableid = m->m_pkthdr.ph_rtableid; 156 reroute: 157 #endif 158 159 /* 160 * Do a route lookup now in case we need the source address to 161 * do an SPD lookup in IPsec; for most packets, the source address 162 * is set at a higher level protocol. ICMPs and other packets 163 * though (e.g., traceroute) have a source address of zeroes. 164 */ 165 if (ro == NULL) { 166 ro = &iproute; 167 memset(ro, 0, sizeof(*ro)); 168 } 169 170 dst = satosin(&ro->ro_dst); 171 172 /* 173 * If there is a cached route, check that it is to the same 174 * destination and is still up. If not, free it and try again. 175 */ 176 if (!rtisvalid(ro->ro_rt) || 177 dst->sin_addr.s_addr != ip->ip_dst.s_addr || 178 ro->ro_tableid != m->m_pkthdr.ph_rtableid) { 179 rtfree(ro->ro_rt); 180 ro->ro_rt = NULL; 181 } 182 183 if (ro->ro_rt == NULL) { 184 dst->sin_family = AF_INET; 185 dst->sin_len = sizeof(*dst); 186 dst->sin_addr = ip->ip_dst; 187 ro->ro_tableid = m->m_pkthdr.ph_rtableid; 188 } 189 190 if ((IN_MULTICAST(ip->ip_dst.s_addr) || 191 (ip->ip_dst.s_addr == INADDR_BROADCAST)) && 192 imo != NULL && (ifp = if_get(imo->imo_ifidx)) != NULL) { 193 194 mtu = ifp->if_mtu; 195 if (ip->ip_src.s_addr == INADDR_ANY) { 196 struct in_ifaddr *ia; 197 198 IFP_TO_IA(ifp, ia); 199 if (ia != NULL) 200 ip->ip_src = ia->ia_addr.sin_addr; 201 } 202 } else { 203 struct in_ifaddr *ia; 204 205 if (ro->ro_rt == NULL) 206 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, 207 &ip->ip_src.s_addr, ro->ro_tableid); 208 209 if (ro->ro_rt == NULL) { 210 ipstat_inc(ips_noroute); 211 error = EHOSTUNREACH; 212 goto bad; 213 } 214 215 ia = ifatoia(ro->ro_rt->rt_ifa); 216 if (ISSET(ro->ro_rt->rt_flags, RTF_LOCAL)) 217 ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid)); 218 else 219 ifp = if_get(ro->ro_rt->rt_ifidx); 220 /* 221 * We aren't using rtisvalid() here because the UP/DOWN state 222 * machine is broken with some Ethernet drivers like em(4). 223 * As a result we might try to use an invalid cached route 224 * entry while an interface is being detached. 225 */ 226 if (ifp == NULL) { 227 ipstat_inc(ips_noroute); 228 error = EHOSTUNREACH; 229 goto bad; 230 } 231 if ((mtu = ro->ro_rt->rt_mtu) == 0) 232 mtu = ifp->if_mtu; 233 234 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 235 dst = satosin(ro->ro_rt->rt_gateway); 236 237 /* Set the source IP address */ 238 if (ip->ip_src.s_addr == INADDR_ANY && ia) 239 ip->ip_src = ia->ia_addr.sin_addr; 240 } 241 242 #ifdef IPSEC 243 if (ipsec_in_use || inp != NULL) { 244 /* Do we have any pending SAs to apply ? */ 245 error = ip_output_ipsec_lookup(m, hlen, inp, &tdb, 246 ipsecflowinfo); 247 if (error) { 248 /* Should silently drop packet */ 249 if (error == -EINVAL) 250 error = 0; 251 goto bad; 252 } 253 if (tdb != NULL) { 254 /* 255 * If it needs TCP/UDP hardware-checksumming, do the 256 * computation now. 257 */ 258 in_proto_cksum_out(m, NULL); 259 } 260 } 261 #endif /* IPSEC */ 262 263 if (IN_MULTICAST(ip->ip_dst.s_addr) || 264 (ip->ip_dst.s_addr == INADDR_BROADCAST)) { 265 266 m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ? 267 M_BCAST : M_MCAST; 268 269 /* 270 * IP destination address is multicast. Make sure "dst" 271 * still points to the address in "ro". (It may have been 272 * changed to point to a gateway address, above.) 273 */ 274 dst = satosin(&ro->ro_dst); 275 276 /* 277 * See if the caller provided any multicast options 278 */ 279 if (imo != NULL) 280 ip->ip_ttl = imo->imo_ttl; 281 else 282 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 283 284 /* 285 * if we don't know the outgoing ifp yet, we can't generate 286 * output 287 */ 288 if (!ifp) { 289 ipstat_inc(ips_noroute); 290 error = EHOSTUNREACH; 291 goto bad; 292 } 293 294 /* 295 * Confirm that the outgoing interface supports multicast, 296 * but only if the packet actually is going out on that 297 * interface (i.e., no IPsec is applied). 298 */ 299 if ((((m->m_flags & M_MCAST) && 300 (ifp->if_flags & IFF_MULTICAST) == 0) || 301 ((m->m_flags & M_BCAST) && 302 (ifp->if_flags & IFF_BROADCAST) == 0)) && (tdb == NULL)) { 303 ipstat_inc(ips_noroute); 304 error = ENETUNREACH; 305 goto bad; 306 } 307 308 /* 309 * If source address not specified yet, use address 310 * of outgoing interface. 311 */ 312 if (ip->ip_src.s_addr == INADDR_ANY) { 313 struct in_ifaddr *ia; 314 315 IFP_TO_IA(ifp, ia); 316 if (ia != NULL) 317 ip->ip_src = ia->ia_addr.sin_addr; 318 } 319 320 if ((imo == NULL || imo->imo_loop) && 321 in_hasmulti(&ip->ip_dst, ifp)) { 322 /* 323 * If we belong to the destination multicast group 324 * on the outgoing interface, and the caller did not 325 * forbid loopback, loop back a copy. 326 * Can't defer TCP/UDP checksumming, do the 327 * computation now. 328 */ 329 in_proto_cksum_out(m, NULL); 330 ip_mloopback(ifp, m, dst); 331 } 332 #ifdef MROUTING 333 else { 334 /* 335 * If we are acting as a multicast router, perform 336 * multicast forwarding as if the packet had just 337 * arrived on the interface to which we are about 338 * to send. The multicast forwarding function 339 * recursively calls this function, using the 340 * IP_FORWARDING flag to prevent infinite recursion. 341 * 342 * Multicasts that are looped back by ip_mloopback(), 343 * above, will be forwarded by the ip_input() routine, 344 * if necessary. 345 */ 346 if (ipmforwarding && ip_mrouter[ifp->if_rdomain] && 347 (flags & IP_FORWARDING) == 0) { 348 int rv; 349 350 KERNEL_LOCK(); 351 rv = ip_mforward(m, ifp); 352 KERNEL_UNLOCK(); 353 if (rv != 0) 354 goto bad; 355 } 356 } 357 #endif 358 /* 359 * Multicasts with a time-to-live of zero may be looped- 360 * back, above, but must not be transmitted on a network. 361 * Also, multicasts addressed to the loopback interface 362 * are not sent -- the above call to ip_mloopback() will 363 * loop back a copy if this host actually belongs to the 364 * destination group on the loopback interface. 365 */ 366 if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) 367 goto bad; 368 369 goto sendit; 370 } 371 372 /* 373 * Look for broadcast address and verify user is allowed to send 374 * such a packet; if the packet is going in an IPsec tunnel, skip 375 * this check. 376 */ 377 if ((tdb == NULL) && ((dst->sin_addr.s_addr == INADDR_BROADCAST) || 378 (ro && ro->ro_rt && ISSET(ro->ro_rt->rt_flags, RTF_BROADCAST)))) { 379 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 380 error = EADDRNOTAVAIL; 381 goto bad; 382 } 383 if ((flags & IP_ALLOWBROADCAST) == 0) { 384 error = EACCES; 385 goto bad; 386 } 387 388 /* Don't allow broadcast messages to be fragmented */ 389 if (ntohs(ip->ip_len) > ifp->if_mtu) { 390 error = EMSGSIZE; 391 goto bad; 392 } 393 m->m_flags |= M_BCAST; 394 } else 395 m->m_flags &= ~M_BCAST; 396 397 sendit: 398 /* 399 * If we're doing Path MTU discovery, we need to set DF unless 400 * the route's MTU is locked. 401 */ 402 if ((flags & IP_MTUDISC) && ro && ro->ro_rt && 403 (ro->ro_rt->rt_locks & RTV_MTU) == 0) 404 ip->ip_off |= htons(IP_DF); 405 406 #ifdef IPSEC 407 /* 408 * Check if the packet needs encapsulation. 409 */ 410 if (tdb != NULL) { 411 /* Callee frees mbuf */ 412 error = ip_output_ipsec_send(tdb, m, ro, 413 (flags & IP_FORWARDING) ? 1 : 0); 414 goto done; 415 } 416 #endif /* IPSEC */ 417 418 /* 419 * Packet filter 420 */ 421 #if NPF > 0 422 if (pf_test(AF_INET, (flags & IP_FORWARDING) ? PF_FWD : PF_OUT, 423 ifp, &m) != PF_PASS) { 424 error = EACCES; 425 goto bad; 426 } 427 if (m == NULL) 428 goto done; 429 ip = mtod(m, struct ip *); 430 hlen = ip->ip_hl << 2; 431 if ((m->m_pkthdr.pf.flags & (PF_TAG_REROUTE | PF_TAG_GENERATED)) == 432 (PF_TAG_REROUTE | PF_TAG_GENERATED)) 433 /* already rerun the route lookup, go on */ 434 m->m_pkthdr.pf.flags &= ~(PF_TAG_GENERATED | PF_TAG_REROUTE); 435 else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) { 436 /* tag as generated to skip over pf_test on rerun */ 437 m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; 438 ro = NULL; 439 if_put(ifp); /* drop reference since target changed */ 440 ifp = NULL; 441 goto reroute; 442 } 443 #endif 444 445 #ifdef IPSEC 446 if (ipsec_in_use && (flags & IP_FORWARDING) && (ipforwarding == 2) && 447 (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) == NULL)) { 448 error = EHOSTUNREACH; 449 goto bad; 450 } 451 #endif 452 453 /* 454 * If TSO or small enough for interface, can just send directly. 455 */ 456 error = if_output_tso(ifp, &m, sintosa(dst), ro->ro_rt, mtu); 457 if (error || m == NULL) 458 goto done; 459 460 /* 461 * Too large for interface; fragment if possible. 462 * Must be able to put at least 8 bytes per fragment. 463 */ 464 if (ip->ip_off & htons(IP_DF)) { 465 #ifdef IPSEC 466 if (ip_mtudisc) 467 ipsec_adjust_mtu(m, ifp->if_mtu); 468 #endif 469 error = EMSGSIZE; 470 #if NPF > 0 471 /* pf changed routing table, use orig rtable for path MTU */ 472 if (ro->ro_tableid != orig_rtableid) { 473 rtfree(ro->ro_rt); 474 ro->ro_tableid = orig_rtableid; 475 ro->ro_rt = icmp_mtudisc_clone( 476 satosin(&ro->ro_dst)->sin_addr, ro->ro_tableid, 0); 477 } 478 #endif 479 /* 480 * This case can happen if the user changed the MTU 481 * of an interface after enabling IP on it. Because 482 * most netifs don't keep track of routes pointing to 483 * them, there is no way for one to update all its 484 * routes when the MTU is changed. 485 */ 486 if (rtisvalid(ro->ro_rt) && 487 ISSET(ro->ro_rt->rt_flags, RTF_HOST) && 488 !(ro->ro_rt->rt_locks & RTV_MTU) && 489 (ro->ro_rt->rt_mtu > ifp->if_mtu)) { 490 ro->ro_rt->rt_mtu = ifp->if_mtu; 491 } 492 ipstat_inc(ips_cantfrag); 493 goto bad; 494 } 495 496 if ((error = ip_fragment(m, &ml, ifp, mtu)) || 497 (error = if_output_ml(ifp, &ml, sintosa(dst), ro->ro_rt))) 498 goto done; 499 ipstat_inc(ips_fragmented); 500 501 done: 502 if (ro == &iproute && ro->ro_rt) 503 rtfree(ro->ro_rt); 504 if_put(ifp); 505 #ifdef IPSEC 506 tdb_unref(tdb); 507 #endif /* IPSEC */ 508 return (error); 509 510 bad: 511 m_freem(m); 512 goto done; 513 } 514 515 #ifdef IPSEC 516 int 517 ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp, 518 struct tdb **tdbout, int ipsecflowinfo) 519 { 520 struct m_tag *mtag; 521 struct tdb_ident *tdbi; 522 struct tdb *tdb; 523 struct ipsec_ids *ids = NULL; 524 int error; 525 526 /* Do we have any pending SAs to apply ? */ 527 if (ipsecflowinfo) 528 ids = ipsp_ids_lookup(ipsecflowinfo); 529 error = ipsp_spd_lookup(m, AF_INET, hlen, IPSP_DIRECTION_OUT, 530 NULL, inp, &tdb, ids); 531 ipsp_ids_free(ids); 532 if (error || tdb == NULL) { 533 *tdbout = NULL; 534 return error; 535 } 536 /* Loop detection */ 537 for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) { 538 if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE) 539 continue; 540 tdbi = (struct tdb_ident *)(mtag + 1); 541 if (tdbi->spi == tdb->tdb_spi && 542 tdbi->proto == tdb->tdb_sproto && 543 tdbi->rdomain == tdb->tdb_rdomain && 544 !memcmp(&tdbi->dst, &tdb->tdb_dst, 545 sizeof(union sockaddr_union))) { 546 /* no IPsec needed */ 547 tdb_unref(tdb); 548 *tdbout = NULL; 549 return 0; 550 } 551 } 552 *tdbout = tdb; 553 return 0; 554 } 555 556 void 557 ip_output_ipsec_pmtu_update(struct tdb *tdb, struct route *ro, 558 struct in_addr dst, int rtableid, int transportmode) 559 { 560 struct rtentry *rt = NULL; 561 int rt_mtucloned = 0; 562 563 /* Find a host route to store the mtu in */ 564 if (ro != NULL) 565 rt = ro->ro_rt; 566 /* but don't add a PMTU route for transport mode SAs */ 567 if (transportmode) 568 rt = NULL; 569 else if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) { 570 rt = icmp_mtudisc_clone(dst, rtableid, 1); 571 rt_mtucloned = 1; 572 } 573 DPRINTF("spi %08x mtu %d rt %p cloned %d", 574 ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned); 575 if (rt != NULL) { 576 rt->rt_mtu = tdb->tdb_mtu; 577 if (ro != NULL && ro->ro_rt != NULL) { 578 rtfree(ro->ro_rt); 579 ro->ro_rt = rtalloc(&ro->ro_dst, RT_RESOLVE, rtableid); 580 } 581 if (rt_mtucloned) 582 rtfree(rt); 583 } 584 } 585 586 int 587 ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int fwd) 588 { 589 struct mbuf_list ml; 590 struct ifnet *encif = NULL; 591 struct ip *ip; 592 struct in_addr dst; 593 u_int len; 594 int error, rtableid, tso = 0; 595 596 #if NPF > 0 597 /* 598 * Packet filter 599 */ 600 if ((encif = enc_getif(tdb->tdb_rdomain, tdb->tdb_tap)) == NULL || 601 pf_test(AF_INET, fwd ? PF_FWD : PF_OUT, encif, &m) != PF_PASS) { 602 m_freem(m); 603 return EACCES; 604 } 605 if (m == NULL) 606 return 0; 607 /* 608 * PF_TAG_REROUTE handling or not... 609 * Packet is entering IPsec so the routing is 610 * already overruled by the IPsec policy. 611 * Until now the change was not reconsidered. 612 * What's the behaviour? 613 */ 614 #endif 615 616 /* Check if we can chop the TCP packet */ 617 ip = mtod(m, struct ip *); 618 if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && 619 m->m_pkthdr.ph_mss <= tdb->tdb_mtu) { 620 tso = 1; 621 len = m->m_pkthdr.ph_mss; 622 } else 623 len = ntohs(ip->ip_len); 624 625 /* Check if we are allowed to fragment */ 626 dst = ip->ip_dst; 627 rtableid = m->m_pkthdr.ph_rtableid; 628 if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu && 629 len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { 630 int transportmode; 631 632 transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) && 633 (tdb->tdb_dst.sin.sin_addr.s_addr == dst.s_addr); 634 ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 635 transportmode); 636 ipsec_adjust_mtu(m, tdb->tdb_mtu); 637 m_freem(m); 638 return EMSGSIZE; 639 } 640 /* propagate IP_DF for v4-over-v6 */ 641 if (ip_mtudisc && ip->ip_off & htons(IP_DF)) 642 SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); 643 644 /* 645 * Clear these -- they'll be set in the recursive invocation 646 * as needed. 647 */ 648 m->m_flags &= ~(M_MCAST | M_BCAST); 649 650 if (tso) { 651 error = tcp_chopper(m, &ml, encif, len); 652 if (error) 653 goto done; 654 } else { 655 CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); 656 in_proto_cksum_out(m, encif); 657 ml_init(&ml); 658 ml_enqueue(&ml, m); 659 } 660 661 KERNEL_LOCK(); 662 while ((m = ml_dequeue(&ml)) != NULL) { 663 /* Callee frees mbuf */ 664 error = ipsp_process_packet(m, tdb, AF_INET, 0); 665 if (error) 666 break; 667 } 668 KERNEL_UNLOCK(); 669 done: 670 if (error) { 671 ml_purge(&ml); 672 ipsecstat_inc(ipsec_odrops); 673 tdbstat_inc(tdb, tdb_odrops); 674 } 675 if (!error && tso) 676 tcpstat_inc(tcps_outswtso); 677 if (ip_mtudisc && error == EMSGSIZE) 678 ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 0); 679 return error; 680 } 681 #endif /* IPSEC */ 682 683 int 684 ip_fragment(struct mbuf *m0, struct mbuf_list *ml, struct ifnet *ifp, 685 u_long mtu) 686 { 687 struct ip *ip; 688 int firstlen, hlen, tlen, len, off; 689 int error; 690 691 ml_init(ml); 692 ml_enqueue(ml, m0); 693 694 ip = mtod(m0, struct ip *); 695 hlen = ip->ip_hl << 2; 696 tlen = m0->m_pkthdr.len; 697 len = (mtu - hlen) &~ 7; 698 if (len < 8) { 699 error = EMSGSIZE; 700 goto bad; 701 } 702 firstlen = len; 703 704 /* 705 * If we are doing fragmentation, we can't defer TCP/UDP 706 * checksumming; compute the checksum and clear the flag. 707 */ 708 in_proto_cksum_out(m0, NULL); 709 710 /* 711 * Loop through length of payload after first fragment, 712 * make new header and copy data of each part and link onto chain. 713 */ 714 for (off = hlen + firstlen; off < tlen; off += len) { 715 struct mbuf *m; 716 struct ip *mhip; 717 int mhlen; 718 719 MGETHDR(m, M_DONTWAIT, MT_HEADER); 720 if (m == NULL) { 721 error = ENOBUFS; 722 goto bad; 723 } 724 ml_enqueue(ml, m); 725 if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0) 726 goto bad; 727 m->m_data += max_linkhdr; 728 mhip = mtod(m, struct ip *); 729 *mhip = *ip; 730 if (hlen > sizeof(struct ip)) { 731 mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip); 732 mhip->ip_hl = mhlen >> 2; 733 } else 734 mhlen = sizeof(struct ip); 735 m->m_len = mhlen; 736 737 mhip->ip_off = ((off - hlen) >> 3) + 738 (ntohs(ip->ip_off) & ~IP_MF); 739 if (ip->ip_off & htons(IP_MF)) 740 mhip->ip_off |= IP_MF; 741 if (off + len >= tlen) 742 len = tlen - off; 743 else 744 mhip->ip_off |= IP_MF; 745 mhip->ip_off = htons(mhip->ip_off); 746 747 m->m_pkthdr.len = mhlen + len; 748 mhip->ip_len = htons(m->m_pkthdr.len); 749 m->m_next = m_copym(m0, off, len, M_NOWAIT); 750 if (m->m_next == NULL) { 751 error = ENOBUFS; 752 goto bad; 753 } 754 755 in_hdr_cksum_out(m, ifp); 756 } 757 758 /* 759 * Update first fragment by trimming what's been copied out 760 * and updating header, then send each fragment (in order). 761 */ 762 if (hlen + firstlen < tlen) { 763 m_adj(m0, hlen + firstlen - tlen); 764 ip->ip_off |= htons(IP_MF); 765 } 766 ip->ip_len = htons(m0->m_pkthdr.len); 767 768 in_hdr_cksum_out(m0, ifp); 769 770 ipstat_add(ips_ofragments, ml_len(ml)); 771 return (0); 772 773 bad: 774 ipstat_inc(ips_odropped); 775 ml_purge(ml); 776 return (error); 777 } 778 779 /* 780 * Insert IP options into preformed packet. 781 * Adjust IP destination as required for IP source routing, 782 * as indicated by a non-zero in_addr at the start of the options. 783 */ 784 struct mbuf * 785 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 786 { 787 struct ipoption *p = mtod(opt, struct ipoption *); 788 struct mbuf *n; 789 struct ip *ip = mtod(m, struct ip *); 790 unsigned int optlen; 791 792 optlen = opt->m_len - sizeof(p->ipopt_dst); 793 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) 794 return (m); /* XXX should fail */ 795 796 /* check if options will fit to IP header */ 797 if ((optlen + sizeof(struct ip)) > (0x0f << 2)) { 798 *phlen = sizeof(struct ip); 799 return (m); 800 } 801 802 if (p->ipopt_dst.s_addr) 803 ip->ip_dst = p->ipopt_dst; 804 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { 805 MGETHDR(n, M_DONTWAIT, MT_HEADER); 806 if (n == NULL) 807 return (m); 808 M_MOVE_HDR(n, m); 809 n->m_pkthdr.len += optlen; 810 m->m_len -= sizeof(struct ip); 811 m->m_data += sizeof(struct ip); 812 n->m_next = m; 813 m = n; 814 m->m_len = optlen + sizeof(struct ip); 815 m->m_data += max_linkhdr; 816 memcpy(mtod(m, caddr_t), ip, sizeof(struct ip)); 817 } else { 818 m->m_data -= optlen; 819 m->m_len += optlen; 820 m->m_pkthdr.len += optlen; 821 memmove(mtod(m, caddr_t), (caddr_t)ip, sizeof(struct ip)); 822 } 823 ip = mtod(m, struct ip *); 824 memcpy(ip + 1, p->ipopt_list, optlen); 825 *phlen = sizeof(struct ip) + optlen; 826 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 827 return (m); 828 } 829 830 /* 831 * Copy options from ip to jp, 832 * omitting those not copied during fragmentation. 833 */ 834 int 835 ip_optcopy(struct ip *ip, struct ip *jp) 836 { 837 u_char *cp, *dp; 838 int opt, optlen, cnt; 839 840 cp = (u_char *)(ip + 1); 841 dp = (u_char *)(jp + 1); 842 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 843 for (; cnt > 0; cnt -= optlen, cp += optlen) { 844 opt = cp[0]; 845 if (opt == IPOPT_EOL) 846 break; 847 if (opt == IPOPT_NOP) { 848 /* Preserve for IP mcast tunnel's LSRR alignment. */ 849 *dp++ = IPOPT_NOP; 850 optlen = 1; 851 continue; 852 } 853 #ifdef DIAGNOSTIC 854 if (cnt < IPOPT_OLEN + sizeof(*cp)) 855 panic("malformed IPv4 option passed to ip_optcopy"); 856 #endif 857 optlen = cp[IPOPT_OLEN]; 858 #ifdef DIAGNOSTIC 859 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 860 panic("malformed IPv4 option passed to ip_optcopy"); 861 #endif 862 /* bogus lengths should have been caught by ip_dooptions */ 863 if (optlen > cnt) 864 optlen = cnt; 865 if (IPOPT_COPIED(opt)) { 866 memcpy(dp, cp, optlen); 867 dp += optlen; 868 } 869 } 870 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 871 *dp++ = IPOPT_EOL; 872 return (optlen); 873 } 874 875 /* 876 * IP socket option processing. 877 */ 878 int 879 ip_ctloutput(int op, struct socket *so, int level, int optname, 880 struct mbuf *m) 881 { 882 struct inpcb *inp = sotoinpcb(so); 883 int optval = 0; 884 struct proc *p = curproc; /* XXX */ 885 int error = 0; 886 u_int rtableid, rtid = 0; 887 888 if (level != IPPROTO_IP) 889 return (EINVAL); 890 891 rtableid = p->p_p->ps_rtableid; 892 893 switch (op) { 894 case PRCO_SETOPT: 895 switch (optname) { 896 case IP_OPTIONS: 897 return (ip_pcbopts(&inp->inp_options, m)); 898 899 case IP_TOS: 900 case IP_TTL: 901 case IP_MINTTL: 902 case IP_RECVOPTS: 903 case IP_RECVRETOPTS: 904 case IP_RECVDSTADDR: 905 case IP_RECVIF: 906 case IP_RECVTTL: 907 case IP_RECVDSTPORT: 908 case IP_RECVRTABLE: 909 case IP_IPSECFLOWINFO: 910 if (m == NULL || m->m_len != sizeof(int)) 911 error = EINVAL; 912 else { 913 optval = *mtod(m, int *); 914 switch (optname) { 915 916 case IP_TOS: 917 inp->inp_ip.ip_tos = optval; 918 break; 919 920 case IP_TTL: 921 if (optval > 0 && optval <= MAXTTL) 922 inp->inp_ip.ip_ttl = optval; 923 else if (optval == -1) 924 inp->inp_ip.ip_ttl = ip_defttl; 925 else 926 error = EINVAL; 927 break; 928 929 case IP_MINTTL: 930 if (optval >= 0 && optval <= MAXTTL) 931 inp->inp_ip_minttl = optval; 932 else 933 error = EINVAL; 934 break; 935 #define OPTSET(bit) \ 936 if (optval) \ 937 inp->inp_flags |= bit; \ 938 else \ 939 inp->inp_flags &= ~bit; 940 941 case IP_RECVOPTS: 942 OPTSET(INP_RECVOPTS); 943 break; 944 945 case IP_RECVRETOPTS: 946 OPTSET(INP_RECVRETOPTS); 947 break; 948 949 case IP_RECVDSTADDR: 950 OPTSET(INP_RECVDSTADDR); 951 break; 952 case IP_RECVIF: 953 OPTSET(INP_RECVIF); 954 break; 955 case IP_RECVTTL: 956 OPTSET(INP_RECVTTL); 957 break; 958 case IP_RECVDSTPORT: 959 OPTSET(INP_RECVDSTPORT); 960 break; 961 case IP_RECVRTABLE: 962 OPTSET(INP_RECVRTABLE); 963 break; 964 case IP_IPSECFLOWINFO: 965 OPTSET(INP_IPSECFLOWINFO); 966 break; 967 } 968 } 969 break; 970 #undef OPTSET 971 972 case IP_MULTICAST_IF: 973 case IP_MULTICAST_TTL: 974 case IP_MULTICAST_LOOP: 975 case IP_ADD_MEMBERSHIP: 976 case IP_DROP_MEMBERSHIP: 977 error = ip_setmoptions(optname, &inp->inp_moptions, m, 978 inp->inp_rtableid); 979 break; 980 981 case IP_PORTRANGE: 982 if (m == NULL || m->m_len != sizeof(int)) 983 error = EINVAL; 984 else { 985 optval = *mtod(m, int *); 986 987 switch (optval) { 988 989 case IP_PORTRANGE_DEFAULT: 990 inp->inp_flags &= ~(INP_LOWPORT); 991 inp->inp_flags &= ~(INP_HIGHPORT); 992 break; 993 994 case IP_PORTRANGE_HIGH: 995 inp->inp_flags &= ~(INP_LOWPORT); 996 inp->inp_flags |= INP_HIGHPORT; 997 break; 998 999 case IP_PORTRANGE_LOW: 1000 inp->inp_flags &= ~(INP_HIGHPORT); 1001 inp->inp_flags |= INP_LOWPORT; 1002 break; 1003 1004 default: 1005 1006 error = EINVAL; 1007 break; 1008 } 1009 } 1010 break; 1011 case IP_AUTH_LEVEL: 1012 case IP_ESP_TRANS_LEVEL: 1013 case IP_ESP_NETWORK_LEVEL: 1014 case IP_IPCOMP_LEVEL: 1015 #ifndef IPSEC 1016 error = EOPNOTSUPP; 1017 #else 1018 if (m == NULL || m->m_len != sizeof(int)) { 1019 error = EINVAL; 1020 break; 1021 } 1022 optval = *mtod(m, int *); 1023 1024 if (optval < IPSEC_LEVEL_BYPASS || 1025 optval > IPSEC_LEVEL_UNIQUE) { 1026 error = EINVAL; 1027 break; 1028 } 1029 1030 switch (optname) { 1031 case IP_AUTH_LEVEL: 1032 if (optval < IPSEC_AUTH_LEVEL_DEFAULT && 1033 suser(p)) { 1034 error = EACCES; 1035 break; 1036 } 1037 inp->inp_seclevel[SL_AUTH] = optval; 1038 break; 1039 1040 case IP_ESP_TRANS_LEVEL: 1041 if (optval < IPSEC_ESP_TRANS_LEVEL_DEFAULT && 1042 suser(p)) { 1043 error = EACCES; 1044 break; 1045 } 1046 inp->inp_seclevel[SL_ESP_TRANS] = optval; 1047 break; 1048 1049 case IP_ESP_NETWORK_LEVEL: 1050 if (optval < IPSEC_ESP_NETWORK_LEVEL_DEFAULT && 1051 suser(p)) { 1052 error = EACCES; 1053 break; 1054 } 1055 inp->inp_seclevel[SL_ESP_NETWORK] = optval; 1056 break; 1057 case IP_IPCOMP_LEVEL: 1058 if (optval < IPSEC_IPCOMP_LEVEL_DEFAULT && 1059 suser(p)) { 1060 error = EACCES; 1061 break; 1062 } 1063 inp->inp_seclevel[SL_IPCOMP] = optval; 1064 break; 1065 } 1066 #endif 1067 break; 1068 1069 case IP_IPSEC_LOCAL_ID: 1070 case IP_IPSEC_REMOTE_ID: 1071 error = EOPNOTSUPP; 1072 break; 1073 case SO_RTABLE: 1074 if (m == NULL || m->m_len < sizeof(u_int)) { 1075 error = EINVAL; 1076 break; 1077 } 1078 rtid = *mtod(m, u_int *); 1079 if (inp->inp_rtableid == rtid) 1080 break; 1081 /* needs privileges to switch when already set */ 1082 if (rtableid != rtid && rtableid != 0 && 1083 (error = suser(p)) != 0) 1084 break; 1085 /* table must exist */ 1086 if (!rtable_exists(rtid)) { 1087 error = EINVAL; 1088 break; 1089 } 1090 if (inp->inp_lport) { 1091 error = EBUSY; 1092 break; 1093 } 1094 inp->inp_rtableid = rtid; 1095 in_pcbrehash(inp); 1096 break; 1097 case IP_PIPEX: 1098 if (m != NULL && m->m_len == sizeof(int)) 1099 inp->inp_pipex = *mtod(m, int *); 1100 else 1101 error = EINVAL; 1102 break; 1103 1104 default: 1105 error = ENOPROTOOPT; 1106 break; 1107 } 1108 break; 1109 1110 case PRCO_GETOPT: 1111 switch (optname) { 1112 case IP_OPTIONS: 1113 case IP_RETOPTS: 1114 if (inp->inp_options) { 1115 m->m_len = inp->inp_options->m_len; 1116 memcpy(mtod(m, caddr_t), 1117 mtod(inp->inp_options, caddr_t), m->m_len); 1118 } else 1119 m->m_len = 0; 1120 break; 1121 1122 case IP_TOS: 1123 case IP_TTL: 1124 case IP_MINTTL: 1125 case IP_RECVOPTS: 1126 case IP_RECVRETOPTS: 1127 case IP_RECVDSTADDR: 1128 case IP_RECVIF: 1129 case IP_RECVTTL: 1130 case IP_RECVDSTPORT: 1131 case IP_RECVRTABLE: 1132 case IP_IPSECFLOWINFO: 1133 case IP_IPDEFTTL: 1134 m->m_len = sizeof(int); 1135 switch (optname) { 1136 1137 case IP_TOS: 1138 optval = inp->inp_ip.ip_tos; 1139 break; 1140 1141 case IP_TTL: 1142 optval = inp->inp_ip.ip_ttl; 1143 break; 1144 1145 case IP_MINTTL: 1146 optval = inp->inp_ip_minttl; 1147 break; 1148 1149 case IP_IPDEFTTL: 1150 optval = ip_defttl; 1151 break; 1152 1153 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1154 1155 case IP_RECVOPTS: 1156 optval = OPTBIT(INP_RECVOPTS); 1157 break; 1158 1159 case IP_RECVRETOPTS: 1160 optval = OPTBIT(INP_RECVRETOPTS); 1161 break; 1162 1163 case IP_RECVDSTADDR: 1164 optval = OPTBIT(INP_RECVDSTADDR); 1165 break; 1166 case IP_RECVIF: 1167 optval = OPTBIT(INP_RECVIF); 1168 break; 1169 case IP_RECVTTL: 1170 optval = OPTBIT(INP_RECVTTL); 1171 break; 1172 case IP_RECVDSTPORT: 1173 optval = OPTBIT(INP_RECVDSTPORT); 1174 break; 1175 case IP_RECVRTABLE: 1176 optval = OPTBIT(INP_RECVRTABLE); 1177 break; 1178 case IP_IPSECFLOWINFO: 1179 optval = OPTBIT(INP_IPSECFLOWINFO); 1180 break; 1181 } 1182 *mtod(m, int *) = optval; 1183 break; 1184 1185 case IP_MULTICAST_IF: 1186 case IP_MULTICAST_TTL: 1187 case IP_MULTICAST_LOOP: 1188 case IP_ADD_MEMBERSHIP: 1189 case IP_DROP_MEMBERSHIP: 1190 error = ip_getmoptions(optname, inp->inp_moptions, m); 1191 break; 1192 1193 case IP_PORTRANGE: 1194 m->m_len = sizeof(int); 1195 1196 if (inp->inp_flags & INP_HIGHPORT) 1197 optval = IP_PORTRANGE_HIGH; 1198 else if (inp->inp_flags & INP_LOWPORT) 1199 optval = IP_PORTRANGE_LOW; 1200 else 1201 optval = 0; 1202 1203 *mtod(m, int *) = optval; 1204 break; 1205 1206 case IP_AUTH_LEVEL: 1207 case IP_ESP_TRANS_LEVEL: 1208 case IP_ESP_NETWORK_LEVEL: 1209 case IP_IPCOMP_LEVEL: 1210 #ifndef IPSEC 1211 m->m_len = sizeof(int); 1212 *mtod(m, int *) = IPSEC_LEVEL_NONE; 1213 #else 1214 m->m_len = sizeof(int); 1215 switch (optname) { 1216 case IP_AUTH_LEVEL: 1217 optval = inp->inp_seclevel[SL_AUTH]; 1218 break; 1219 1220 case IP_ESP_TRANS_LEVEL: 1221 optval = inp->inp_seclevel[SL_ESP_TRANS]; 1222 break; 1223 1224 case IP_ESP_NETWORK_LEVEL: 1225 optval = inp->inp_seclevel[SL_ESP_NETWORK]; 1226 break; 1227 case IP_IPCOMP_LEVEL: 1228 optval = inp->inp_seclevel[SL_IPCOMP]; 1229 break; 1230 } 1231 *mtod(m, int *) = optval; 1232 #endif 1233 break; 1234 case IP_IPSEC_LOCAL_ID: 1235 case IP_IPSEC_REMOTE_ID: 1236 error = EOPNOTSUPP; 1237 break; 1238 case SO_RTABLE: 1239 m->m_len = sizeof(u_int); 1240 *mtod(m, u_int *) = inp->inp_rtableid; 1241 break; 1242 case IP_PIPEX: 1243 m->m_len = sizeof(int); 1244 *mtod(m, int *) = inp->inp_pipex; 1245 break; 1246 default: 1247 error = ENOPROTOOPT; 1248 break; 1249 } 1250 break; 1251 } 1252 return (error); 1253 } 1254 1255 /* 1256 * Set up IP options in pcb for insertion in output packets. 1257 * Store in mbuf with pointer in pcbopt, adding pseudo-option 1258 * with destination address if source routed. 1259 */ 1260 int 1261 ip_pcbopts(struct mbuf **pcbopt, struct mbuf *m) 1262 { 1263 struct mbuf *n; 1264 struct ipoption *p; 1265 int cnt, off, optlen; 1266 u_char *cp; 1267 u_char opt; 1268 1269 /* turn off any old options */ 1270 m_freem(*pcbopt); 1271 *pcbopt = NULL; 1272 if (m == NULL || m->m_len == 0) { 1273 /* 1274 * Only turning off any previous options. 1275 */ 1276 return (0); 1277 } 1278 1279 if (m->m_len % sizeof(int32_t) || 1280 m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) 1281 return (EINVAL); 1282 1283 /* Don't sleep because NET_LOCK() is hold. */ 1284 if ((n = m_get(M_NOWAIT, MT_SOOPTS)) == NULL) 1285 return (ENOBUFS); 1286 p = mtod(n, struct ipoption *); 1287 memset(p, 0, sizeof (*p)); /* 0 = IPOPT_EOL, needed for padding */ 1288 n->m_len = sizeof(struct in_addr); 1289 1290 off = 0; 1291 cnt = m->m_len; 1292 cp = mtod(m, u_char *); 1293 1294 while (cnt > 0) { 1295 opt = cp[IPOPT_OPTVAL]; 1296 1297 if (opt == IPOPT_NOP || opt == IPOPT_EOL) { 1298 optlen = 1; 1299 } else { 1300 if (cnt < IPOPT_OLEN + sizeof(*cp)) 1301 goto bad; 1302 optlen = cp[IPOPT_OLEN]; 1303 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 1304 goto bad; 1305 } 1306 switch (opt) { 1307 default: 1308 memcpy(p->ipopt_list + off, cp, optlen); 1309 break; 1310 1311 case IPOPT_LSRR: 1312 case IPOPT_SSRR: 1313 /* 1314 * user process specifies route as: 1315 * ->A->B->C->D 1316 * D must be our final destination (but we can't 1317 * check that since we may not have connected yet). 1318 * A is first hop destination, which doesn't appear in 1319 * actual IP option, but is stored before the options. 1320 */ 1321 if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) 1322 goto bad; 1323 1324 /* 1325 * Optlen is smaller because first address is popped. 1326 * Cnt and cp will be adjusted a bit later to reflect 1327 * this. 1328 */ 1329 optlen -= sizeof(struct in_addr); 1330 p->ipopt_list[off + IPOPT_OPTVAL] = opt; 1331 p->ipopt_list[off + IPOPT_OLEN] = optlen; 1332 1333 /* 1334 * Move first hop before start of options. 1335 */ 1336 memcpy(&p->ipopt_dst, cp + IPOPT_OFFSET, 1337 sizeof(struct in_addr)); 1338 cp += sizeof(struct in_addr); 1339 cnt -= sizeof(struct in_addr); 1340 /* 1341 * Then copy rest of options 1342 */ 1343 memcpy(p->ipopt_list + off + IPOPT_OFFSET, 1344 cp + IPOPT_OFFSET, optlen - IPOPT_OFFSET); 1345 break; 1346 } 1347 off += optlen; 1348 cp += optlen; 1349 cnt -= optlen; 1350 1351 if (opt == IPOPT_EOL) 1352 break; 1353 } 1354 /* pad options to next word, since p was zeroed just adjust off */ 1355 off = (off + sizeof(int32_t) - 1) & ~(sizeof(int32_t) - 1); 1356 n->m_len += off; 1357 if (n->m_len > sizeof(*p)) { 1358 bad: 1359 m_freem(n); 1360 return (EINVAL); 1361 } 1362 1363 *pcbopt = n; 1364 return (0); 1365 } 1366 1367 /* 1368 * Lookup the interface based on the information in the ip_mreqn struct. 1369 */ 1370 int 1371 ip_multicast_if(struct ip_mreqn *mreq, u_int rtableid, unsigned int *ifidx) 1372 { 1373 struct sockaddr_in sin; 1374 struct rtentry *rt; 1375 1376 /* 1377 * In case userland provides the imr_ifindex use this as interface. 1378 * If no interface address was provided, use the interface of 1379 * the route to the given multicast address. 1380 */ 1381 if (mreq->imr_ifindex != 0) { 1382 *ifidx = mreq->imr_ifindex; 1383 } else if (mreq->imr_address.s_addr == INADDR_ANY) { 1384 memset(&sin, 0, sizeof(sin)); 1385 sin.sin_len = sizeof(sin); 1386 sin.sin_family = AF_INET; 1387 sin.sin_addr = mreq->imr_multiaddr; 1388 rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid); 1389 if (!rtisvalid(rt)) { 1390 rtfree(rt); 1391 return EADDRNOTAVAIL; 1392 } 1393 *ifidx = rt->rt_ifidx; 1394 rtfree(rt); 1395 } else { 1396 memset(&sin, 0, sizeof(sin)); 1397 sin.sin_len = sizeof(sin); 1398 sin.sin_family = AF_INET; 1399 sin.sin_addr = mreq->imr_address; 1400 rt = rtalloc(sintosa(&sin), 0, rtableid); 1401 if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) { 1402 rtfree(rt); 1403 return EADDRNOTAVAIL; 1404 } 1405 *ifidx = rt->rt_ifidx; 1406 rtfree(rt); 1407 } 1408 1409 return 0; 1410 } 1411 1412 /* 1413 * Set the IP multicast options in response to user setsockopt(). 1414 */ 1415 int 1416 ip_setmoptions(int optname, struct ip_moptions **imop, struct mbuf *m, 1417 u_int rtableid) 1418 { 1419 struct in_addr addr; 1420 struct in_ifaddr *ia; 1421 struct ip_mreqn mreqn; 1422 struct ifnet *ifp = NULL; 1423 struct ip_moptions *imo = *imop; 1424 struct in_multi **immp; 1425 struct sockaddr_in sin; 1426 unsigned int ifidx; 1427 int i, error = 0; 1428 u_char loop; 1429 1430 if (imo == NULL) { 1431 /* 1432 * No multicast option buffer attached to the pcb; 1433 * allocate one and initialize to default values. 1434 */ 1435 imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK|M_ZERO); 1436 immp = mallocarray(IP_MIN_MEMBERSHIPS, sizeof(*immp), M_IPMOPTS, 1437 M_WAITOK|M_ZERO); 1438 *imop = imo; 1439 imo->imo_ifidx = 0; 1440 imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL; 1441 imo->imo_loop = IP_DEFAULT_MULTICAST_LOOP; 1442 imo->imo_num_memberships = 0; 1443 imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; 1444 imo->imo_membership = immp; 1445 } 1446 1447 switch (optname) { 1448 1449 case IP_MULTICAST_IF: 1450 /* 1451 * Select the interface for outgoing multicast packets. 1452 */ 1453 if (m == NULL) { 1454 error = EINVAL; 1455 break; 1456 } 1457 if (m->m_len == sizeof(struct in_addr)) { 1458 addr = *(mtod(m, struct in_addr *)); 1459 } else if (m->m_len == sizeof(struct ip_mreq) || 1460 m->m_len == sizeof(struct ip_mreqn)) { 1461 memset(&mreqn, 0, sizeof(mreqn)); 1462 memcpy(&mreqn, mtod(m, void *), m->m_len); 1463 1464 /* 1465 * If an interface index is given use this 1466 * index to set the imo_ifidx but check first 1467 * that the interface actually exists. 1468 * In the other case just set the addr to 1469 * the imr_address and fall through to the 1470 * regular code. 1471 */ 1472 if (mreqn.imr_ifindex != 0) { 1473 ifp = if_get(mreqn.imr_ifindex); 1474 if (ifp == NULL || 1475 ifp->if_rdomain != rtable_l2(rtableid)) { 1476 error = EADDRNOTAVAIL; 1477 if_put(ifp); 1478 break; 1479 } 1480 imo->imo_ifidx = ifp->if_index; 1481 if_put(ifp); 1482 break; 1483 } else 1484 addr = mreqn.imr_address; 1485 } else { 1486 error = EINVAL; 1487 break; 1488 } 1489 /* 1490 * INADDR_ANY is used to remove a previous selection. 1491 * When no interface is selected, a default one is 1492 * chosen every time a multicast packet is sent. 1493 */ 1494 if (addr.s_addr == INADDR_ANY) { 1495 imo->imo_ifidx = 0; 1496 break; 1497 } 1498 /* 1499 * The selected interface is identified by its local 1500 * IP address. Find the interface and confirm that 1501 * it supports multicasting. 1502 */ 1503 memset(&sin, 0, sizeof(sin)); 1504 sin.sin_len = sizeof(sin); 1505 sin.sin_family = AF_INET; 1506 sin.sin_addr = addr; 1507 ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), rtableid)); 1508 if (ia == NULL || 1509 (ia->ia_ifp->if_flags & IFF_MULTICAST) == 0) { 1510 error = EADDRNOTAVAIL; 1511 break; 1512 } 1513 imo->imo_ifidx = ia->ia_ifp->if_index; 1514 break; 1515 1516 case IP_MULTICAST_TTL: 1517 /* 1518 * Set the IP time-to-live for outgoing multicast packets. 1519 */ 1520 if (m == NULL || m->m_len != 1) { 1521 error = EINVAL; 1522 break; 1523 } 1524 imo->imo_ttl = *(mtod(m, u_char *)); 1525 break; 1526 1527 case IP_MULTICAST_LOOP: 1528 /* 1529 * Set the loopback flag for outgoing multicast packets. 1530 * Must be zero or one. 1531 */ 1532 if (m == NULL || m->m_len != 1 || 1533 (loop = *(mtod(m, u_char *))) > 1) { 1534 error = EINVAL; 1535 break; 1536 } 1537 imo->imo_loop = loop; 1538 break; 1539 1540 case IP_ADD_MEMBERSHIP: 1541 /* 1542 * Add a multicast group membership. 1543 * Group must be a valid IP multicast address. 1544 */ 1545 if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) || 1546 m->m_len == sizeof(struct ip_mreqn))) { 1547 error = EINVAL; 1548 break; 1549 } 1550 memset(&mreqn, 0, sizeof(mreqn)); 1551 memcpy(&mreqn, mtod(m, void *), m->m_len); 1552 if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) { 1553 error = EINVAL; 1554 break; 1555 } 1556 1557 error = ip_multicast_if(&mreqn, rtableid, &ifidx); 1558 if (error) 1559 break; 1560 1561 /* 1562 * See if we found an interface, and confirm that it 1563 * supports multicast. 1564 */ 1565 ifp = if_get(ifidx); 1566 if (ifp == NULL || ifp->if_rdomain != rtable_l2(rtableid) || 1567 (ifp->if_flags & IFF_MULTICAST) == 0) { 1568 error = EADDRNOTAVAIL; 1569 if_put(ifp); 1570 break; 1571 } 1572 1573 /* 1574 * See if the membership already exists or if all the 1575 * membership slots are full. 1576 */ 1577 for (i = 0; i < imo->imo_num_memberships; ++i) { 1578 if (imo->imo_membership[i]->inm_ifidx == ifidx && 1579 imo->imo_membership[i]->inm_addr.s_addr 1580 == mreqn.imr_multiaddr.s_addr) 1581 break; 1582 } 1583 if (i < imo->imo_num_memberships) { 1584 error = EADDRINUSE; 1585 if_put(ifp); 1586 break; 1587 } 1588 if (imo->imo_num_memberships == imo->imo_max_memberships) { 1589 struct in_multi **nmships, **omships; 1590 size_t newmax; 1591 /* 1592 * Resize the vector to next power-of-two minus 1. If 1593 * the size would exceed the maximum then we know we've 1594 * really run out of entries. Otherwise, we reallocate 1595 * the vector. 1596 */ 1597 nmships = NULL; 1598 omships = imo->imo_membership; 1599 newmax = ((imo->imo_max_memberships + 1) * 2) - 1; 1600 if (newmax <= IP_MAX_MEMBERSHIPS) { 1601 nmships = mallocarray(newmax, sizeof(*nmships), 1602 M_IPMOPTS, M_NOWAIT|M_ZERO); 1603 if (nmships != NULL) { 1604 memcpy(nmships, omships, 1605 sizeof(*omships) * 1606 imo->imo_max_memberships); 1607 free(omships, M_IPMOPTS, 1608 sizeof(*omships) * 1609 imo->imo_max_memberships); 1610 imo->imo_membership = nmships; 1611 imo->imo_max_memberships = newmax; 1612 } 1613 } 1614 if (nmships == NULL) { 1615 error = ENOBUFS; 1616 if_put(ifp); 1617 break; 1618 } 1619 } 1620 /* 1621 * Everything looks good; add a new record to the multicast 1622 * address list for the given interface. 1623 */ 1624 if ((imo->imo_membership[i] = 1625 in_addmulti(&mreqn.imr_multiaddr, ifp)) == NULL) { 1626 error = ENOBUFS; 1627 if_put(ifp); 1628 break; 1629 } 1630 ++imo->imo_num_memberships; 1631 if_put(ifp); 1632 break; 1633 1634 case IP_DROP_MEMBERSHIP: 1635 /* 1636 * Drop a multicast group membership. 1637 * Group must be a valid IP multicast address. 1638 */ 1639 if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) || 1640 m->m_len == sizeof(struct ip_mreqn))) { 1641 error = EINVAL; 1642 break; 1643 } 1644 memset(&mreqn, 0, sizeof(mreqn)); 1645 memcpy(&mreqn, mtod(m, void *), m->m_len); 1646 if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) { 1647 error = EINVAL; 1648 break; 1649 } 1650 1651 /* 1652 * If an interface address was specified, get a pointer 1653 * to its ifnet structure. 1654 */ 1655 error = ip_multicast_if(&mreqn, rtableid, &ifidx); 1656 if (error) 1657 break; 1658 1659 /* 1660 * Find the membership in the membership array. 1661 */ 1662 for (i = 0; i < imo->imo_num_memberships; ++i) { 1663 if ((ifidx == 0 || 1664 imo->imo_membership[i]->inm_ifidx == ifidx) && 1665 imo->imo_membership[i]->inm_addr.s_addr == 1666 mreqn.imr_multiaddr.s_addr) 1667 break; 1668 } 1669 if (i == imo->imo_num_memberships) { 1670 error = EADDRNOTAVAIL; 1671 break; 1672 } 1673 /* 1674 * Give up the multicast address record to which the 1675 * membership points. 1676 */ 1677 in_delmulti(imo->imo_membership[i]); 1678 /* 1679 * Remove the gap in the membership array. 1680 */ 1681 for (++i; i < imo->imo_num_memberships; ++i) 1682 imo->imo_membership[i-1] = imo->imo_membership[i]; 1683 --imo->imo_num_memberships; 1684 break; 1685 1686 default: 1687 error = EOPNOTSUPP; 1688 break; 1689 } 1690 1691 /* 1692 * If all options have default values, no need to keep the data. 1693 */ 1694 if (imo->imo_ifidx == 0 && 1695 imo->imo_ttl == IP_DEFAULT_MULTICAST_TTL && 1696 imo->imo_loop == IP_DEFAULT_MULTICAST_LOOP && 1697 imo->imo_num_memberships == 0) { 1698 free(imo->imo_membership , M_IPMOPTS, 1699 imo->imo_max_memberships * sizeof(struct in_multi *)); 1700 free(*imop, M_IPMOPTS, sizeof(**imop)); 1701 *imop = NULL; 1702 } 1703 1704 return (error); 1705 } 1706 1707 /* 1708 * Return the IP multicast options in response to user getsockopt(). 1709 */ 1710 int 1711 ip_getmoptions(int optname, struct ip_moptions *imo, struct mbuf *m) 1712 { 1713 u_char *ttl; 1714 u_char *loop; 1715 struct in_addr *addr; 1716 struct in_ifaddr *ia; 1717 struct ifnet *ifp; 1718 1719 switch (optname) { 1720 1721 case IP_MULTICAST_IF: 1722 addr = mtod(m, struct in_addr *); 1723 m->m_len = sizeof(struct in_addr); 1724 if (imo == NULL || (ifp = if_get(imo->imo_ifidx)) == NULL) 1725 addr->s_addr = INADDR_ANY; 1726 else { 1727 IFP_TO_IA(ifp, ia); 1728 addr->s_addr = (ia == NULL) ? INADDR_ANY 1729 : ia->ia_addr.sin_addr.s_addr; 1730 if_put(ifp); 1731 } 1732 return (0); 1733 1734 case IP_MULTICAST_TTL: 1735 ttl = mtod(m, u_char *); 1736 m->m_len = 1; 1737 *ttl = (imo == NULL) ? IP_DEFAULT_MULTICAST_TTL 1738 : imo->imo_ttl; 1739 return (0); 1740 1741 case IP_MULTICAST_LOOP: 1742 loop = mtod(m, u_char *); 1743 m->m_len = 1; 1744 *loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP 1745 : imo->imo_loop; 1746 return (0); 1747 1748 default: 1749 return (EOPNOTSUPP); 1750 } 1751 } 1752 1753 /* 1754 * Discard the IP multicast options. 1755 */ 1756 void 1757 ip_freemoptions(struct ip_moptions *imo) 1758 { 1759 int i; 1760 1761 if (imo != NULL) { 1762 for (i = 0; i < imo->imo_num_memberships; ++i) 1763 in_delmulti(imo->imo_membership[i]); 1764 free(imo->imo_membership, M_IPMOPTS, 1765 imo->imo_max_memberships * sizeof(struct in_multi *)); 1766 free(imo, M_IPMOPTS, sizeof(*imo)); 1767 } 1768 } 1769 1770 /* 1771 * Routine called from ip_output() to loop back a copy of an IP multicast 1772 * packet to the input queue of a specified interface. 1773 */ 1774 void 1775 ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst) 1776 { 1777 struct mbuf *copym; 1778 1779 copym = m_dup_pkt(m, max_linkhdr, M_DONTWAIT); 1780 if (copym != NULL) { 1781 /* 1782 * We don't bother to fragment if the IP length is greater 1783 * than the interface's MTU. Can this possibly matter? 1784 */ 1785 in_hdr_cksum_out(copym, NULL); 1786 if_input_local(ifp, copym, dst->sin_family); 1787 } 1788 } 1789 1790 void 1791 in_hdr_cksum_out(struct mbuf *m, struct ifnet *ifp) 1792 { 1793 struct ip *ip = mtod(m, struct ip *); 1794 1795 ip->ip_sum = 0; 1796 if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) { 1797 SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); 1798 } else { 1799 ipstat_inc(ips_outswcsum); 1800 ip->ip_sum = in_cksum(m, ip->ip_hl << 2); 1801 CLR(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); 1802 } 1803 } 1804 1805 /* 1806 * Compute significant parts of the IPv4 checksum pseudo-header 1807 * for use in a delayed TCP/UDP checksum calculation. 1808 */ 1809 static u_int16_t 1810 in_cksum_phdr(u_int32_t src, u_int32_t dst, u_int32_t lenproto) 1811 { 1812 u_int32_t sum; 1813 1814 sum = lenproto + 1815 (u_int16_t)(src >> 16) + 1816 (u_int16_t)(src /*& 0xffff*/) + 1817 (u_int16_t)(dst >> 16) + 1818 (u_int16_t)(dst /*& 0xffff*/); 1819 1820 sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/); 1821 1822 if (sum > 0xffff) 1823 sum -= 0xffff; 1824 1825 return (sum); 1826 } 1827 1828 /* 1829 * Process a delayed payload checksum calculation. 1830 */ 1831 void 1832 in_delayed_cksum(struct mbuf *m) 1833 { 1834 struct ip *ip; 1835 u_int16_t csum, offset; 1836 1837 ip = mtod(m, struct ip *); 1838 offset = ip->ip_hl << 2; 1839 csum = in4_cksum(m, 0, offset, m->m_pkthdr.len - offset); 1840 if (csum == 0 && ip->ip_p == IPPROTO_UDP) 1841 csum = 0xffff; 1842 1843 switch (ip->ip_p) { 1844 case IPPROTO_TCP: 1845 offset += offsetof(struct tcphdr, th_sum); 1846 break; 1847 1848 case IPPROTO_UDP: 1849 offset += offsetof(struct udphdr, uh_sum); 1850 break; 1851 1852 case IPPROTO_ICMP: 1853 offset += offsetof(struct icmp, icmp_cksum); 1854 break; 1855 1856 default: 1857 return; 1858 } 1859 1860 if ((offset + sizeof(u_int16_t)) > m->m_len) 1861 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1862 else 1863 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; 1864 } 1865 1866 void 1867 in_proto_cksum_out(struct mbuf *m, struct ifnet *ifp) 1868 { 1869 struct ip *ip = mtod(m, struct ip *); 1870 1871 /* some hw and in_delayed_cksum need the pseudo header cksum */ 1872 if (m->m_pkthdr.csum_flags & 1873 (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_ICMP_CSUM_OUT)) { 1874 u_int16_t csum = 0, offset; 1875 1876 offset = ip->ip_hl << 2; 1877 if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && 1878 in_ifcap_cksum(m, ifp, IFCAP_TSOv4)) { 1879 csum = in_cksum_phdr(ip->ip_src.s_addr, 1880 ip->ip_dst.s_addr, htonl(ip->ip_p)); 1881 } else if (ISSET(m->m_pkthdr.csum_flags, 1882 M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) { 1883 csum = in_cksum_phdr(ip->ip_src.s_addr, 1884 ip->ip_dst.s_addr, htonl(ntohs(ip->ip_len) - 1885 offset + ip->ip_p)); 1886 } 1887 if (ip->ip_p == IPPROTO_TCP) 1888 offset += offsetof(struct tcphdr, th_sum); 1889 else if (ip->ip_p == IPPROTO_UDP) 1890 offset += offsetof(struct udphdr, uh_sum); 1891 else if (ip->ip_p == IPPROTO_ICMP) 1892 offset += offsetof(struct icmp, icmp_cksum); 1893 if ((offset + sizeof(u_int16_t)) > m->m_len) 1894 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1895 else 1896 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; 1897 } 1898 1899 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) { 1900 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_TCPv4) || 1901 ip->ip_hl != 5) { 1902 tcpstat_inc(tcps_outswcsum); 1903 in_delayed_cksum(m); 1904 m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_OUT; /* Clear */ 1905 } 1906 } else if (m->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) { 1907 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_UDPv4) || 1908 ip->ip_hl != 5) { 1909 udpstat_inc(udps_outswcsum); 1910 in_delayed_cksum(m); 1911 m->m_pkthdr.csum_flags &= ~M_UDP_CSUM_OUT; /* Clear */ 1912 } 1913 } else if (m->m_pkthdr.csum_flags & M_ICMP_CSUM_OUT) { 1914 in_delayed_cksum(m); 1915 m->m_pkthdr.csum_flags &= ~M_ICMP_CSUM_OUT; /* Clear */ 1916 } 1917 } 1918 1919 int 1920 in_ifcap_cksum(struct mbuf *m, struct ifnet *ifp, int ifcap) 1921 { 1922 if ((ifp == NULL) || 1923 !ISSET(ifp->if_capabilities, ifcap) || 1924 (ifp->if_bridgeidx != 0)) 1925 return (0); 1926 /* 1927 * Simplex interface sends packet back without hardware cksum. 1928 * Keep this check in sync with the condition where ether_resolve() 1929 * calls if_input_local(). 1930 */ 1931 if (ISSET(m->m_flags, M_BCAST) && 1932 ISSET(ifp->if_flags, IFF_SIMPLEX) && 1933 !m->m_pkthdr.pf.routed) 1934 return (0); 1935 return (1); 1936 } 1937