1 /* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 30 * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.37 2003/04/15 06:44:45 silby Exp $ 31 */ 32 33 #define _IP_VHL 34 35 #include "opt_ipdn.h" 36 #include "opt_ipdivert.h" 37 #include "opt_mbuf_stress_test.h" 38 #include "opt_mpls.h" 39 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/kernel.h> 43 #include <sys/malloc.h> 44 #include <sys/mbuf.h> 45 #include <sys/protosw.h> 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <sys/proc.h> 49 #include <sys/priv.h> 50 #include <sys/sysctl.h> 51 #include <sys/in_cksum.h> 52 #include <sys/lock.h> 53 54 #include <sys/thread2.h> 55 #include <sys/mplock2.h> 56 #include <sys/msgport2.h> 57 58 #include <net/if.h> 59 #include <net/netisr.h> 60 #include <net/pfil.h> 61 #include <net/route.h> 62 63 #include <netinet/in.h> 64 #include <netinet/in_systm.h> 65 #include <netinet/ip.h> 66 #include <netinet/in_pcb.h> 67 #include <netinet/in_var.h> 68 #include <netinet/ip_var.h> 69 70 #include <netproto/mpls/mpls_var.h> 71 72 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); 73 74 #include <net/ipfw/ip_fw.h> 75 #include <net/dummynet/ip_dummynet.h> 76 77 #define print_ip(x, a, y) kprintf("%s %d.%d.%d.%d%s",\ 78 x, (ntohl(a.s_addr)>>24)&0xFF,\ 79 (ntohl(a.s_addr)>>16)&0xFF,\ 80 (ntohl(a.s_addr)>>8)&0xFF,\ 81 (ntohl(a.s_addr))&0xFF, y); 82 83 u_short ip_id; 84 85 #ifdef MBUF_STRESS_TEST 86 int mbuf_frag_size = 0; 87 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, 88 &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); 89 #endif 90 91 static int ip_do_rfc6864 = 1; 92 SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW, &ip_do_rfc6864, 0, 93 "Don't generate IP ID for DF IP datagrams"); 94 95 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); 96 static struct ifnet *ip_multicast_if(struct in_addr *, int *); 97 static void ip_mloopback 98 (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); 99 static int ip_getmoptions 100 (struct sockopt *, struct ip_moptions *); 101 static int ip_pcbopts(int, struct mbuf **, struct mbuf *); 102 static int ip_setmoptions 103 (struct sockopt *, struct ip_moptions **); 104 105 int ip_optcopy(struct ip *, struct ip *); 106 107 extern struct protosw inetsw[]; 108 109 static int 110 ip_localforward(struct mbuf *m, const struct sockaddr_in *dst, int hlen) 111 { 112 struct in_ifaddr_container *iac; 113 114 /* 115 * We need to figure out if we have been forwarded to a local 116 * socket. If so, then we should somehow "loop back" to 117 * ip_input(), and get directed to the PCB as if we had received 118 * this packet. This is because it may be difficult to identify 119 * the packets you want to forward until they are being output 120 * and have selected an interface (e.g. locally initiated 121 * packets). If we used the loopback inteface, we would not be 122 * able to control what happens as the packet runs through 123 * ip_input() as it is done through a ISR. 124 */ 125 LIST_FOREACH(iac, INADDR_HASH(dst->sin_addr.s_addr), ia_hash) { 126 /* 127 * If the addr to forward to is one of ours, we pretend 128 * to be the destination for this packet. 129 */ 130 if (IA_SIN(iac->ia)->sin_addr.s_addr == dst->sin_addr.s_addr) 131 break; 132 } 133 if (iac != NULL) { 134 if (m->m_pkthdr.rcvif == NULL) 135 m->m_pkthdr.rcvif = loif; 136 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 137 m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | 138 CSUM_PSEUDO_HDR; 139 m->m_pkthdr.csum_data = 0xffff; 140 } 141 m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; 142 143 /* 144 * Make sure that the IP header is in one mbuf, 145 * required by ip_input 146 */ 147 if (m->m_len < hlen) { 148 m = m_pullup(m, hlen); 149 if (m == NULL) { 150 /* The packet was freed; we are done */ 151 return 1; 152 } 153 } 154 ip_input(m); 155 156 return 1; /* The packet gets forwarded locally */ 157 } 158 return 0; 159 } 160 161 /* 162 * IP output. The packet in mbuf chain m contains a skeletal IP 163 * header (with len, off, ttl, proto, tos, src, dst). 164 * The mbuf chain containing the packet will be freed. 165 * The mbuf opt, if present, will not be freed. 166 */ 167 int 168 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, 169 int flags, struct ip_moptions *imo, struct inpcb *inp) 170 { 171 struct ip *ip; 172 struct ifnet *ifp = NULL; /* keep compiler happy */ 173 struct mbuf *m; 174 int hlen = sizeof(struct ip); 175 int len, error = 0; 176 struct sockaddr_in *dst = NULL; /* keep compiler happy */ 177 struct in_ifaddr *ia = NULL; 178 int isbroadcast, sw_csum; 179 struct in_addr pkt_dst; 180 struct route iproute; 181 struct m_tag *mtag; 182 struct sockaddr_in *next_hop = NULL; 183 int src_was_INADDR_ANY = 0; /* as the name says... */ 184 185 ASSERT_NETISR_NCPUS(mycpuid); 186 187 m = m0; 188 M_ASSERTPKTHDR(m); 189 190 if (ro == NULL) { 191 ro = &iproute; 192 bzero(ro, sizeof *ro); 193 } else if (ro->ro_rt != NULL && ro->ro_rt->rt_cpuid != mycpuid) { 194 if (flags & IP_DEBUGROUTE) { 195 panic("ip_output: rt rt_cpuid %d accessed on cpu %d\n", 196 ro->ro_rt->rt_cpuid, mycpuid); 197 } 198 199 /* 200 * XXX 201 * If the cached rtentry's owner CPU is not the current CPU, 202 * then don't touch the cached rtentry (remote free is too 203 * expensive in this context); just relocate the route. 204 */ 205 ro = &iproute; 206 bzero(ro, sizeof *ro); 207 } 208 209 if (m->m_pkthdr.fw_flags & IPFORWARD_MBUF_TAGGED) { 210 /* Next hop */ 211 mtag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 212 KKASSERT(mtag != NULL); 213 next_hop = m_tag_data(mtag); 214 } 215 216 if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) { 217 struct dn_pkt *dn_pkt; 218 219 /* Extract info from dummynet tag */ 220 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL); 221 KKASSERT(mtag != NULL); 222 dn_pkt = m_tag_data(mtag); 223 224 /* 225 * The packet was already tagged, so part of the 226 * processing was already done, and we need to go down. 227 * Get the calculated parameters from the tag. 228 */ 229 ifp = dn_pkt->ifp; 230 231 KKASSERT(ro == &iproute); 232 *ro = dn_pkt->ro; /* structure copy */ 233 KKASSERT(ro->ro_rt == NULL || ro->ro_rt->rt_cpuid == mycpuid); 234 235 dst = dn_pkt->dn_dst; 236 if (dst == (struct sockaddr_in *)&(dn_pkt->ro.ro_dst)) { 237 /* If 'dst' points into dummynet tag, adjust it */ 238 dst = (struct sockaddr_in *)&(ro->ro_dst); 239 } 240 241 ip = mtod(m, struct ip *); 242 hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; 243 if (ro->ro_rt) 244 ia = ifatoia(ro->ro_rt->rt_ifa); 245 goto sendit; 246 } 247 248 if (opt) { 249 len = 0; 250 m = ip_insertoptions(m, opt, &len); 251 if (len != 0) 252 hlen = len; 253 } 254 ip = mtod(m, struct ip *); 255 256 /* 257 * Fill in IP header. 258 */ 259 if (!(flags & (IP_FORWARDING|IP_RAWOUTPUT))) { 260 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); 261 ip->ip_off &= htons(IP_DF); 262 if (ip_do_rfc6864 && (ip->ip_off & htons(IP_DF))) 263 ip->ip_id = 0; 264 else 265 ip->ip_id = ip_newid(); 266 ipstat.ips_localout++; 267 } else { 268 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 269 } 270 271 reroute: 272 pkt_dst = next_hop ? next_hop->sin_addr : ip->ip_dst; 273 274 dst = (struct sockaddr_in *)&ro->ro_dst; 275 /* 276 * If there is a cached route, 277 * check that it is to the same destination 278 * and is still up. If not, free it and try again. 279 * The address family should also be checked in case of sharing the 280 * cache with IPv6. 281 */ 282 if (ro->ro_rt && 283 (!(ro->ro_rt->rt_flags & RTF_UP) || 284 dst->sin_family != AF_INET || 285 dst->sin_addr.s_addr != pkt_dst.s_addr)) { 286 rtfree(ro->ro_rt); 287 ro->ro_rt = NULL; 288 } 289 if (ro->ro_rt == NULL) { 290 bzero(dst, sizeof *dst); 291 dst->sin_family = AF_INET; 292 dst->sin_len = sizeof *dst; 293 dst->sin_addr = pkt_dst; 294 } 295 /* 296 * If routing to interface only, 297 * short circuit routing lookup. 298 */ 299 if (flags & IP_ROUTETOIF) { 300 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 301 (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { 302 ipstat.ips_noroute++; 303 error = ENETUNREACH; 304 goto bad; 305 } 306 ifp = ia->ia_ifp; 307 ip->ip_ttl = 1; 308 isbroadcast = in_broadcast(dst->sin_addr, ifp); 309 } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && 310 imo != NULL && imo->imo_multicast_ifp != NULL) { 311 /* 312 * Bypass the normal routing lookup for multicast 313 * packets if the interface is specified. 314 */ 315 ifp = imo->imo_multicast_ifp; 316 ia = IFP_TO_IA(ifp); 317 isbroadcast = 0; /* fool gcc */ 318 } else { 319 /* 320 * If this is the case, we probably don't want to allocate 321 * a protocol-cloned route since we didn't get one from the 322 * ULP. This lets TCP do its thing, while not burdening 323 * forwarding or ICMP with the overhead of cloning a route. 324 * Of course, we still want to do any cloning requested by 325 * the link layer, as this is probably required in all cases 326 * for correct operation (as it is for ARP). 327 */ 328 if (ro->ro_rt == NULL) 329 rtalloc_ign(ro, RTF_PRCLONING); 330 if (ro->ro_rt == NULL) { 331 ipstat.ips_noroute++; 332 error = EHOSTUNREACH; 333 goto bad; 334 } 335 ia = ifatoia(ro->ro_rt->rt_ifa); 336 ifp = ro->ro_rt->rt_ifp; 337 ro->ro_rt->rt_use++; 338 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 339 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; 340 if (ro->ro_rt->rt_flags & RTF_HOST) 341 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); 342 else 343 isbroadcast = in_broadcast(dst->sin_addr, ifp); 344 } 345 if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { 346 m->m_flags |= M_MCAST; 347 /* 348 * IP destination address is multicast. Make sure "dst" 349 * still points to the address in "ro". (It may have been 350 * changed to point to a gateway address, above.) 351 */ 352 dst = (struct sockaddr_in *)&ro->ro_dst; 353 /* 354 * See if the caller provided any multicast options 355 */ 356 if (imo != NULL) { 357 ip->ip_ttl = imo->imo_multicast_ttl; 358 if (imo->imo_multicast_vif != -1) { 359 ip->ip_src.s_addr = 360 ip_mcast_src ? 361 ip_mcast_src(imo->imo_multicast_vif) : 362 INADDR_ANY; 363 } 364 } else { 365 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 366 } 367 /* 368 * Confirm that the outgoing interface supports multicast. 369 */ 370 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 371 if (!(ifp->if_flags & IFF_MULTICAST)) { 372 ipstat.ips_noroute++; 373 error = ENETUNREACH; 374 goto bad; 375 } 376 } 377 /* 378 * If source address not specified yet, use address of the 379 * outgoing interface. In case, keep note we did that, so 380 * if the the firewall changes the next-hop causing the 381 * output interface to change, we can fix that. 382 */ 383 if (ip->ip_src.s_addr == INADDR_ANY || src_was_INADDR_ANY) { 384 /* Interface may have no addresses. */ 385 if (ia != NULL) { 386 ip->ip_src = IA_SIN(ia)->sin_addr; 387 src_was_INADDR_ANY = 1; 388 } 389 } 390 391 if (ip->ip_src.s_addr != INADDR_ANY) { 392 struct in_multi *inm; 393 394 inm = IN_LOOKUP_MULTI(&pkt_dst, ifp); 395 if (inm != NULL && 396 (imo == NULL || imo->imo_multicast_loop)) { 397 /* 398 * If we belong to the destination multicast 399 * group on the outgoing interface, and the 400 * caller did not forbid loopback, loop back 401 * a copy. 402 */ 403 ip_mloopback(ifp, m, dst, hlen); 404 } else { 405 /* 406 * If we are acting as a multicast router, 407 * perform multicast forwarding as if the 408 * packet had just arrived on the interface 409 * to which we are about to send. The 410 * multicast forwarding function recursively 411 * calls this function, using the IP_FORWARDING 412 * flag to prevent infinite recursion. 413 * 414 * Multicasts that are looped back by 415 * ip_mloopback(), above, will be forwarded by 416 * the ip_input() routine, if necessary. 417 */ 418 if (ip_mrouter && !(flags & IP_FORWARDING)) { 419 /* 420 * If rsvp daemon is not running, do 421 * not set ip_moptions. This ensures 422 * that the packet is multicast and 423 * not just sent down one link as 424 * prescribed by rsvpd. 425 */ 426 if (!rsvp_on) 427 imo = NULL; 428 if (ip_mforward) { 429 get_mplock(); 430 if (ip_mforward(ip, ifp, 431 m, imo) != 0) { 432 m_freem(m); 433 rel_mplock(); 434 goto done; 435 } 436 rel_mplock(); 437 } 438 } 439 } 440 } 441 442 /* 443 * Multicasts with a time-to-live of zero may be looped- 444 * back, above, but must not be transmitted on a network. 445 * Also, multicasts addressed to the loopback interface 446 * are not sent -- the above call to ip_mloopback() will 447 * loop back a copy if this host actually belongs to the 448 * destination group on the loopback interface. 449 */ 450 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 451 m_freem(m); 452 goto done; 453 } 454 455 goto sendit; 456 } else { 457 m->m_flags &= ~M_MCAST; 458 } 459 460 /* 461 * If the source address is not specified yet, use the address 462 * of the outgoing interface. In case, keep note we did that, 463 * so if the the firewall changes the next-hop causing the output 464 * interface to change, we can fix that. 465 */ 466 if (ip->ip_src.s_addr == INADDR_ANY || src_was_INADDR_ANY) { 467 /* Interface may have no addresses. */ 468 if (ia != NULL) { 469 ip->ip_src = IA_SIN(ia)->sin_addr; 470 src_was_INADDR_ANY = 1; 471 } 472 } 473 474 /* 475 * Look for broadcast address and 476 * verify user is allowed to send 477 * such a packet. 478 */ 479 if (isbroadcast) { 480 if (!(ifp->if_flags & IFF_BROADCAST)) { 481 error = EADDRNOTAVAIL; 482 goto bad; 483 } 484 if (!(flags & IP_ALLOWBROADCAST)) { 485 error = EACCES; 486 goto bad; 487 } 488 /* don't allow broadcast messages to be fragmented */ 489 if (ntohs(ip->ip_len) > ifp->if_mtu) { 490 error = EMSGSIZE; 491 goto bad; 492 } 493 m->m_flags |= M_BCAST; 494 } else { 495 m->m_flags &= ~M_BCAST; 496 } 497 498 sendit: 499 500 /* We are already being fwd'd from a firewall. */ 501 if (next_hop != NULL) 502 goto pass; 503 504 /* No pfil hooks */ 505 if (!pfil_has_hooks(&inet_pfil_hook)) { 506 if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) { 507 /* 508 * Strip dummynet tags from stranded packets 509 */ 510 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL); 511 KKASSERT(mtag != NULL); 512 m_tag_delete(m, mtag); 513 m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED; 514 } 515 goto pass; 516 } 517 518 /* 519 * IpHack's section. 520 * - Xlate: translate packet's addr/port (NAT). 521 * - Firewall: deny/allow/etc. 522 * - Wrap: fake packet's addr/port <unimpl.> 523 * - Encapsulate: put it in another IP and send out. <unimp.> 524 */ 525 526 /* 527 * Run through list of hooks for output packets. 528 */ 529 error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT); 530 if (error != 0 || m == NULL) 531 goto done; 532 ip = mtod(m, struct ip *); 533 534 if (m->m_pkthdr.fw_flags & IPFORWARD_MBUF_TAGGED) { 535 /* 536 * Check dst to make sure it is directly reachable on the 537 * interface we previously thought it was. 538 * If it isn't (which may be likely in some situations) we have 539 * to re-route it (ie, find a route for the next-hop and the 540 * associated interface) and set them here. This is nested 541 * forwarding which in most cases is undesirable, except where 542 * such control is nigh impossible. So we do it here. 543 * And I'm babbling. 544 */ 545 mtag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 546 KKASSERT(mtag != NULL); 547 next_hop = m_tag_data(mtag); 548 549 /* 550 * Try local forwarding first 551 */ 552 if (ip_localforward(m, next_hop, hlen)) 553 goto done; 554 555 /* 556 * Relocate the route based on next_hop. 557 * If the current route is inp's cache, keep it untouched. 558 */ 559 if (ro == &iproute && ro->ro_rt != NULL) { 560 RTFREE(ro->ro_rt); 561 ro->ro_rt = NULL; 562 } 563 ro = &iproute; 564 bzero(ro, sizeof *ro); 565 566 /* 567 * Forwarding to broadcast address is not allowed. 568 * XXX Should we follow IP_ROUTETOIF? 569 */ 570 flags &= ~(IP_ALLOWBROADCAST | IP_ROUTETOIF); 571 572 /* We are doing forwarding now */ 573 flags |= IP_FORWARDING; 574 575 goto reroute; 576 } 577 578 if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) { 579 struct dn_pkt *dn_pkt; 580 581 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL); 582 KKASSERT(mtag != NULL); 583 dn_pkt = m_tag_data(mtag); 584 585 /* 586 * Under certain cases it is not possible to recalculate 587 * 'ro' and 'dst', let alone 'flags', so just save them in 588 * dummynet tag and avoid the possible wrong reculcalation 589 * when we come back to ip_output() again. 590 * 591 * All other parameters have been already used and so they 592 * are not needed anymore. 593 * XXX if the ifp is deleted while a pkt is in dummynet, 594 * we are in trouble! (TODO use ifnet_detach_event) 595 * 596 * We need to copy *ro because for ICMP pkts (and maybe 597 * others) the caller passed a pointer into the stack; 598 * dst might also be a pointer into *ro so it needs to 599 * be updated. 600 */ 601 dn_pkt->ro = *ro; 602 if (ro->ro_rt) 603 ro->ro_rt->rt_refcnt++; 604 if (dst == (struct sockaddr_in *)&ro->ro_dst) { 605 /* 'dst' points into 'ro' */ 606 dst = (struct sockaddr_in *)&(dn_pkt->ro.ro_dst); 607 } 608 dn_pkt->dn_dst = dst; 609 dn_pkt->flags = flags; 610 611 ip_dn_queue(m); 612 goto done; 613 } 614 615 if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) { 616 /* ipfw was disabled/unloaded. */ 617 m_freem(m); 618 goto done; 619 } 620 pass: 621 /* 127/8 must not appear on wire - RFC1122. */ 622 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 623 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 624 if (!(ifp->if_flags & IFF_LOOPBACK)) { 625 ipstat.ips_badaddr++; 626 error = EADDRNOTAVAIL; 627 goto bad; 628 } 629 } 630 if (ip->ip_src.s_addr == INADDR_ANY || 631 IN_MULTICAST(ntohl(ip->ip_src.s_addr))) { 632 ipstat.ips_badaddr++; 633 error = EADDRNOTAVAIL; 634 goto bad; 635 } 636 637 if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 638 m->m_pkthdr.csum_flags |= CSUM_IP; 639 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 640 if (sw_csum & CSUM_DELAY_DATA) { 641 in_delayed_cksum(m); 642 sw_csum &= ~CSUM_DELAY_DATA; 643 } 644 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 645 } else { 646 sw_csum = 0; 647 } 648 m->m_pkthdr.csum_iphlen = hlen; 649 650 /* 651 * If small enough for interface, or the interface will take 652 * care of the fragmentation or segmentation for us, can just 653 * send directly. 654 */ 655 if (ntohs(ip->ip_len) <= ifp->if_mtu || 656 ((ifp->if_hwassist & CSUM_FRAGMENT) && 657 !(ip->ip_off & htons(IP_DF))) || 658 (m->m_pkthdr.csum_flags & CSUM_TSO)) 659 { 660 ip->ip_sum = 0; 661 if (sw_csum & CSUM_DELAY_IP) { 662 if (ip->ip_vhl == IP_VHL_BORING) 663 ip->ip_sum = in_cksum_hdr(ip); 664 else 665 ip->ip_sum = in_cksum(m, hlen); 666 } 667 668 /* Record statistics for this interface address. */ 669 if (!(flags & IP_FORWARDING) && ia) { 670 IFA_STAT_INC(&ia->ia_ifa, opackets, 1); 671 IFA_STAT_INC(&ia->ia_ifa, obytes, m->m_pkthdr.len); 672 } 673 674 #ifdef MBUF_STRESS_TEST 675 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) { 676 struct mbuf *m1, *m2; 677 int length, tmp; 678 679 tmp = length = m->m_pkthdr.len; 680 681 while ((length -= mbuf_frag_size) >= 1) { 682 m1 = m_split(m, length, M_NOWAIT); 683 if (m1 == NULL) 684 break; 685 m2 = m; 686 while (m2->m_next != NULL) 687 m2 = m2->m_next; 688 m2->m_next = m1; 689 } 690 m->m_pkthdr.len = tmp; 691 } 692 #endif 693 694 #ifdef MPLS 695 if (!mpls_output_process(m, ro->ro_rt)) 696 goto done; 697 #endif 698 error = ifp->if_output(ifp, m, (struct sockaddr *)dst, 699 ro->ro_rt); 700 goto done; 701 } 702 703 if (ip->ip_off & htons(IP_DF)) { 704 error = EMSGSIZE; 705 /* 706 * This case can happen if the user changed the MTU 707 * of an interface after enabling IP on it. Because 708 * most netifs don't keep track of routes pointing to 709 * them, there is no way for one to update all its 710 * routes when the MTU is changed. 711 */ 712 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && 713 !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) && 714 (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { 715 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 716 } 717 ipstat.ips_cantfrag++; 718 goto bad; 719 } 720 721 /* 722 * Too large for interface; fragment if possible. If successful, 723 * on return, m will point to a list of packets to be sent. 724 */ 725 error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum); 726 if (error) 727 goto bad; 728 for (; m; m = m0) { 729 m0 = m->m_nextpkt; 730 m->m_nextpkt = NULL; 731 if (error == 0) { 732 /* Record statistics for this interface address. */ 733 if (ia != NULL) { 734 IFA_STAT_INC(&ia->ia_ifa, opackets, 1); 735 IFA_STAT_INC(&ia->ia_ifa, obytes, 736 m->m_pkthdr.len); 737 } 738 #ifdef MPLS 739 if (!mpls_output_process(m, ro->ro_rt)) 740 continue; 741 #endif 742 error = ifp->if_output(ifp, m, (struct sockaddr *)dst, 743 ro->ro_rt); 744 } else { 745 m_freem(m); 746 } 747 } 748 749 if (error == 0) 750 ipstat.ips_fragmented++; 751 752 done: 753 if (ro == &iproute && ro->ro_rt != NULL) { 754 RTFREE(ro->ro_rt); 755 ro->ro_rt = NULL; 756 } 757 return (error); 758 bad: 759 m_freem(m); 760 goto done; 761 } 762 763 /* 764 * Create a chain of fragments which fit the given mtu. m_frag points to the 765 * mbuf to be fragmented; on return it points to the chain with the fragments. 766 * Return 0 if no error. If error, m_frag may contain a partially built 767 * chain of fragments that should be freed by the caller. 768 * 769 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 770 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 771 */ 772 int 773 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 774 u_long if_hwassist_flags, int sw_csum) 775 { 776 int error = 0; 777 int hlen = IP_VHL_HL(ip->ip_vhl) << 2; 778 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 779 int off; 780 struct mbuf *m0 = *m_frag; /* the original packet */ 781 int firstlen; 782 struct mbuf **mnext; 783 int nfrags; 784 785 if (ip->ip_off & htons(IP_DF)) { /* Fragmentation not allowed */ 786 ipstat.ips_cantfrag++; 787 return EMSGSIZE; 788 } 789 790 /* 791 * Must be able to put at least 8 bytes per fragment. 792 */ 793 if (len < 8) 794 return EMSGSIZE; 795 796 /* 797 * If the interface will not calculate checksums on 798 * fragmented packets, then do it here. 799 */ 800 if ((m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) && 801 !(if_hwassist_flags & CSUM_IP_FRAGS)) { 802 in_delayed_cksum(m0); 803 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 804 } 805 806 if (len > PAGE_SIZE) { 807 /* 808 * Fragment large datagrams such that each segment 809 * contains a multiple of PAGE_SIZE amount of data, 810 * plus headers. This enables a receiver to perform 811 * page-flipping zero-copy optimizations. 812 * 813 * XXX When does this help given that sender and receiver 814 * could have different page sizes, and also mtu could 815 * be less than the receiver's page size ? 816 */ 817 int newlen; 818 struct mbuf *m; 819 820 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 821 off += m->m_len; 822 823 /* 824 * firstlen (off - hlen) must be aligned on an 825 * 8-byte boundary 826 */ 827 if (off < hlen) 828 goto smart_frag_failure; 829 off = ((off - hlen) & ~7) + hlen; 830 newlen = (~PAGE_MASK) & mtu; 831 if ((newlen + sizeof(struct ip)) > mtu) { 832 /* we failed, go back the default */ 833 smart_frag_failure: 834 newlen = len; 835 off = hlen + len; 836 } 837 len = newlen; 838 839 } else { 840 off = hlen + len; 841 } 842 843 firstlen = off - hlen; 844 mnext = &m0->m_nextpkt; /* pointer to next packet */ 845 846 /* 847 * Loop through length of segment after first fragment, 848 * make new header and copy data of each part and link onto chain. 849 * Here, m0 is the original packet, m is the fragment being created. 850 * The fragments are linked off the m_nextpkt of the original 851 * packet, which after processing serves as the first fragment. 852 */ 853 for (nfrags = 1; off < ntohs(ip->ip_len); off += len, nfrags++) { 854 struct ip *mhip; /* ip header on the fragment */ 855 struct mbuf *m; 856 int mhlen = sizeof(struct ip); 857 858 MGETHDR(m, M_NOWAIT, MT_HEADER); 859 if (m == NULL) { 860 error = ENOBUFS; 861 ipstat.ips_odropped++; 862 goto done; 863 } 864 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 865 /* 866 * In the first mbuf, leave room for the link header, then 867 * copy the original IP header including options. The payload 868 * goes into an additional mbuf chain returned by m_copy(). 869 */ 870 m->m_data += max_linkhdr; 871 mhip = mtod(m, struct ip *); 872 *mhip = *ip; 873 if (hlen > sizeof(struct ip)) { 874 mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip); 875 mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2); 876 } 877 m->m_len = mhlen; 878 /* XXX do we need to add ip->ip_off below ? */ 879 mhip->ip_off = htons(((off - hlen) >> 3) + ntohs(ip->ip_off)); 880 if (off + len >= ntohs(ip->ip_len)) { /* last fragment */ 881 len = ntohs(ip->ip_len) - off; 882 m->m_flags |= M_LASTFRAG; 883 } else { 884 mhip->ip_off |= htons(IP_MF); 885 } 886 mhip->ip_len = htons((u_short)(len + mhlen)); 887 m->m_next = m_copy(m0, off, len); 888 if (m->m_next == NULL) { /* copy failed */ 889 m_free(m); 890 error = ENOBUFS; /* ??? */ 891 ipstat.ips_odropped++; 892 goto done; 893 } 894 m->m_pkthdr.len = mhlen + len; 895 m->m_pkthdr.rcvif = NULL; 896 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 897 m->m_pkthdr.csum_iphlen = mhlen; 898 mhip->ip_sum = 0; 899 if (sw_csum & CSUM_DELAY_IP) 900 mhip->ip_sum = in_cksum(m, mhlen); 901 *mnext = m; 902 mnext = &m->m_nextpkt; 903 } 904 ipstat.ips_ofragments += nfrags; 905 906 /* set first marker for fragment chain */ 907 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 908 m0->m_pkthdr.csum_data = nfrags; 909 910 /* 911 * Update first fragment by trimming what's been copied out 912 * and updating header. 913 */ 914 m_adj(m0, hlen + firstlen - ntohs(ip->ip_len)); 915 m0->m_pkthdr.len = hlen + firstlen; 916 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 917 ip->ip_off |= htons(IP_MF); 918 ip->ip_sum = 0; 919 if (sw_csum & CSUM_DELAY_IP) 920 ip->ip_sum = in_cksum(m0, hlen); 921 922 done: 923 *m_frag = m0; 924 return error; 925 } 926 927 void 928 in_delayed_cksum(struct mbuf *m) 929 { 930 struct ip *ip; 931 u_short csum, offset; 932 933 ip = mtod(m, struct ip *); 934 offset = IP_VHL_HL(ip->ip_vhl) << 2 ; 935 csum = in_cksum_skip(m, ntohs(ip->ip_len), offset); 936 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 937 csum = 0xffff; 938 offset += m->m_pkthdr.csum_data; /* checksum offset */ 939 940 if (offset + sizeof(u_short) > m->m_len) { 941 kprintf("delayed m_pullup, m->len: %d off: %d p: %d\n", 942 m->m_len, offset, ip->ip_p); 943 /* 944 * XXX 945 * this shouldn't happen, but if it does, the 946 * correct behavior may be to insert the checksum 947 * in the existing chain instead of rearranging it. 948 */ 949 m = m_pullup(m, offset + sizeof(u_short)); 950 } 951 *(u_short *)(m->m_data + offset) = csum; 952 } 953 954 /* 955 * Insert IP options into preformed packet. 956 * Adjust IP destination as required for IP source routing, 957 * as indicated by a non-zero in_addr at the start of the options. 958 * 959 * XXX This routine assumes that the packet has no options in place. 960 */ 961 static struct mbuf * 962 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 963 { 964 struct ipoption *p = mtod(opt, struct ipoption *); 965 struct mbuf *n; 966 struct ip *ip = mtod(m, struct ip *); 967 unsigned optlen; 968 969 optlen = opt->m_len - sizeof p->ipopt_dst; 970 if (optlen + (u_short)ntohs(ip->ip_len) > IP_MAXPACKET) { 971 *phlen = 0; 972 return (m); /* XXX should fail */ 973 } 974 if (p->ipopt_dst.s_addr) 975 ip->ip_dst = p->ipopt_dst; 976 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { 977 MGETHDR(n, M_NOWAIT, MT_HEADER); 978 if (n == NULL) { 979 *phlen = 0; 980 return (m); 981 } 982 n->m_pkthdr.rcvif = NULL; 983 n->m_pkthdr.len = m->m_pkthdr.len + optlen; 984 m->m_len -= sizeof(struct ip); 985 m->m_data += sizeof(struct ip); 986 n->m_next = m; 987 m = n; 988 m->m_len = optlen + sizeof(struct ip); 989 m->m_data += max_linkhdr; 990 memcpy(mtod(m, void *), ip, sizeof(struct ip)); 991 } else { 992 m->m_data -= optlen; 993 m->m_len += optlen; 994 m->m_pkthdr.len += optlen; 995 bcopy(ip, mtod(m, caddr_t), sizeof(struct ip)); 996 } 997 ip = mtod(m, struct ip *); 998 bcopy(p->ipopt_list, ip + 1, optlen); 999 *phlen = sizeof(struct ip) + optlen; 1000 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); 1001 ip->ip_len = htons(ntohs( ip->ip_len) + optlen); 1002 return (m); 1003 } 1004 1005 /* 1006 * Copy options from ip to jp, 1007 * omitting those not copied during fragmentation. 1008 */ 1009 int 1010 ip_optcopy(struct ip *ip, struct ip *jp) 1011 { 1012 u_char *cp, *dp; 1013 int opt, optlen, cnt; 1014 1015 cp = (u_char *)(ip + 1); 1016 dp = (u_char *)(jp + 1); 1017 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip); 1018 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1019 opt = cp[0]; 1020 if (opt == IPOPT_EOL) 1021 break; 1022 if (opt == IPOPT_NOP) { 1023 /* Preserve for IP mcast tunnel's LSRR alignment. */ 1024 *dp++ = IPOPT_NOP; 1025 optlen = 1; 1026 continue; 1027 } 1028 1029 KASSERT(cnt >= IPOPT_OLEN + sizeof *cp, 1030 ("ip_optcopy: malformed ipv4 option")); 1031 optlen = cp[IPOPT_OLEN]; 1032 KASSERT(optlen >= IPOPT_OLEN + sizeof *cp && optlen <= cnt, 1033 ("ip_optcopy: malformed ipv4 option")); 1034 1035 /* bogus lengths should have been caught by ip_dooptions */ 1036 if (optlen > cnt) 1037 optlen = cnt; 1038 if (IPOPT_COPIED(opt)) { 1039 bcopy(cp, dp, optlen); 1040 dp += optlen; 1041 } 1042 } 1043 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 1044 *dp++ = IPOPT_EOL; 1045 return (optlen); 1046 } 1047 1048 /* 1049 * IP socket option processing. 1050 */ 1051 void 1052 ip_ctloutput(netmsg_t msg) 1053 { 1054 struct socket *so = msg->base.nm_so; 1055 struct sockopt *sopt = msg->ctloutput.nm_sopt; 1056 struct inpcb *inp = so->so_pcb; 1057 int error, optval; 1058 1059 error = optval = 0; 1060 1061 /* Get socket's owner cpuid hint */ 1062 if (sopt->sopt_level == SOL_SOCKET && 1063 sopt->sopt_dir == SOPT_GET && 1064 sopt->sopt_name == SO_CPUHINT) { 1065 optval = mycpuid; 1066 soopt_from_kbuf(sopt, &optval, sizeof(optval)); 1067 goto done; 1068 } 1069 1070 if (sopt->sopt_level != IPPROTO_IP) { 1071 error = EINVAL; 1072 goto done; 1073 } 1074 1075 switch (sopt->sopt_name) { 1076 case IP_MULTICAST_IF: 1077 case IP_MULTICAST_VIF: 1078 case IP_MULTICAST_TTL: 1079 case IP_MULTICAST_LOOP: 1080 case IP_ADD_MEMBERSHIP: 1081 case IP_DROP_MEMBERSHIP: 1082 /* 1083 * Handle multicast options in netisr0 1084 */ 1085 if (&curthread->td_msgport != netisr_cpuport(0)) { 1086 /* NOTE: so_port MUST NOT be checked in netisr0 */ 1087 msg->lmsg.ms_flags |= MSGF_IGNSOPORT; 1088 lwkt_forwardmsg(netisr_cpuport(0), &msg->lmsg); 1089 return; 1090 } 1091 break; 1092 } 1093 1094 switch (sopt->sopt_dir) { 1095 case SOPT_SET: 1096 switch (sopt->sopt_name) { 1097 case IP_OPTIONS: 1098 #ifdef notyet 1099 case IP_RETOPTS: 1100 #endif 1101 { 1102 struct mbuf *m; 1103 if (sopt->sopt_valsize > MLEN) { 1104 error = EMSGSIZE; 1105 break; 1106 } 1107 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_HEADER); 1108 if (m == NULL) { 1109 error = ENOBUFS; 1110 break; 1111 } 1112 m->m_len = sopt->sopt_valsize; 1113 error = soopt_to_kbuf(sopt, mtod(m, void *), m->m_len, 1114 m->m_len); 1115 error = ip_pcbopts(sopt->sopt_name, 1116 &inp->inp_options, m); 1117 goto done; 1118 } 1119 1120 case IP_TOS: 1121 case IP_TTL: 1122 case IP_MINTTL: 1123 case IP_RECVOPTS: 1124 case IP_RECVRETOPTS: 1125 case IP_RECVDSTADDR: 1126 case IP_RECVIF: 1127 case IP_RECVTOS: 1128 case IP_RECVTTL: 1129 error = soopt_to_kbuf(sopt, &optval, sizeof optval, 1130 sizeof optval); 1131 if (error) 1132 break; 1133 switch (sopt->sopt_name) { 1134 case IP_TOS: 1135 inp->inp_ip_tos = optval; 1136 break; 1137 1138 case IP_TTL: 1139 inp->inp_ip_ttl = optval; 1140 break; 1141 case IP_MINTTL: 1142 if (optval >= 0 && optval <= MAXTTL) 1143 inp->inp_ip_minttl = optval; 1144 else 1145 error = EINVAL; 1146 break; 1147 #define OPTSET(bit) \ 1148 if (optval) \ 1149 inp->inp_flags |= bit; \ 1150 else \ 1151 inp->inp_flags &= ~bit; 1152 1153 case IP_RECVOPTS: 1154 OPTSET(INP_RECVOPTS); 1155 break; 1156 1157 case IP_RECVRETOPTS: 1158 OPTSET(INP_RECVRETOPTS); 1159 break; 1160 1161 case IP_RECVDSTADDR: 1162 OPTSET(INP_RECVDSTADDR); 1163 break; 1164 1165 case IP_RECVIF: 1166 OPTSET(INP_RECVIF); 1167 break; 1168 1169 case IP_RECVTOS: 1170 OPTSET(INP_RECVTOS); 1171 break; 1172 1173 case IP_RECVTTL: 1174 OPTSET(INP_RECVTTL); 1175 break; 1176 } 1177 break; 1178 #undef OPTSET 1179 1180 case IP_MULTICAST_IF: 1181 case IP_MULTICAST_VIF: 1182 case IP_MULTICAST_TTL: 1183 case IP_MULTICAST_LOOP: 1184 case IP_ADD_MEMBERSHIP: 1185 case IP_DROP_MEMBERSHIP: 1186 error = ip_setmoptions(sopt, &inp->inp_moptions); 1187 break; 1188 1189 case IP_PORTRANGE: 1190 error = soopt_to_kbuf(sopt, &optval, sizeof optval, 1191 sizeof optval); 1192 if (error) 1193 break; 1194 1195 switch (optval) { 1196 case IP_PORTRANGE_DEFAULT: 1197 inp->inp_flags &= ~(INP_LOWPORT); 1198 inp->inp_flags &= ~(INP_HIGHPORT); 1199 break; 1200 1201 case IP_PORTRANGE_HIGH: 1202 inp->inp_flags &= ~(INP_LOWPORT); 1203 inp->inp_flags |= INP_HIGHPORT; 1204 break; 1205 1206 case IP_PORTRANGE_LOW: 1207 inp->inp_flags &= ~(INP_HIGHPORT); 1208 inp->inp_flags |= INP_LOWPORT; 1209 break; 1210 1211 default: 1212 error = EINVAL; 1213 break; 1214 } 1215 break; 1216 1217 1218 default: 1219 error = ENOPROTOOPT; 1220 break; 1221 } 1222 break; 1223 1224 case SOPT_GET: 1225 switch (sopt->sopt_name) { 1226 case IP_OPTIONS: 1227 case IP_RETOPTS: 1228 if (inp->inp_options) 1229 soopt_from_kbuf(sopt, mtod(inp->inp_options, 1230 char *), 1231 inp->inp_options->m_len); 1232 else 1233 sopt->sopt_valsize = 0; 1234 break; 1235 1236 case IP_TOS: 1237 case IP_TTL: 1238 case IP_MINTTL: 1239 case IP_RECVOPTS: 1240 case IP_RECVRETOPTS: 1241 case IP_RECVDSTADDR: 1242 case IP_RECVTOS: 1243 case IP_RECVTTL: 1244 case IP_RECVIF: 1245 case IP_PORTRANGE: 1246 switch (sopt->sopt_name) { 1247 1248 case IP_TOS: 1249 optval = inp->inp_ip_tos; 1250 break; 1251 1252 case IP_TTL: 1253 optval = inp->inp_ip_ttl; 1254 break; 1255 case IP_MINTTL: 1256 optval = inp->inp_ip_minttl; 1257 break; 1258 1259 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1260 1261 case IP_RECVOPTS: 1262 optval = OPTBIT(INP_RECVOPTS); 1263 break; 1264 1265 case IP_RECVRETOPTS: 1266 optval = OPTBIT(INP_RECVRETOPTS); 1267 break; 1268 1269 case IP_RECVDSTADDR: 1270 optval = OPTBIT(INP_RECVDSTADDR); 1271 break; 1272 1273 case IP_RECVTOS: 1274 optval = OPTBIT(INP_RECVTOS); 1275 break; 1276 1277 case IP_RECVTTL: 1278 optval = OPTBIT(INP_RECVTTL); 1279 break; 1280 1281 case IP_RECVIF: 1282 optval = OPTBIT(INP_RECVIF); 1283 break; 1284 1285 case IP_PORTRANGE: 1286 if (inp->inp_flags & INP_HIGHPORT) 1287 optval = IP_PORTRANGE_HIGH; 1288 else if (inp->inp_flags & INP_LOWPORT) 1289 optval = IP_PORTRANGE_LOW; 1290 else 1291 optval = 0; 1292 break; 1293 } 1294 soopt_from_kbuf(sopt, &optval, sizeof optval); 1295 break; 1296 1297 case IP_MULTICAST_IF: 1298 case IP_MULTICAST_VIF: 1299 case IP_MULTICAST_TTL: 1300 case IP_MULTICAST_LOOP: 1301 case IP_ADD_MEMBERSHIP: 1302 case IP_DROP_MEMBERSHIP: 1303 error = ip_getmoptions(sopt, inp->inp_moptions); 1304 break; 1305 1306 default: 1307 error = ENOPROTOOPT; 1308 break; 1309 } 1310 break; 1311 } 1312 done: 1313 lwkt_replymsg(&msg->lmsg, error); 1314 } 1315 1316 /* 1317 * Set up IP options in pcb for insertion in output packets. 1318 * Store in mbuf with pointer in pcbopt, adding pseudo-option 1319 * with destination address if source routed. 1320 */ 1321 static int 1322 ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m) 1323 { 1324 int cnt, optlen; 1325 u_char *cp; 1326 u_char opt; 1327 1328 /* turn off any old options */ 1329 if (*pcbopt) 1330 m_free(*pcbopt); 1331 *pcbopt = NULL; 1332 if (m == NULL || m->m_len == 0) { 1333 /* 1334 * Only turning off any previous options. 1335 */ 1336 if (m != NULL) 1337 m_free(m); 1338 return (0); 1339 } 1340 1341 if (m->m_len % sizeof(int32_t)) 1342 goto bad; 1343 /* 1344 * IP first-hop destination address will be stored before 1345 * actual options; move other options back 1346 * and clear it when none present. 1347 */ 1348 if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) 1349 goto bad; 1350 cnt = m->m_len; 1351 m->m_len += sizeof(struct in_addr); 1352 cp = mtod(m, u_char *) + sizeof(struct in_addr); 1353 bcopy(mtod(m, caddr_t), cp, cnt); 1354 bzero(mtod(m, caddr_t), sizeof(struct in_addr)); 1355 1356 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1357 opt = cp[IPOPT_OPTVAL]; 1358 if (opt == IPOPT_EOL) 1359 break; 1360 if (opt == IPOPT_NOP) 1361 optlen = 1; 1362 else { 1363 if (cnt < IPOPT_OLEN + sizeof *cp) 1364 goto bad; 1365 optlen = cp[IPOPT_OLEN]; 1366 if (optlen < IPOPT_OLEN + sizeof *cp || optlen > cnt) 1367 goto bad; 1368 } 1369 switch (opt) { 1370 1371 default: 1372 break; 1373 1374 case IPOPT_LSRR: 1375 case IPOPT_SSRR: 1376 /* 1377 * user process specifies route as: 1378 * ->A->B->C->D 1379 * D must be our final destination (but we can't 1380 * check that since we may not have connected yet). 1381 * A is first hop destination, which doesn't appear in 1382 * actual IP option, but is stored before the options. 1383 */ 1384 if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) 1385 goto bad; 1386 m->m_len -= sizeof(struct in_addr); 1387 cnt -= sizeof(struct in_addr); 1388 optlen -= sizeof(struct in_addr); 1389 cp[IPOPT_OLEN] = optlen; 1390 /* 1391 * Move first hop before start of options. 1392 */ 1393 bcopy(&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), 1394 sizeof(struct in_addr)); 1395 /* 1396 * Then copy rest of options back 1397 * to close up the deleted entry. 1398 */ 1399 bcopy(&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr), 1400 &cp[IPOPT_OFFSET+1], 1401 cnt - (IPOPT_MINOFF - 1)); 1402 break; 1403 } 1404 } 1405 if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) 1406 goto bad; 1407 *pcbopt = m; 1408 return (0); 1409 1410 bad: 1411 m_free(m); 1412 return (EINVAL); 1413 } 1414 1415 /* 1416 * XXX 1417 * The whole multicast option thing needs to be re-thought. 1418 * Several of these options are equally applicable to non-multicast 1419 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a 1420 * standard option (IP_TTL). 1421 */ 1422 1423 /* 1424 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 1425 */ 1426 static struct ifnet * 1427 ip_multicast_if(struct in_addr *a, int *ifindexp) 1428 { 1429 int ifindex; 1430 struct ifnet *ifp; 1431 1432 if (ifindexp) 1433 *ifindexp = 0; 1434 if (ntohl(a->s_addr) >> 24 == 0) { 1435 ifindex = ntohl(a->s_addr) & 0xffffff; 1436 if (ifindex < 0 || if_index < ifindex) 1437 return NULL; 1438 ifp = ifindex2ifnet[ifindex]; 1439 if (ifindexp) 1440 *ifindexp = ifindex; 1441 } else { 1442 ifp = INADDR_TO_IFP(a); 1443 } 1444 return ifp; 1445 } 1446 1447 /* 1448 * Set the IP multicast options in response to user setsockopt(). 1449 */ 1450 static int 1451 ip_setmoptions(struct sockopt *sopt, struct ip_moptions **imop) 1452 { 1453 int error = 0; 1454 int i; 1455 struct ip_mreqn mreqn; 1456 struct ifnet *ifp; 1457 struct ip_moptions *imo = *imop; 1458 int ifindex; 1459 1460 if (imo == NULL) { 1461 /* 1462 * No multicast option buffer attached to the pcb; 1463 * allocate one and initialize to default values. 1464 */ 1465 imo = kmalloc(sizeof *imo, M_IPMOPTS, M_WAITOK); 1466 1467 imo->imo_multicast_ifp = NULL; 1468 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1469 imo->imo_multicast_vif = -1; 1470 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1471 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 1472 imo->imo_num_memberships = 0; 1473 /* Assign imo to imop after all fields are setup */ 1474 cpu_sfence(); 1475 *imop = imo; 1476 } 1477 switch (sopt->sopt_name) { 1478 /* store an index number for the vif you wanna use in the send */ 1479 case IP_MULTICAST_VIF: 1480 if (legal_vif_num == 0) { 1481 error = EOPNOTSUPP; 1482 break; 1483 } 1484 error = soopt_to_kbuf(sopt, &i, sizeof i, sizeof i); 1485 if (error) 1486 break; 1487 if (!legal_vif_num(i) && (i != -1)) { 1488 error = EINVAL; 1489 break; 1490 } 1491 imo->imo_multicast_vif = i; 1492 break; 1493 1494 case IP_MULTICAST_IF: 1495 /* 1496 * Select the interface for outgoing multicast packets. 1497 */ 1498 if (sopt->sopt_valsize >= sizeof(mreqn)) { 1499 /* 1500 * Linux compat. 1501 */ 1502 error = soopt_to_kbuf(sopt, &mreqn, 1503 sizeof(mreqn), sizeof(mreqn)); 1504 if (error) 1505 break; 1506 } else if (sopt->sopt_valsize >= sizeof(struct ip_mreq)) { 1507 /* 1508 * Linux compat. 1509 */ 1510 mreqn.imr_ifindex = 0; 1511 error = soopt_to_kbuf(sopt, &mreqn, 1512 sizeof(struct ip_mreq), sizeof(struct ip_mreq)); 1513 if (error) 1514 break; 1515 } else { 1516 mreqn.imr_ifindex = 0; 1517 error = soopt_to_kbuf(sopt, &mreqn.imr_address, 1518 sizeof(struct in_addr), sizeof(struct in_addr)); 1519 if (error) 1520 break; 1521 } 1522 1523 ifindex = mreqn.imr_ifindex; 1524 if (ifindex != 0) { 1525 if (ifindex < 0 || if_index < ifindex) { 1526 error = EINVAL; 1527 break; 1528 } 1529 ifp = ifindex2ifnet[ifindex]; 1530 mreqn.imr_address.s_addr = htonl(ifindex & 0xffffff); 1531 } else { 1532 /* 1533 * INADDR_ANY is used to remove a previous selection. 1534 * When no interface is selected, a default one is 1535 * chosen every time a multicast packet is sent. 1536 */ 1537 if (mreqn.imr_address.s_addr == INADDR_ANY) { 1538 imo->imo_multicast_ifp = NULL; 1539 break; 1540 } 1541 /* 1542 * The selected interface is identified by its local 1543 * IP address. Find the interface and confirm that 1544 * it supports multicasting. 1545 */ 1546 ifp = ip_multicast_if(&mreqn.imr_address, &ifindex); 1547 } 1548 1549 if (ifp == NULL || !(ifp->if_flags & IFF_MULTICAST)) { 1550 error = EADDRNOTAVAIL; 1551 break; 1552 } 1553 imo->imo_multicast_ifp = ifp; 1554 if (ifindex) 1555 imo->imo_multicast_addr = mreqn.imr_address; 1556 else 1557 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1558 break; 1559 1560 case IP_MULTICAST_TTL: 1561 /* 1562 * Set the IP time-to-live for outgoing multicast packets. 1563 * The original multicast API required a char argument, 1564 * which is inconsistent with the rest of the socket API. 1565 * We allow either a char or an int. 1566 */ 1567 if (sopt->sopt_valsize == 1) { 1568 u_char ttl; 1569 error = soopt_to_kbuf(sopt, &ttl, 1, 1); 1570 if (error) 1571 break; 1572 imo->imo_multicast_ttl = ttl; 1573 } else { 1574 u_int ttl; 1575 error = soopt_to_kbuf(sopt, &ttl, sizeof ttl, sizeof ttl); 1576 if (error) 1577 break; 1578 if (ttl > 255) 1579 error = EINVAL; 1580 else 1581 imo->imo_multicast_ttl = ttl; 1582 } 1583 break; 1584 1585 case IP_MULTICAST_LOOP: 1586 /* 1587 * Set the loopback flag for outgoing multicast packets. 1588 * Must be zero or one. The original multicast API required a 1589 * char argument, which is inconsistent with the rest 1590 * of the socket API. We allow either a char or an int. 1591 */ 1592 if (sopt->sopt_valsize == 1) { 1593 u_char loop; 1594 1595 error = soopt_to_kbuf(sopt, &loop, 1, 1); 1596 if (error) 1597 break; 1598 imo->imo_multicast_loop = !!loop; 1599 } else { 1600 u_int loop; 1601 1602 error = soopt_to_kbuf(sopt, &loop, sizeof loop, 1603 sizeof loop); 1604 if (error) 1605 break; 1606 imo->imo_multicast_loop = !!loop; 1607 } 1608 break; 1609 1610 case IP_ADD_MEMBERSHIP: 1611 /* 1612 * Add a multicast group membership. 1613 * Group must be a valid IP multicast address. 1614 */ 1615 if (sopt->sopt_valsize >= sizeof(mreqn)) { 1616 error = soopt_to_kbuf(sopt, &mreqn, 1617 sizeof(mreqn), sizeof(mreqn)); 1618 if (error) 1619 break; 1620 } else { 1621 mreqn.imr_ifindex = 0; 1622 error = soopt_to_kbuf(sopt, &mreqn, 1623 sizeof(struct ip_mreq), sizeof(struct ip_mreq)); 1624 if (error) 1625 break; 1626 } 1627 1628 if (!IN_MULTICAST(ntohl(mreqn.imr_multiaddr.s_addr))) { 1629 error = EINVAL; 1630 break; 1631 } 1632 1633 ifindex = mreqn.imr_ifindex; 1634 if (ifindex != 0) { 1635 if (ifindex < 0 || if_index < ifindex) { 1636 error = EINVAL; 1637 break; 1638 } 1639 ifp = ifindex2ifnet[ifindex]; 1640 } else if (mreqn.imr_address.s_addr == INADDR_ANY) { 1641 struct sockaddr_in dst; 1642 struct rtentry *rt; 1643 1644 /* 1645 * If no interface address or index was provided, 1646 * use the interface of the route to the given 1647 * multicast address. 1648 */ 1649 bzero(&dst, sizeof(struct sockaddr_in)); 1650 dst.sin_len = sizeof(struct sockaddr_in); 1651 dst.sin_family = AF_INET; 1652 dst.sin_addr = mreqn.imr_multiaddr; 1653 rt = rtlookup((struct sockaddr *)&dst); 1654 if (rt == NULL) { 1655 error = EADDRNOTAVAIL; 1656 break; 1657 } 1658 --rt->rt_refcnt; 1659 ifp = rt->rt_ifp; 1660 } else { 1661 ifp = ip_multicast_if(&mreqn.imr_address, NULL); 1662 } 1663 1664 /* 1665 * See if we found an interface, and confirm that it 1666 * supports multicast. 1667 */ 1668 if (ifp == NULL || !(ifp->if_flags & IFF_MULTICAST)) { 1669 error = EADDRNOTAVAIL; 1670 break; 1671 } 1672 /* 1673 * See if the membership already exists or if all the 1674 * membership slots are full. 1675 */ 1676 for (i = 0; i < imo->imo_num_memberships; ++i) { 1677 if (imo->imo_membership[i]->inm_ifp == ifp && 1678 imo->imo_membership[i]->inm_addr.s_addr 1679 == mreqn.imr_multiaddr.s_addr) 1680 break; 1681 } 1682 if (i < imo->imo_num_memberships) { 1683 error = EADDRINUSE; 1684 break; 1685 } 1686 if (i == IP_MAX_MEMBERSHIPS) { 1687 error = ETOOMANYREFS; 1688 break; 1689 } 1690 /* 1691 * Everything looks good; add a new record to the multicast 1692 * address list for the given interface. 1693 */ 1694 if ((imo->imo_membership[i] = 1695 in_addmulti(&mreqn.imr_multiaddr, ifp)) == NULL) { 1696 error = ENOBUFS; 1697 break; 1698 } 1699 ++imo->imo_num_memberships; 1700 break; 1701 1702 case IP_DROP_MEMBERSHIP: 1703 /* 1704 * Drop a multicast group membership. 1705 * Group must be a valid IP multicast address. 1706 */ 1707 if (sopt->sopt_valsize >= sizeof(mreqn)) { 1708 error = soopt_to_kbuf(sopt, &mreqn, 1709 sizeof(mreqn), sizeof(mreqn)); 1710 if (error) 1711 break; 1712 } else { 1713 mreqn.imr_ifindex = 0; 1714 error = soopt_to_kbuf(sopt, &mreqn, 1715 sizeof(struct ip_mreq), sizeof(struct ip_mreq)); 1716 if (error) 1717 break; 1718 } 1719 1720 if (!IN_MULTICAST(ntohl(mreqn.imr_multiaddr.s_addr))) { 1721 error = EINVAL; 1722 break; 1723 } 1724 1725 /* 1726 * If an interface index or address was specified, get a 1727 * pointer to its ifnet structure. 1728 */ 1729 ifindex = mreqn.imr_ifindex; 1730 if (ifindex != 0) { 1731 if (ifindex < 0 || if_index < ifindex) { 1732 error = EINVAL; 1733 break; 1734 } 1735 ifp = ifindex2ifnet[ifindex]; 1736 } else if (mreqn.imr_address.s_addr == INADDR_ANY) { 1737 ifp = NULL; 1738 } else { 1739 ifp = ip_multicast_if(&mreqn.imr_address, NULL); 1740 if (ifp == NULL) { 1741 error = EADDRNOTAVAIL; 1742 break; 1743 } 1744 } 1745 /* 1746 * Find the membership in the membership array. 1747 */ 1748 for (i = 0; i < imo->imo_num_memberships; ++i) { 1749 if ((ifp == NULL || 1750 imo->imo_membership[i]->inm_ifp == ifp) && 1751 imo->imo_membership[i]->inm_addr.s_addr == 1752 mreqn.imr_multiaddr.s_addr) 1753 break; 1754 } 1755 if (i == imo->imo_num_memberships) { 1756 error = EADDRNOTAVAIL; 1757 break; 1758 } 1759 /* 1760 * Give up the multicast address record to which the 1761 * membership points. 1762 */ 1763 in_delmulti(imo->imo_membership[i]); 1764 /* 1765 * Remove the gap in the membership array. 1766 */ 1767 for (++i; i < imo->imo_num_memberships; ++i) 1768 imo->imo_membership[i-1] = imo->imo_membership[i]; 1769 --imo->imo_num_memberships; 1770 break; 1771 1772 default: 1773 error = EOPNOTSUPP; 1774 break; 1775 } 1776 1777 return (error); 1778 } 1779 1780 /* 1781 * Return the IP multicast options in response to user getsockopt(). 1782 */ 1783 static int 1784 ip_getmoptions(struct sockopt *sopt, struct ip_moptions *imo) 1785 { 1786 struct in_addr addr; 1787 struct in_ifaddr *ia; 1788 int error, optval; 1789 u_char coptval; 1790 1791 error = 0; 1792 switch (sopt->sopt_name) { 1793 case IP_MULTICAST_VIF: 1794 if (imo != NULL) 1795 optval = imo->imo_multicast_vif; 1796 else 1797 optval = -1; 1798 soopt_from_kbuf(sopt, &optval, sizeof optval); 1799 break; 1800 1801 case IP_MULTICAST_IF: 1802 if (imo == NULL || imo->imo_multicast_ifp == NULL) 1803 addr.s_addr = INADDR_ANY; 1804 else if (imo->imo_multicast_addr.s_addr) { 1805 /* return the value user has set */ 1806 addr = imo->imo_multicast_addr; 1807 } else { 1808 ia = IFP_TO_IA(imo->imo_multicast_ifp); 1809 addr.s_addr = (ia == NULL) ? INADDR_ANY 1810 : IA_SIN(ia)->sin_addr.s_addr; 1811 } 1812 soopt_from_kbuf(sopt, &addr, sizeof addr); 1813 break; 1814 1815 case IP_MULTICAST_TTL: 1816 if (imo == NULL) 1817 optval = coptval = IP_DEFAULT_MULTICAST_TTL; 1818 else 1819 optval = coptval = imo->imo_multicast_ttl; 1820 if (sopt->sopt_valsize == 1) 1821 soopt_from_kbuf(sopt, &coptval, 1); 1822 else 1823 soopt_from_kbuf(sopt, &optval, sizeof optval); 1824 break; 1825 1826 case IP_MULTICAST_LOOP: 1827 if (imo == NULL) 1828 optval = coptval = IP_DEFAULT_MULTICAST_LOOP; 1829 else 1830 optval = coptval = imo->imo_multicast_loop; 1831 if (sopt->sopt_valsize == 1) 1832 soopt_from_kbuf(sopt, &coptval, 1); 1833 else 1834 soopt_from_kbuf(sopt, &optval, sizeof optval); 1835 break; 1836 1837 default: 1838 error = ENOPROTOOPT; 1839 break; 1840 } 1841 return (error); 1842 } 1843 1844 /* 1845 * Discard the IP multicast options. 1846 */ 1847 void 1848 ip_freemoptions(struct ip_moptions *imo) 1849 { 1850 int i; 1851 1852 if (imo != NULL) { 1853 for (i = 0; i < imo->imo_num_memberships; ++i) 1854 in_delmulti(imo->imo_membership[i]); 1855 kfree(imo, M_IPMOPTS); 1856 } 1857 } 1858 1859 /* 1860 * Routine called from ip_output() to loop back a copy of an IP multicast 1861 * packet to the input queue of a specified interface. Note that this 1862 * calls the output routine of the loopback "driver", but with an interface 1863 * pointer that might NOT be a loopback interface -- evil, but easier than 1864 * replicating that code here. 1865 */ 1866 static void 1867 ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, 1868 int hlen) 1869 { 1870 struct ip *ip; 1871 struct mbuf *copym; 1872 1873 copym = m_copypacket(m, M_NOWAIT); 1874 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1875 copym = m_pullup(copym, hlen); 1876 if (copym != NULL) { 1877 /* 1878 * if the checksum hasn't been computed, mark it as valid 1879 */ 1880 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1881 in_delayed_cksum(copym); 1882 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1883 copym->m_pkthdr.csum_flags |= 1884 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1885 copym->m_pkthdr.csum_data = 0xffff; 1886 } 1887 /* 1888 * We don't bother to fragment if the IP length is greater 1889 * than the interface's MTU. Can this possibly matter? 1890 */ 1891 ip = mtod(copym, struct ip *); 1892 ip->ip_sum = 0; 1893 if (ip->ip_vhl == IP_VHL_BORING) { 1894 ip->ip_sum = in_cksum_hdr(ip); 1895 } else { 1896 ip->ip_sum = in_cksum(copym, hlen); 1897 } 1898 /* 1899 * NB: 1900 * It's not clear whether there are any lingering 1901 * reentrancy problems in other areas which might 1902 * be exposed by using ip_input directly (in 1903 * particular, everything which modifies the packet 1904 * in-place). Yet another option is using the 1905 * protosw directly to deliver the looped back 1906 * packet. For the moment, we'll err on the side 1907 * of safety by using if_simloop(). 1908 */ 1909 #if 1 /* XXX */ 1910 if (dst->sin_family != AF_INET) { 1911 kprintf("ip_mloopback: bad address family %d\n", 1912 dst->sin_family); 1913 dst->sin_family = AF_INET; 1914 } 1915 #endif 1916 if_simloop(ifp, copym, dst->sin_family, 0); 1917 } 1918 } 1919