1 /* LWIP service - pktsock.c - packet code shared between UDP and RAW */ 2 3 #include "lwip.h" 4 #include "pktsock.h" 5 #include "ifaddr.h" 6 7 /* 8 * This buffer should be much bigger (at least 10KB, according to RFC 3542), 9 * but we do not support the ancillary options that take so much space anyway. 10 */ 11 #define PKTSOCK_CTLBUF_SIZE 256 12 13 static char pktsock_ctlbuf[PKTSOCK_CTLBUF_SIZE]; 14 15 /* 16 * Header structures with ancillary data for received packets. The reason that 17 * we do not simply use a generic pkthdr structure with ip_addr_t source and 18 * destination addresses, is that for UDP packets, we put this structure in 19 * place of the received (ethernet and IP headers), and such a full structure 20 * (including IPv6-size addresses) would not fit in the header space for IPv4 21 * packets. So instead we use two address structures, one for IPv4 and one for 22 * IPv6, and a generic header structure on top of it, which also identifies 23 * which address structure is underneath. The combination of the address 24 * structure and the header structure must fit in the IP header. The IPv6 25 * packet header is already so close to the limit here that we have to use 26 * packed addresses. For IPv4 we use the regular addresses for simplicity. 27 */ 28 struct pkthdr { 29 uint16_t port; /* source port number (UDP only) */ 30 uint8_t dstif; /* interface that received the pkt */ 31 uint8_t addrif; /* interface that accepted the pkt */ 32 uint8_t tos; /* TOS/TC value from the IP header */ 33 uint8_t ttl; /* TTL/HL value from the IP header */ 34 uint8_t flags; /* packet flags (PKTHF_) */ 35 uint8_t _unused; /* all that is still available.. */ 36 }; 37 38 #define PKTHF_IPV6 0x01 /* packet has IPv6 header */ 39 #define PKTHF_MCAST 0x02 /* packet has multicast destination */ 40 #define PKTHF_BCAST 0x04 /* packet has broadcast destination */ 41 42 struct pktaddr4 { 43 ip4_addr_t srcaddr; 44 ip4_addr_t dstaddr; 45 }; 46 47 struct pktaddr6 { 48 ip6_addr_p_t srcaddr; 49 ip6_addr_p_t dstaddr; 50 }; 51 52 /* 53 * Create a packet socket. Relay parameters and return values to and from the 54 * IP module's socket creation function. This function must not allocate any 55 * resources in any form, as socket creation may still fail later, in which 56 * case no destruction function is called. 57 */ 58 int 59 pktsock_socket(struct pktsock * pkt, int domain, size_t sndbuf, size_t rcvbuf, 60 struct sock ** sockp) 61 { 62 63 pkt->pkt_rcvhead = NULL; 64 pkt->pkt_rcvtailp = &pkt->pkt_rcvhead; 65 pkt->pkt_rcvlen = 0; 66 67 mcast_reset(&pkt->pkt_mcast); 68 69 memset(&pkt->pkt_srcaddr, 0, sizeof(pkt->pkt_srcaddr)); 70 pkt->pkt_ifindex = 0; 71 72 /* 73 * Any PKTF_ type flags should be initialized on the socket only after 74 * the following call, as this call will clear the flags field. For 75 * now, no PKTF_ flags need to be set by default, though. 76 */ 77 return ipsock_socket(&pkt->pkt_ipsock, domain, sndbuf, rcvbuf, sockp); 78 } 79 80 /* 81 * Return TRUE if the given packet can and should be received on the given 82 * socket, or FALSE if there is a reason not to receive the packet. 83 */ 84 static int 85 pktsock_may_recv(struct pktsock * pkt, struct pbuf * pbuf) 86 { 87 88 /* 89 * By policy, multicast packets should not be received on sockets of 90 * which the owning application is not multicast aware. 91 */ 92 if (ip_addr_ismulticast(ip_current_dest_addr()) && 93 !(ipsock_get_flag(&pkt->pkt_ipsock, PKTF_MCAWARE))) 94 return FALSE; 95 96 /* 97 * Due to fragment reassembly, we might end up with packets that take 98 * up more buffer space than their byte size, even after rounding up 99 * the latter. The user probably does not want packets to get dropped 100 * for that reason, e.g. when they set a 64K limit and the packet ends 101 * up being estimated as 65K and dropped. So, we test against 102 * 'pbuf->tot_len' rather than the rounded-up packet size. However, 103 * 'pkt->pkt_rcvlen' itself is increased by the rounded-up packet size 104 * when enqueuing the packet, so that we still count the memory 105 * consumption (generally) conservatively, which is what we want. 106 */ 107 return (pkt->pkt_rcvlen + pbuf->tot_len <= 108 ipsock_get_rcvbuf(&pkt->pkt_ipsock)); 109 } 110 111 /* 112 * Check whether the given packet can and should be received on the given 113 * socket. If so, return the amount of space for ancillary information that 114 * will be necessary for the packet. If not, return a negative value. 115 */ 116 int 117 pktsock_test_input(struct pktsock * pkt, struct pbuf * pbuf) 118 { 119 120 /* 121 * This check will be done again in pktsock_input(), but this function 122 * is called for raw packets only (not for UDP packets) and, if this 123 * (cheap) check fails, we can avoid a (rather expensive) packet copy. 124 */ 125 if (!pktsock_may_recv(pkt, pbuf)) 126 return -1; 127 128 if (ip_current_is_v6()) 129 return (int)(sizeof(struct pktaddr6) + sizeof(struct pkthdr)); 130 else 131 return (int)(sizeof(struct pktaddr4) + sizeof(struct pkthdr)); 132 } 133 134 /* 135 * A packet has arrived on a packet socket. We own the given packet buffer, 136 * and so we must free it if we do not want to keep it. 137 */ 138 void 139 pktsock_input(struct pktsock * pkt, struct pbuf * pbuf, 140 const ip_addr_t * srcaddr, uint16_t port) 141 { 142 struct pktaddr4 pktaddr4; 143 struct pktaddr6 pktaddr6; 144 struct pkthdr pkthdr; 145 void *pktaddr; 146 struct ifdev *ifdev; 147 size_t pktaddrlen; 148 149 /* 150 * We are going to mess with the packet's header and contents, so we 151 * must be the exclusive owner of the packet. For UDP packets, lwIP 152 * must have made a copy for us in case of non-exclusive delivery 153 * (e.g., multicast packets). For raw packets, we have made a copy of 154 * the packet ourselves just before the call to this function. 155 */ 156 if (pbuf->ref != 1) 157 panic("input packet has multiple references!"); 158 159 /* If the packet should not be received on this socket, drop it. */ 160 if (!pktsock_may_recv(pkt, pbuf)) { 161 pbuf_free(pbuf); 162 163 return; 164 } 165 166 /* 167 * Enqueue the packet. Overwrite the leading IP header with packet 168 * information that is used at the time of receipt by userland. The 169 * data structures are such that the information always fits in what 170 * was the IP header. The reference count check earlier ensures that 171 * we never overwrite part of a packet that is still in use elsewhere. 172 */ 173 if (ip_current_is_v6()) { 174 assert(IP_IS_V6(srcaddr)); 175 assert(ip6_current_dest_addr() != NULL); 176 177 ip6_addr_copy_to_packed(pktaddr6.srcaddr, *ip_2_ip6(srcaddr)); 178 ip6_addr_copy_to_packed(pktaddr6.dstaddr, 179 *ip6_current_dest_addr()); 180 pktaddr = &pktaddr6; 181 pktaddrlen = sizeof(pktaddr6); 182 183 assert(pktaddrlen + sizeof(pkthdr) <= IP6_HLEN); 184 185 pkthdr.tos = IP6H_TC(ip6_current_header()); 186 pkthdr.ttl = IP6H_HOPLIM(ip6_current_header()); 187 pkthdr.flags = PKTHF_IPV6; 188 } else { 189 assert(IP_IS_V4(srcaddr)); 190 assert(ip4_current_dest_addr() != NULL); 191 192 memcpy(&pktaddr4.srcaddr, ip_2_ip4(srcaddr), 193 sizeof(pktaddr4.srcaddr)); 194 memcpy(&pktaddr4.dstaddr, ip4_current_dest_addr(), 195 sizeof(pktaddr4.srcaddr)); 196 pktaddr = &pktaddr4; 197 pktaddrlen = sizeof(pktaddr4); 198 199 assert(pktaddrlen + sizeof(pkthdr) <= IP_HLEN); 200 201 pkthdr.tos = IPH_TOS(ip4_current_header()); 202 pkthdr.ttl = IPH_TTL(ip4_current_header()); 203 pkthdr.flags = 0; 204 } 205 206 /* 207 * Save both the interface on which the packet was received (for 208 * PKTINFO) and the interface that owns the destination address of the 209 * packet (for the source address's zone ID). 210 */ 211 assert(ip_current_input_netif() != NULL); 212 ifdev = netif_get_ifdev(ip_current_input_netif()); 213 pkthdr.dstif = (uint16_t)ifdev_get_index(ifdev); 214 215 assert(ip_current_netif() != NULL); 216 ifdev = netif_get_ifdev(ip_current_netif()); 217 pkthdr.addrif = (uint16_t)ifdev_get_index(ifdev); 218 219 if ((pbuf->flags & PBUF_FLAG_LLMCAST) || 220 ip_addr_ismulticast(ip_current_dest_addr())) 221 pkthdr.flags |= PKTHF_MCAST; 222 else if ((pbuf->flags & PBUF_FLAG_LLBCAST) || 223 ip_addr_isbroadcast(ip_current_dest_addr(), ip_current_netif())) 224 pkthdr.flags |= PKTHF_BCAST; 225 226 pkthdr.port = port; 227 228 util_pbuf_header(pbuf, sizeof(pkthdr)); 229 230 memcpy(pbuf->payload, &pkthdr, sizeof(pkthdr)); 231 232 util_pbuf_header(pbuf, pktaddrlen); 233 234 memcpy(pbuf->payload, pktaddr, pktaddrlen); 235 236 util_pbuf_header(pbuf, -(int)(sizeof(pkthdr) + pktaddrlen)); 237 238 *pkt->pkt_rcvtailp = pbuf; 239 pkt->pkt_rcvtailp = pchain_end(pbuf); 240 pkt->pkt_rcvlen += pchain_size(pbuf); 241 242 sockevent_raise(ipsock_get_sock(&pkt->pkt_ipsock), SEV_RECV); 243 } 244 245 /* 246 * Obtain interface and source address information for an outgoing packet. In 247 * particular, parse any IPV6_PKTINFO options provided as either sticky options 248 * on the socket 'pkt' or as ancillary options in the packet options 'pkto'. 249 * On success, return OK, with 'ifdevp' set to either the outgoing interface to 250 * use for the packet, or NULL if no outgoing interface was specified using 251 * either of the aforementioned options. If, and only if, 'ifdevp' is set to 252 * an actual interface (i.e., not NULL), then 'src_addrp' is filled with either 253 * a locally owned, validated, unicast address to use as source of the packet, 254 * or the unspecified ('any') address if no source address was specified using 255 * the options. On failure, return a negative error code. 256 */ 257 int 258 pktsock_get_pktinfo(struct pktsock * pkt, struct pktopt * pkto, 259 struct ifdev ** ifdevp, ip_addr_t * src_addrp) 260 { 261 struct ifdev *ifdev, *ifdev2; 262 ip_addr_t ipaddr; 263 uint32_t ifindex; 264 int r; 265 266 /* We support only IPV6_PKTINFO. IP_PKTINFO is not supported. */ 267 if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) { 268 *ifdevp = NULL; 269 return OK; 270 } 271 272 /* 273 * TODO: we are spending a lot of effort on initializing and copying 274 * stuff around, even just to find out whether there is anything to do 275 * at all here. See if this can be optimized. 276 */ 277 ip_addr_set_zero_ip6(&ipaddr); 278 279 /* 280 * Ancillary data takes precedence over sticky options. We treat the 281 * source address and interface index fields as separate, overriding 282 * each earlier value only if non-zero. TODO: is that correct? 283 */ 284 if (pkto->pkto_flags & PKTOF_PKTINFO) { 285 memcpy(ip_2_ip6(&ipaddr)->addr, &pkto->pkto_srcaddr.addr, 286 sizeof(ip_2_ip6(&ipaddr)->addr)); 287 ifindex = pkto->pkto_ifindex; 288 } else 289 ifindex = 0; 290 291 if (ip6_addr_isany(ip_2_ip6(&ipaddr))) 292 memcpy(ip_2_ip6(&ipaddr)->addr, &pkt->pkt_srcaddr.addr, 293 sizeof(ip_2_ip6(&ipaddr)->addr)); 294 if (ifindex == 0) 295 ifindex = pkt->pkt_ifindex; 296 297 /* If both fields are blank, there is nothing more to do. */ 298 if (ip6_addr_isany(ip_2_ip6(&ipaddr)) && ifindex == 0) { 299 *ifdevp = NULL; 300 return OK; 301 } 302 303 /* If an interface index is specified, it must be valid. */ 304 ifdev = NULL; 305 306 if (ifindex != 0 && (ifdev = ifdev_get_by_index(ifindex)) == NULL) 307 return ENXIO; 308 309 /* 310 * Use the interface index to set a zone on the source address, if the 311 * source address has a scope. 312 */ 313 if (ip6_addr_has_scope(ip_2_ip6(&ipaddr), IP6_UNKNOWN)) { 314 if (ifindex == 0) 315 return EADDRNOTAVAIL; 316 317 ip6_addr_set_zone(ip_2_ip6(&ipaddr), ifindex); 318 } 319 320 /* 321 * We need to validate the given address just as thoroughly as an 322 * address given through bind(). If we don't, we could allow forged 323 * source addresses etcetera. To be sure: this call may change the 324 * address to an IPv4 type address if needed. 325 */ 326 if ((r = ipsock_check_src_addr(pktsock_get_ipsock(pkt), &ipaddr, 327 FALSE /*allow_mcast*/, &ifdev2)) != OK) 328 return r; 329 330 if (ifdev2 != NULL) { 331 if (ifdev == NULL) 332 ifdev = ifdev2; 333 else if (ifdev != ifdev2) 334 return EADDRNOTAVAIL; 335 } else { 336 /* 337 * There should be no cases where the (non-multicast) address 338 * successfully parsed, is not unspecified, and yet did not map 339 * to an interface. Eliminate the possibility anyway by 340 * throwing an error for this case. As a result, we are left 341 * with one of two cases: 342 * 343 * 1) ifdevp is not NULL, and src_addrp is unspecified; 344 * 2) ifdevp is not NULL, and src_addrp is a locally assigned 345 * (unicast) address. 346 * 347 * This is why we need not fill src_addrp when ifdevp is NULL. 348 */ 349 if (!ip_addr_isany(&ipaddr)) 350 return EADDRNOTAVAIL; 351 } 352 353 *ifdevp = ifdev; 354 if (ifdev != NULL) 355 *src_addrp = ipaddr; 356 return OK; 357 } 358 359 /* 360 * Parse a chunk of user-provided control data, on an IPv4 socket provided as 361 * 'pkt'. The control chunk is given as 'cmsg', and the length of the data 362 * following the control header (possibly zero) is given as 'len'. On success, 363 * return OK, with any parsed options merged into the set of packet options 364 * 'pkto'. On failure, return a negative error code. 365 */ 366 static int 367 pktsock_parse_ctl_v4(struct pktsock * pkt __unused, struct cmsghdr * cmsg, 368 socklen_t len, struct pktopt * pkto) 369 { 370 uint8_t byte; 371 int val; 372 373 if (cmsg->cmsg_level != IPPROTO_IP) 374 return EAFNOSUPPORT; 375 376 switch (cmsg->cmsg_type) { 377 case IP_TOS: 378 /* 379 * Some userland code (bind's libisc in particular) supplies 380 * a single byte instead of a full integer for this option. 381 * We go out of our way to accept that format, too. 382 */ 383 if (len != sizeof(val) && len != sizeof(byte)) 384 return EINVAL; 385 386 if (len == sizeof(byte)) { 387 memcpy(&byte, CMSG_DATA(cmsg), sizeof(byte)); 388 val = (int)byte; 389 } else 390 memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); 391 392 if (val < 0 || val > UINT8_MAX) 393 return EINVAL; 394 395 pkto->pkto_flags |= PKTOF_TOS; 396 pkto->pkto_tos = (uint8_t)val; 397 398 return OK; 399 400 case IP_TTL: 401 if (len != sizeof(val)) 402 return EINVAL; 403 404 memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); 405 406 if (val < 0 || val > UINT8_MAX) 407 return EINVAL; 408 409 pkto->pkto_flags |= PKTOF_TTL; 410 pkto->pkto_ttl = (uint8_t)val; 411 412 return OK; 413 414 /* 415 * Implementing IP_PKTINFO might be a bit harder than its IPV6_PKTINFO 416 * sibling, because it would require the use of zone IDs (interface 417 * indices) for IPv4, which is not supported yet. 418 */ 419 } 420 421 return EINVAL; 422 } 423 424 /* 425 * Parse a chunk of user-provided control data, on an IPv6 socket provided as 426 * 'pkt'. The control chunk is given as 'cmsg', and the length of the data 427 * following the control header (possibly zero) is given as 'len'. On success, 428 * return OK, with any parsed options merged into the set of packet options 429 * 'pkto'. On failure, return a negative error code. 430 */ 431 static int 432 pktsock_parse_ctl_v6(struct pktsock * pkt, struct cmsghdr * cmsg, 433 socklen_t len, struct pktopt * pkto) 434 { 435 struct in6_pktinfo ipi6; 436 int val; 437 438 if (cmsg->cmsg_level != IPPROTO_IPV6) 439 return EAFNOSUPPORT; 440 441 switch (cmsg->cmsg_type) { 442 case IPV6_TCLASS: 443 if (len != sizeof(val)) 444 return EINVAL; 445 446 memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); 447 448 if (val < -1 || val > UINT8_MAX) 449 return EINVAL; 450 451 if (val == -1) 452 val = 0; 453 454 pkto->pkto_flags |= PKTOF_TOS; 455 pkto->pkto_tos = (uint8_t)val; 456 457 return OK; 458 459 case IPV6_HOPLIMIT: 460 if (len != sizeof(val)) 461 return EINVAL; 462 463 memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); 464 465 if (val < -1 || val > UINT8_MAX) 466 return EINVAL; 467 468 if (val == -1) 469 val = IP_DEFAULT_TTL; 470 471 pkto->pkto_flags |= PKTOF_TTL; 472 pkto->pkto_ttl = (uint8_t)val; 473 474 return OK; 475 476 case IPV6_PKTINFO: 477 if (len != sizeof(ipi6)) 478 return EINVAL; 479 480 memcpy(&ipi6, CMSG_DATA(cmsg), sizeof(ipi6)); 481 482 pkto->pkto_flags |= PKTOF_PKTINFO; 483 memcpy(&pkto->pkto_srcaddr.addr, &ipi6.ipi6_addr, 484 sizeof(pkto->pkto_srcaddr.addr)); 485 pkto->pkto_ifindex = ipi6.ipi6_ifindex; 486 487 return OK; 488 489 case IPV6_USE_MIN_MTU: 490 if (len != sizeof(int)) 491 return EINVAL; 492 493 memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); 494 495 if (val < -1 || val > 1) 496 return EINVAL; 497 498 /* TODO: not supported by lwIP, but needed by applications. */ 499 return OK; 500 } 501 502 return EINVAL; 503 } 504 505 /* 506 * Copy in and parse control data, as part of sending a packet on socket 'pkt'. 507 * The control data is accessible through 'ctl', with a user-provided length of 508 * 'ctl_len'. On success, return OK, with any parsed packet options stored in 509 * 'pkto'. On failure, return a negative error code. 510 */ 511 int 512 pktsock_get_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl, 513 socklen_t ctl_len, struct pktopt * pkto) 514 { 515 struct msghdr msghdr; 516 struct cmsghdr *cmsg; 517 socklen_t left, len; 518 int r; 519 520 /* The default: no packet options are being overridden. */ 521 assert(pkto->pkto_flags == 0); 522 523 /* If no control length is given, we are done here. */ 524 if (ctl_len == 0) 525 return OK; 526 527 /* 528 * For now, we put a rather aggressive limit on the size of the control 529 * data. We copy in and parse the whole thing in a single buffer. 530 */ 531 if (ctl_len > sizeof(pktsock_ctlbuf)) { 532 printf("LWIP: too much control data given (%u bytes)\n", 533 ctl_len); 534 535 return ENOBUFS; 536 } 537 538 if ((r = sockdriver_copyin(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK) 539 return r; 540 541 memset(&msghdr, 0, sizeof(msghdr)); 542 msghdr.msg_control = pktsock_ctlbuf; 543 msghdr.msg_controllen = ctl_len; 544 545 for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; 546 cmsg = CMSG_NXTHDR(&msghdr, cmsg)) { 547 /* Check for bogus lengths. */ 548 assert((socklen_t)((char *)cmsg - pktsock_ctlbuf) <= ctl_len); 549 left = ctl_len - (socklen_t)((char *)cmsg - pktsock_ctlbuf); 550 assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */ 551 552 if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) { 553 printf("LWIP: malformed control data rejected\n"); 554 555 return EINVAL; 556 } 557 558 len = cmsg->cmsg_len - CMSG_LEN(0); 559 560 if (ipsock_is_ipv6(&pkt->pkt_ipsock)) 561 r = pktsock_parse_ctl_v6(pkt, cmsg, len, pkto); 562 else 563 r = pktsock_parse_ctl_v4(pkt, cmsg, len, pkto); 564 565 if (r != OK) 566 return r; 567 } 568 569 return OK; 570 } 571 572 /* 573 * Copy in the packet data from the calling user process, and store it in the 574 * buffer 'pbuf' that must already have been allocated with the appropriate 575 * size. 576 */ 577 int 578 pktsock_get_data(struct pktsock * pkt, const struct sockdriver_data * data, 579 size_t len, struct pbuf * pbuf) 580 581 { 582 583 return util_copy_data(data, len, 0, pbuf, 0, TRUE /*copy_in*/); 584 } 585 586 /* 587 * Dequeue and free the head of the receive queue of a packet socket. 588 */ 589 static void 590 pktsock_dequeue(struct pktsock * pkt) 591 { 592 struct pbuf *pbuf, **pnext; 593 size_t size; 594 595 pbuf = pkt->pkt_rcvhead; 596 assert(pbuf != NULL); 597 598 pnext = pchain_end(pbuf); 599 size = pchain_size(pbuf); 600 601 if ((pkt->pkt_rcvhead = *pnext) == NULL) 602 pkt->pkt_rcvtailp = &pkt->pkt_rcvhead; 603 604 assert(pkt->pkt_rcvlen >= size); 605 pkt->pkt_rcvlen -= size; 606 607 *pnext = NULL; 608 pbuf_free(pbuf); 609 } 610 611 /* 612 * Perform preliminary checks on a receive request. 613 */ 614 int 615 pktsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, 616 int flags) 617 { 618 619 /* 620 * We accept the same flags across all socket types in LWIP, and then 621 * simply ignore the ones we do not support for packet sockets. 622 */ 623 if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0) 624 return EOPNOTSUPP; 625 626 return OK; 627 } 628 629 /* 630 * Add a chunk of control data to the global control buffer, starting from 631 * offset 'off'. The chunk has the given level and type, and its data is given 632 * in the buffer 'ptr' with size 'len'. Return the (padded) size of the chunk 633 * that was generated as a result. 634 */ 635 static size_t 636 pktsock_add_ctl(int level, int type, void * ptr, socklen_t len, size_t off) 637 { 638 struct cmsghdr cmsg; 639 size_t size; 640 641 size = CMSG_SPACE(len); 642 643 /* 644 * The global control buffer must be large enough to store one chunk 645 * of each of the supported options. If this panic triggers, increase 646 * PKTSOCK_CTLBUF_SIZE by as much as needed. 647 */ 648 if (off + size > sizeof(pktsock_ctlbuf)) 649 panic("control buffer too small, increase " 650 "PKTSOCK_CTLBUF_SIZE"); 651 652 memset(&cmsg, 0, sizeof(cmsg)); 653 cmsg.cmsg_len = CMSG_LEN(len); 654 cmsg.cmsg_level = level; 655 cmsg.cmsg_type = type; 656 657 /* 658 * Clear any padding space. This can be optimized, but in any case we 659 * must be careful not to copy out any bytes that have not been 660 * initialized at all. 661 */ 662 memset(&pktsock_ctlbuf[off], 0, size); 663 664 memcpy(&pktsock_ctlbuf[off], &cmsg, sizeof(cmsg)); 665 memcpy(CMSG_DATA((struct cmsghdr *)&pktsock_ctlbuf[off]), ptr, len); 666 667 return size; 668 } 669 670 /* 671 * Generate and copy out control data, as part of delivering a packet from 672 * socket 'pkt' to userland. The control data buffer is given as 'ctl', with 673 * a user-given length of 'ctl_len' bytes. The packet's header information is 674 * provided as 'pkthdr', and its source and destination addresses as 'pktaddr', 675 * which maybe a pktaddr4 or pktaddr6 structure depending on the value of the 676 * PKTHF_IPV6 flag in the 'flags' field in 'pkthdr'. Note that we support 677 * dual-stack sockets, and as such it is possible that the socket is of domain 678 * AF_INET6 while the received packet is an IPv4 packet. On success, return 679 * the size of the control data copied out (possibly zero). If more control 680 * data were generated than copied out, also merge the MSG_CTRUNC flag into 681 * 'rflags'. On failure, return a negative error code. 682 */ 683 static int 684 pktsock_put_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl, 685 socklen_t ctl_len, struct pkthdr * pkthdr, void * pktaddr, 686 int * rflags) 687 { 688 struct pktaddr6 *pktaddr6; 689 struct pktaddr4 *pktaddr4; 690 struct in_pktinfo ipi; 691 struct in6_pktinfo ipi6; 692 ip_addr_t ipaddr; 693 unsigned int flags; 694 uint8_t byte; 695 size_t off; 696 int r, val; 697 698 flags = ipsock_get_flags(&pkt->pkt_ipsock); 699 700 if (!(flags & (PKTF_RECVINFO | PKTF_RECVTOS | PKTF_RECVTTL))) 701 return 0; 702 703 /* 704 * Important: all generated control chunks must fit in the global 705 * control buffer together. When adding more options here, ensure that 706 * the control buffer remains large enough to receive all options at 707 * once. See also the panic in pktsock_add_ctl(). 708 */ 709 off = 0; 710 711 /* 712 * IPv6 sockets may receive IPv4 packets. The ancillary data is in the 713 * format corresponding to the socket, which means we may have to 714 * convert any IPv4 addresses from the packet to IPv4-mapped IPv6 715 * addresses for the ancillary data, just like the source address. 716 */ 717 if (ipsock_is_ipv6(&pkt->pkt_ipsock)) { 718 if (flags & PKTF_RECVTTL) { 719 val = pkthdr->ttl; 720 721 off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_HOPLIMIT, 722 &val, sizeof(val), off); 723 } 724 725 if (flags & PKTF_RECVTOS) { 726 val = pkthdr->tos; 727 728 off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_TCLASS, &val, 729 sizeof(val), off); 730 } 731 732 if (flags & PKTF_RECVINFO) { 733 memset(&ipi6, 0, sizeof(ipi6)); 734 735 if (pkthdr->flags & PKTHF_IPV6) { 736 pktaddr6 = (struct pktaddr6 *)pktaddr; 737 memcpy(&ipi6.ipi6_addr, &pktaddr6->dstaddr, 738 sizeof(ipi6.ipi6_addr)); 739 } else { 740 pktaddr4 = (struct pktaddr4 *)pktaddr; 741 742 addr_make_v4mapped_v6(&ipaddr, 743 &pktaddr4->dstaddr); 744 745 memcpy(&ipi6.ipi6_addr, 746 ip_2_ip6(&ipaddr)->addr, 747 sizeof(ipi6.ipi6_addr)); 748 } 749 ipi6.ipi6_ifindex = pkthdr->dstif; 750 751 off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_PKTINFO, 752 &ipi6, sizeof(ipi6), off); 753 } 754 } else { 755 if (flags & PKTF_RECVTTL) { 756 byte = pkthdr->ttl; 757 758 off += pktsock_add_ctl(IPPROTO_IP, IP_TTL, &byte, 759 sizeof(byte), off); 760 } 761 762 if (flags & PKTF_RECVINFO) { 763 assert(!(pkthdr->flags & PKTHF_IPV6)); 764 pktaddr4 = (struct pktaddr4 *)pktaddr; 765 766 memset(&ipi, 0, sizeof(ipi)); 767 memcpy(&ipi.ipi_addr, &pktaddr4->dstaddr, 768 sizeof(ipi.ipi_addr)); 769 ipi.ipi_ifindex = pkthdr->dstif; 770 771 off += pktsock_add_ctl(IPPROTO_IP, IP_PKTINFO, &ipi, 772 sizeof(ipi), off); 773 } 774 } 775 776 assert(off > 0); 777 778 if (ctl_len >= off) 779 ctl_len = off; 780 else 781 *rflags |= MSG_CTRUNC; 782 783 if (ctl_len > 0 && 784 (r = sockdriver_copyout(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK) 785 return r; 786 787 return ctl_len; 788 } 789 790 /* 791 * Receive data on a packet socket. 792 */ 793 int 794 pktsock_recv(struct sock * sock, const struct sockdriver_data * data, 795 size_t len, size_t * off, const struct sockdriver_data * ctl, 796 socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr, 797 socklen_t * addr_len, endpoint_t user_endpt __unused, int flags, 798 size_t min __unused, int * rflags) 799 { 800 struct pktsock *pkt = (struct pktsock *)sock; 801 struct pktaddr4 pktaddr4; 802 struct pktaddr6 pktaddr6; 803 struct pkthdr pkthdr; 804 void *pktaddr; 805 struct pbuf *pbuf; 806 ip_addr_t srcaddr; 807 int r; 808 809 if ((pbuf = pkt->pkt_rcvhead) == NULL) 810 return SUSPEND; 811 812 /* 813 * Get the ancillary data for the packet. The format of the ancillary 814 * data depends on the received packet type, which may be different 815 * from the socket type. 816 */ 817 util_pbuf_header(pbuf, sizeof(pkthdr)); 818 819 memcpy(&pkthdr, pbuf->payload, sizeof(pkthdr)); 820 821 if (pkthdr.flags & PKTHF_IPV6) { 822 util_pbuf_header(pbuf, sizeof(pktaddr6)); 823 824 memcpy(&pktaddr6, pbuf->payload, sizeof(pktaddr6)); 825 pktaddr = &pktaddr6; 826 827 ip_addr_copy_from_ip6_packed(srcaddr, pktaddr6.srcaddr); 828 if (ip6_addr_has_scope(ip_2_ip6(&srcaddr), IP6_UNICAST)) 829 ip6_addr_set_zone(ip_2_ip6(&srcaddr), pkthdr.addrif); 830 831 util_pbuf_header(pbuf, 832 -(int)(sizeof(pkthdr) + sizeof(pktaddr6))); 833 } else { 834 util_pbuf_header(pbuf, sizeof(pktaddr4)); 835 836 memcpy(&pktaddr4, pbuf->payload, sizeof(pktaddr4)); 837 pktaddr = &pktaddr4; 838 839 ip_addr_copy_from_ip4(srcaddr, pktaddr4.srcaddr); 840 841 util_pbuf_header(pbuf, 842 -(int)(sizeof(pkthdr) + sizeof(pktaddr4))); 843 } 844 845 /* Copy out the packet data to the calling user process. */ 846 if (len >= pbuf->tot_len) 847 len = pbuf->tot_len; 848 else 849 *rflags |= MSG_TRUNC; 850 851 r = util_copy_data(data, len, 0, pbuf, 0, FALSE /*copy_in*/); 852 853 if (r != OK) 854 return r; 855 856 /* Generate and copy out ancillary (control) data, if requested. */ 857 if ((r = pktsock_put_ctl(pkt, ctl, ctl_len, &pkthdr, pktaddr, 858 rflags)) < 0) 859 return r; 860 861 /* Store the source IP address. */ 862 ipsock_put_addr(&pkt->pkt_ipsock, addr, addr_len, &srcaddr, 863 pkthdr.port); 864 865 /* Set multicast or broadcast message flag, if applicable. */ 866 if (pkthdr.flags & PKTHF_MCAST) 867 *rflags |= MSG_MCAST; 868 else if (pkthdr.flags & PKTHF_BCAST) 869 *rflags |= MSG_BCAST; 870 871 /* Discard the packet now, unless we were instructed to peek only. */ 872 if (!(flags & MSG_PEEK)) 873 pktsock_dequeue(pkt); 874 875 /* Return the received part of the packet length. */ 876 *off = len; 877 *ctl_off = r; 878 return OK; 879 } 880 881 /* 882 * Test whether data can be received on a packet socket, and if so, how many 883 * bytes of data. 884 */ 885 int 886 pktsock_test_recv(struct sock * sock, size_t min __unused, size_t * size) 887 { 888 struct pktsock *pkt = (struct pktsock *)sock; 889 890 if (pkt->pkt_rcvhead == NULL) 891 return SUSPEND; 892 893 if (size != NULL) 894 *size = pkt->pkt_rcvhead->tot_len; 895 return OK; 896 } 897 898 /* 899 * The caller has performed a multicast operation on the given socket. Thus, 900 * the caller is multicast aware. Remember this, because that means the socket 901 * may also receive traffic to multicast destinations. 902 */ 903 void 904 pktsock_set_mcaware(struct pktsock * pkt) 905 { 906 907 ipsock_set_flag(&pkt->pkt_ipsock, PKTF_MCAWARE); 908 } 909 910 /* 911 * Set socket options on a packet socket. 912 */ 913 int 914 pktsock_setsockopt(struct pktsock * pkt, int level, int name, 915 const struct sockdriver_data * data, socklen_t len, 916 struct ipopts * ipopts) 917 { 918 struct ip_mreq imr; 919 struct ipv6_mreq ipv6mr; 920 struct in6_pktinfo ipi6; 921 ip_addr_t ipaddr, ifaddr; 922 struct ifdev *ifdev; 923 unsigned int flag; 924 uint32_t ifindex; 925 int r, val, has_scope; 926 927 switch (level) { 928 case IPPROTO_IP: 929 if (ipsock_is_ipv6(&pkt->pkt_ipsock)) 930 break; 931 932 switch (name) { 933 case IP_ADD_MEMBERSHIP: 934 case IP_DROP_MEMBERSHIP: 935 pktsock_set_mcaware(pkt); 936 937 if ((r = sockdriver_copyin_opt(data, &imr, sizeof(imr), 938 len)) != OK) 939 return r; 940 941 ip_addr_set_ip4_u32(&ipaddr, imr.imr_multiaddr.s_addr); 942 ip_addr_set_ip4_u32(&ifaddr, imr.imr_interface.s_addr); 943 944 if (!ip_addr_isany(&ifaddr)) { 945 ifdev = ifaddr_map_by_addr(&ifaddr); 946 947 if (ifdev == NULL) 948 return EADDRNOTAVAIL; 949 } else 950 ifdev = NULL; 951 952 if (name == IP_ADD_MEMBERSHIP) 953 r = mcast_join(&pkt->pkt_mcast, &ipaddr, 954 ifdev); 955 else 956 r = mcast_leave(&pkt->pkt_mcast, &ipaddr, 957 ifdev); 958 959 return r; 960 961 case IP_RECVTTL: 962 case IP_RECVPKTINFO: 963 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), 964 len)) != OK) 965 return r; 966 967 switch (name) { 968 case IP_RECVTTL: flag = PKTF_RECVTTL; break; 969 case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break; 970 default: flag = 0; assert(0); break; 971 } 972 973 if (val) 974 ipsock_set_flag(&pkt->pkt_ipsock, flag); 975 else 976 ipsock_clear_flag(&pkt->pkt_ipsock, flag); 977 978 return OK; 979 } 980 981 break; 982 983 case IPPROTO_IPV6: 984 if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) 985 break; 986 987 switch (name) { 988 case IPV6_JOIN_GROUP: 989 case IPV6_LEAVE_GROUP: 990 pktsock_set_mcaware(pkt); 991 992 if ((r = sockdriver_copyin_opt(data, &ipv6mr, 993 sizeof(ipv6mr), len)) != OK) 994 return r; 995 996 ip_addr_set_zero_ip6(&ipaddr); 997 memcpy(ip_2_ip6(&ipaddr)->addr, 998 &ipv6mr.ipv6mr_multiaddr, 999 sizeof(ip_2_ip6(&ipaddr)->addr)); 1000 1001 /* 1002 * We currently do not support joining IPv4 multicast 1003 * groups on IPv6 sockets. The reason for this is that 1004 * this would require decisions on what to do if the 1005 * socket is set to V6ONLY later, as well as various 1006 * additional exceptions for a case that hopefully 1007 * doesn't occur in practice anyway. 1008 */ 1009 if (ip6_addr_isipv4mappedipv6(ip_2_ip6(&ipaddr))) 1010 return EADDRNOTAVAIL; 1011 1012 has_scope = ip6_addr_has_scope(ip_2_ip6(&ipaddr), 1013 IP6_UNKNOWN); 1014 1015 if ((ifindex = ipv6mr.ipv6mr_interface) != 0) { 1016 ifdev = ifdev_get_by_index(ifindex); 1017 1018 if (ifdev == NULL) 1019 return ENXIO; 1020 1021 if (has_scope) 1022 ip6_addr_set_zone(ip_2_ip6(&ipaddr), 1023 ifindex); 1024 } else { 1025 if (has_scope) 1026 return EADDRNOTAVAIL; 1027 1028 ifdev = NULL; 1029 } 1030 1031 if (name == IPV6_JOIN_GROUP) 1032 r = mcast_join(&pkt->pkt_mcast, &ipaddr, 1033 ifdev); 1034 else 1035 r = mcast_leave(&pkt->pkt_mcast, &ipaddr, 1036 ifdev); 1037 1038 return r; 1039 1040 case IPV6_USE_MIN_MTU: 1041 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), 1042 len)) != OK) 1043 return r; 1044 1045 if (val < -1 || val > 1) 1046 return EINVAL; 1047 1048 /* 1049 * lwIP does not support path MTU discovery, so do 1050 * nothing. TODO: see if this is actually good enough. 1051 */ 1052 return OK; 1053 1054 case IPV6_PKTINFO: 1055 if ((r = sockdriver_copyin_opt(data, &ipi6, 1056 sizeof(ipi6), len)) != OK) 1057 return r; 1058 1059 /* 1060 * Simply copy in what is given. The values will be 1061 * parsed only once a packet is sent, in 1062 * pktsock_get_pktinfo(). Otherwise, if we perform 1063 * checks here, they may be outdated by the time the 1064 * values are actually used. 1065 */ 1066 memcpy(&pkt->pkt_srcaddr.addr, &ipi6.ipi6_addr, 1067 sizeof(pkt->pkt_srcaddr.addr)); 1068 pkt->pkt_ifindex = ipi6.ipi6_ifindex; 1069 1070 return OK; 1071 1072 case IPV6_RECVPKTINFO: 1073 case IPV6_RECVHOPLIMIT: 1074 case IPV6_RECVTCLASS: 1075 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), 1076 len)) != OK) 1077 return r; 1078 1079 switch (name) { 1080 case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break; 1081 case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break; 1082 case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break; 1083 default: flag = 0; assert(0); break; 1084 } 1085 1086 if (val) 1087 ipsock_set_flag(&pkt->pkt_ipsock, flag); 1088 else 1089 ipsock_clear_flag(&pkt->pkt_ipsock, flag); 1090 1091 return OK; 1092 } 1093 1094 break; 1095 } 1096 1097 return ipsock_setsockopt(&pkt->pkt_ipsock, level, name, data, len, 1098 ipopts); 1099 } 1100 1101 /* 1102 * Retrieve socket options on a packet socket. 1103 */ 1104 int 1105 pktsock_getsockopt(struct pktsock * pkt, int level, int name, 1106 const struct sockdriver_data * data, socklen_t * len, 1107 struct ipopts * ipopts) 1108 { 1109 struct in6_pktinfo ipi6; 1110 unsigned int flag; 1111 int val; 1112 1113 switch (level) { 1114 case IPPROTO_IP: 1115 if (ipsock_is_ipv6(&pkt->pkt_ipsock)) 1116 break; 1117 1118 switch (name) { 1119 case IP_RECVTTL: 1120 case IP_RECVPKTINFO: 1121 switch (name) { 1122 case IP_RECVTTL: flag = PKTF_RECVTTL; break; 1123 case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break; 1124 default: flag = 0; assert(0); break; 1125 } 1126 1127 val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag)); 1128 1129 return sockdriver_copyout_opt(data, &val, sizeof(val), 1130 len); 1131 } 1132 1133 break; 1134 1135 case IPPROTO_IPV6: 1136 if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) 1137 break; 1138 1139 switch (name) { 1140 case IPV6_USE_MIN_MTU: 1141 /* 1142 * TODO: sort out exactly what lwIP actually supports 1143 * in the way of path MTU discovery. Value 1 means 1144 * that path MTU discovery is disabled and packets are 1145 * sent at the minimum MTU (RFC 3542). 1146 */ 1147 val = 1; 1148 1149 return sockdriver_copyout_opt(data, &val, sizeof(val), 1150 len); 1151 1152 case IPV6_PKTINFO: 1153 memset(&ipi6, 0, sizeof(ipi6)); 1154 1155 /* 1156 * Simply copy out whatever was given before. These 1157 * fields are initialized to zero on socket creation. 1158 */ 1159 memcpy(&ipi6.ipi6_addr, &pkt->pkt_srcaddr.addr, 1160 sizeof(ipi6.ipi6_addr)); 1161 ipi6.ipi6_ifindex = pkt->pkt_ifindex; 1162 1163 return sockdriver_copyout_opt(data, &ipi6, 1164 sizeof(ipi6), len); 1165 1166 case IPV6_RECVPKTINFO: 1167 case IPV6_RECVHOPLIMIT: 1168 case IPV6_RECVTCLASS: 1169 switch (name) { 1170 case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break; 1171 case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break; 1172 case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break; 1173 default: flag = 0; assert(0); break; 1174 } 1175 1176 val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag)); 1177 1178 return sockdriver_copyout_opt(data, &val, sizeof(val), 1179 len); 1180 } 1181 1182 break; 1183 } 1184 1185 return ipsock_getsockopt(&pkt->pkt_ipsock, level, name, data, len, 1186 ipopts); 1187 } 1188 1189 /* 1190 * Drain the receive queue of a packet socket. 1191 */ 1192 static void 1193 pktsock_drain(struct pktsock * pkt) 1194 { 1195 1196 while (pkt->pkt_rcvhead != NULL) 1197 pktsock_dequeue(pkt); 1198 1199 assert(pkt->pkt_rcvlen == 0); 1200 assert(pkt->pkt_rcvtailp == &pkt->pkt_rcvhead); 1201 } 1202 1203 /* 1204 * Shut down a packet socket for reading and/or writing. 1205 */ 1206 void 1207 pktsock_shutdown(struct pktsock * pkt, unsigned int mask) 1208 { 1209 1210 if (mask & SFL_SHUT_RD) 1211 pktsock_drain(pkt); 1212 } 1213 1214 /* 1215 * Close a packet socket. 1216 */ 1217 void 1218 pktsock_close(struct pktsock * pkt) 1219 { 1220 1221 pktsock_drain(pkt); 1222 1223 mcast_leave_all(&pkt->pkt_mcast); 1224 } 1225 1226 /* 1227 * Return the rounded-up number of bytes in the packet socket's receive queue, 1228 * for sysctl(7). NetBSD returns the used portion of each buffer, but that 1229 * would be quite some extra effort for us (TODO). 1230 */ 1231 size_t 1232 pktsock_get_recvlen(struct pktsock * pkt) 1233 { 1234 1235 return pkt->pkt_rcvlen; 1236 } 1237