/* UNIX Domain Sockets - io.c - sending and receiving */

#include "uds.h"
#include <sys/mman.h>

/*
 * Our UDS sockets do not have a send buffer. They only have a receive buffer.
 * This receive buffer, when not empty, is split up in segments. Each segment
 * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and
 * SOCK_DGRAM) neither. There are two types of ancillary data: in-flight file
 * descriptors and sender credentials. In addition, for SOCK_DGRAM sockets,
 * the segment may contain the sender's socket path (if the sender's socket is
 * bound). Each segment has a header, containing the full segment size,
 * the size of the actual data in the segment (if any), and a flags field that
 * states which ancillary data are associated with the segment (if any). For
 * SOCK_STREAM type sockets, new data may be merged into a previous segment,
 * but only if it has no ancillary data. For the other two socket types, each
 * packet has its own header. The resulting behavior should be in line with
 * the POSIX "Socket Receive Queue" specification.
 *
 * More specifically, each segment consists of the following parts:
 * - always a five-byte header, containing a two-byte segment length (including
 *   the header, so always non-zero), a two-byte regular data length (zero or
 *   more), and a one-byte flags field which is a bitwise combination of
 *   UDS_HAS_{FDS,CRED,PATH} flags;
 * - next, if UDS_HAS_CRED is set in the segment header: a sockcred structure;
 *   since this structure is variable-size, the structure is prepended by a
 *   single byte that contains the length of the structure (excluding the byte
 *   itself, thus ranging from sizeof(struct sockcred) to UDS_MAXCREDLEN);
 * - next, if UDS_HAS_PATH is set in the segment header: a single byte that
 *   contains the length of the sender's socket path, followed by the path
 *   itself, stored without null terminator;
 * - next, if the data length is non-zero, the actual regular data.
 * If the segment is not the last in the receive buffer, it is followed by the
 * next segment immediately afterward. There is no alignment.
 *
 * It is the sender's responsibility to merge new data into the last segment
 * whenever possible, so that the receiver side never needs to consider more
 * than one segment at once. In order to allow such merging, each receive
 * buffer has not only a tail and in-use length (pointing to the head when
 * combined) but also an offset from the tail to the last header, if any. Note
 * that the receiver may over time still look at multiple segments for a single
 * request: this happens when a MSG_WAITALL request empties the buffer and then
 * blocks - the next piece of arriving data can then obviously not be merged.
 *
 * If a segment has the UDS_HAS_FDS flag set, then one or more in-flight file
 * descriptors are associated with the segment. These are stored in a separate
 * data structure, mainly to simplify cleaning up when the socket is shut down
 * for reading or closed. That structure also contains the number of file
 * descriptors associated with the current segment, so this is not stored in
 * the segment itself. As mentioned later, this may be changed in the future.
 *
 * On the sender side, there is a trade-off between fully utilizing the receive
 * buffer, and not repeatedly performing expensive actions for the same call:
 * it may be costly to determine exactly how many in-flight file descriptors
 * there will be (if any) and/or how much space is needed to store credentials.
 * We currently use the policy that we rather block/reject a send request that
 * may (just) have fit in the remaining part of the receive buffer, than obtain
 * the same information multiple times or keep state between callbacks. In
 * practice this is not expected to make a difference, especially since
 * transfer of ancillary data should be rare anyway.
 */
/*
 * The current layout of the segment header is as follows.
 *
 * The first byte contains the upper eight bits of the total segment length.
 * The second byte contains the lower eight bits of the total segment length.
 * The third byte contains the upper eight bits of the data length.
 * The fourth byte contains the lower eight bits of the data length.
 * The fifth byte is a bitmask for ancillary data associated with the segment.
 */
#define UDS_HDRLEN	5	/* size of a segment header, in bytes */

/* Bits of the fifth header byte (the segment flags field). */
#define UDS_HAS_FDS	0x01	/* segment has in-flight file descriptors */
#define UDS_HAS_CRED	0x02	/* segment has sender credentials */
#define UDS_HAS_PATH	0x04	/* segment has source socket path */

/* Largest credentials structure that may have to be stored in a segment. */
#define UDS_MAXCREDLEN	SOCKCREDSIZE(NGROUPS_MAX)

/*
 * Helpers for positions in the circular receive buffer.  All results are
 * absolute positions, wrapped to the range [0, UDS_BUF).
 */
/* Position of the buffer head: the tail plus the in-use length. */
#define uds_get_head(uds)	\
	((size_t)((uds)->uds_tail + (uds)->uds_len) % UDS_BUF)
/* Position of the header of the last segment: the tail plus 'uds_last'. */
#define uds_get_last(uds)	\
	((size_t)((uds)->uds_tail + (uds)->uds_last) % UDS_BUF)
/* Position 'pos' moved forward by 'add' bytes. */
#define uds_advance(pos,add)	(((pos) + (add)) % UDS_BUF)

/*
 * All in-flight file descriptors are (co-)owned by the UDS driver itself, as
 * local open file descriptors.  Like any other process, the UDS driver can not
 * have more than OPEN_MAX open file descriptors at any time.  Thus, this is
 * also the inherent maximum number of in-flight file descriptors.  Therefore,
 * we maintain a single pool of in-flight FD structures, and we associate these
 * structures with sockets as needed.
 */
static struct uds_fd uds_fds[OPEN_MAX];
static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds;

/*
 * Scratch space used while processing the control data of one send request:
 * the raw control data as copied in from the caller, and the file descriptor
 * numbers extracted from it.
 */
static char uds_ctlbuf[UDS_CTL_MAX];
static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)];

/*
 * Initialize the input/output part of the UDS service.
100 */ 101 void 102 uds_io_init(void) 103 { 104 unsigned int slot; 105 106 SIMPLEQ_INIT(&uds_freefds); 107 108 for (slot = 0; slot < __arraycount(uds_fds); slot++) 109 SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next); 110 } 111 112 /* 113 * Set up all input/output state for the given socket, which has just been 114 * allocated. As part of this, allocate memory for the receive buffer of the 115 * socket. Return OK or a negative error code. 116 */ 117 int 118 uds_io_setup(struct udssock * uds) 119 { 120 121 /* TODO: decide if we should preallocate the memory. */ 122 if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE, 123 MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) 124 return ENOMEM; 125 126 uds->uds_tail = 0; 127 uds->uds_len = 0; 128 uds->uds_last = 0; 129 130 SIMPLEQ_INIT(&uds->uds_fds); 131 132 return OK; 133 } 134 135 /* 136 * Clean up the input/output state for the given socket, which is about to be 137 * freed. As part of this, deallocate memory for the receive buffer and close 138 * any file descriptors still in flight on the socket. 139 */ 140 void 141 uds_io_cleanup(struct udssock * uds) 142 { 143 144 /* Close any in-flight file descriptors. */ 145 uds_io_reset(uds); 146 147 /* Free the receive buffer memory. */ 148 if (munmap(uds->uds_buf, UDS_BUF) != 0) 149 panic("UDS: munmap failed: %d", errno); 150 } 151 152 /* 153 * The socket is being closed or shut down for reading. If there are still any 154 * in-flight file descriptors, theey will never be received anymore, so close 155 * them now. 156 */ 157 void 158 uds_io_reset(struct udssock * uds) 159 { 160 struct uds_fd *ufd; 161 162 /* 163 * The UDS service may have the last and only reference to any of these 164 * file descriptors here. For that reason, we currently disallow 165 * transfer of UDS file descriptors, because the close(2) here could 166 * block on a socket close operation back to us, leading to a deadlock. 
167 * Also, we use a non-blocking variant of close(2), to prevent that we 168 * end up hanging on sockets with SO_LINGER turned on. 169 */ 170 SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) { 171 dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); 172 173 closenb(ufd->ufd_fd); 174 } 175 176 SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds); 177 178 /* 179 * If this reset happens as part of a shutdown, it might be done 180 * again on close, so ensure that it will find a clean state. The 181 * receive buffer should never be looked at again either way, but reset 182 * it too just to be sure. 183 */ 184 uds->uds_tail = 0; 185 uds->uds_len = 0; 186 uds->uds_last = 0; 187 188 SIMPLEQ_INIT(&uds->uds_fds); 189 } 190 191 /* 192 * Return the maximum usable part of the receive buffer, in bytes. The return 193 * value is used for the SO_SNDBUF and SO_RCVBUF socket options. 194 */ 195 size_t 196 uds_io_buflen(void) 197 { 198 199 /* 200 * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we 201 * could use the full receive buffer for data. This would require that 202 * we store up to one header in the socket object rather than in the 203 * receive buffer. 204 */ 205 return UDS_BUF - UDS_HDRLEN; 206 } 207 208 /* 209 * Fetch 'len' bytes starting from absolute position 'pos' into the receive 210 * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'. 211 * Return the absolute position of the first byte after the fetched data in the 212 * receive buffer. 
213 */ 214 static size_t 215 uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len) 216 { 217 size_t left; 218 219 assert(off < UDS_BUF); 220 221 left = UDS_BUF - off; 222 if (len >= left) { 223 memcpy(ptr, &uds->uds_buf[off], left); 224 225 if ((len -= left) > 0) 226 memcpy((char *)ptr + left, &uds->uds_buf[0], len); 227 228 return len; 229 } else { 230 memcpy(ptr, &uds->uds_buf[off], len); 231 232 return off + len; 233 } 234 } 235 236 /* 237 * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive 238 * buffer of socket 'uds', starting at absolute position 'pos' into the receive 239 * buffer. Return the absolute position of the first byte after the stored 240 * data in the receive buffer. 241 */ 242 static size_t 243 uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len) 244 { 245 size_t left; 246 247 assert(off < UDS_BUF); 248 249 left = UDS_BUF - off; 250 if (len >= left) { 251 memcpy(&uds->uds_buf[off], ptr, left); 252 253 if ((len -= left) > 0) 254 memcpy(&uds->uds_buf[0], (const char *)ptr + left, 255 len); 256 257 return len; 258 } else { 259 memcpy(&uds->uds_buf[off], ptr, len); 260 261 return off + len; 262 } 263 } 264 265 /* 266 * Fetch a segment header previously stored in the receive buffer of socket 267 * 'uds' at absolute position 'off'. Return the absolute position of the first 268 * byte after the header, as well as the entire segment length in 'seglen', the 269 * length of the data in the segment in 'datalen', and the segment flags in 270 * 'segflags'. 
271 */ 272 static size_t 273 uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen, 274 size_t * datalen, unsigned int * segflags) 275 { 276 unsigned char hdr[UDS_HDRLEN]; 277 278 off = uds_fetch(uds, off, hdr, sizeof(hdr)); 279 280 *seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1]; 281 *datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3]; 282 *segflags = hdr[4]; 283 284 assert(*seglen >= UDS_HDRLEN); 285 assert(*seglen <= uds->uds_len); 286 assert(*datalen <= *seglen - UDS_HDRLEN); 287 assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN); 288 assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH))); 289 290 return off; 291 } 292 293 /* 294 * Store a segment header in the receive buffer of socket 'uds' at absolute 295 * position 'off', with the segment length 'seglen', the segment data length 296 * 'datalen', and the segment flags 'segflags'. Return the absolute receive 297 * buffer position of the first data byte after the stored header. 298 */ 299 static size_t 300 uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen, 301 unsigned int segflags) 302 { 303 unsigned char hdr[UDS_HDRLEN]; 304 305 assert(seglen <= USHRT_MAX); 306 assert(datalen <= seglen); 307 assert(segflags <= UCHAR_MAX); 308 assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH))); 309 310 hdr[0] = (seglen >> 8) & 0xff; 311 hdr[1] = seglen & 0xff; 312 hdr[2] = (datalen >> 8) & 0xff; 313 hdr[3] = datalen & 0xff; 314 hdr[4] = segflags; 315 316 return uds_store(uds, off, hdr, sizeof(hdr)); 317 } 318 319 /* 320 * Perform initial checks on a send request, before it may potentially be 321 * suspended. Return OK if this send request is valid, or a negative error 322 * code if it is not. 
323 */ 324 int 325 uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused, 326 const struct sockaddr * addr, socklen_t addr_len __unused, 327 endpoint_t user_endpt __unused, int flags) 328 { 329 struct udssock *uds = (struct udssock *)sock; 330 size_t pathlen; 331 332 /* 333 * Reject calls with unknown flags. Besides the flags handled entirely 334 * by libsockevent (which are not part of 'flags' here), that is all of 335 * them. TODO: ensure that we should really reject all other flags 336 * rather than ignore them. 337 */ 338 if (flags != 0) 339 return EOPNOTSUPP; 340 341 /* 342 * Perform very basic address and message size checks on the send call. 343 * For non-stream sockets, we must reject packets that may never fit in 344 * the receive buffer, or otherwise (at least for SOCK_SEQPACKET) the 345 * send call may end up being suspended indefinitely. Therefore, we 346 * assume the worst-case scenario, which is that a full set of 347 * credentials must be associated with the packet. As a result, we may 348 * reject some large packets that could actually just fit. Checking 349 * the peer's LOCAL_CREDS setting here is not safe: even if we know the 350 * peer already at all (for SOCK_DGRAM we do not), the send may still 351 * block and the option toggled before it unblocks. 352 */ 353 switch (uds_get_type(uds)) { 354 case SOCK_STREAM: 355 /* Nothing to check for this case. */ 356 break; 357 358 case SOCK_SEQPACKET: 359 if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN) 360 return EMSGSIZE; 361 362 break; 363 364 case SOCK_DGRAM: 365 if (!uds_has_link(uds) && addr == NULL) 366 return EDESTADDRREQ; 367 368 /* 369 * The path is stored without null terminator, but with leading 370 * byte containing the path length--if there is a path at all. 
371 */ 372 pathlen = (size_t)uds->uds_pathlen; 373 if (pathlen > 0) 374 pathlen++; 375 376 if (len > UDS_BUF - UDS_HDRLEN - pathlen - 1 - UDS_MAXCREDLEN) 377 return EMSGSIZE; 378 379 break; 380 381 default: 382 assert(0); 383 } 384 385 return OK; 386 } 387 388 /* 389 * Determine whether the (real or pretend) send request should be processed 390 * now, suspended until later, or rejected based on the current socket state. 391 * Return OK if the send request should be processed now. Return SUSPEND if 392 * the send request should be retried later. Return an appropriate negative 393 * error code if the send request should fail. 394 */ 395 static int 396 uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min, 397 int partial) 398 { 399 struct udssock *conn; 400 size_t avail, hdrlen, credlen; 401 402 assert(!uds_is_shutdown(uds, SFL_SHUT_WR)); 403 404 if (uds_get_type(uds) != SOCK_DGRAM) { 405 if (uds_is_connecting(uds)) 406 return SUSPEND; 407 if (!uds_is_connected(uds) && !uds_is_disconnected(uds)) 408 return ENOTCONN; 409 if (!uds_has_conn(uds)) 410 return EPIPE; 411 412 conn = uds->uds_conn; 413 414 if (uds_is_shutdown(conn, SFL_SHUT_RD)) 415 return EPIPE; 416 417 /* 418 * For connection-type sockets, we now have to check if there 419 * is enough room in the receive buffer. For SOCK_STREAM 420 * sockets, we must check if at least 'min' bytes can be moved 421 * into the receive buffer, at least if that is a reasonable 422 * value for ever making any forward progress at all. For 423 * SOCK_SEQPACKET sockets, we must check if the entire packet 424 * of size 'len' can be stored in the receive buffer. In both 425 * cases, we must take into account any metadata to store along 426 * with the data. 427 * 428 * Unlike in uds_pre_send(), we can now check safely whether 429 * the peer is expecting credentials, but we still don't know 430 * the actual size of the credentials, so again we take the 431 * maximum possible size. 
The same applies to file descriptors 432 * transferred via control data: all we have the control length 433 * right now, which if non-zero we assume to mean there might 434 * be file descriptors. 435 * 436 * In both cases, the reason of overestimating is that actually 437 * getting accurate sizes, by obtaining credentials or copying 438 * in control data, is very costly. We want to do that only 439 * when we are sure we will not suspend the send call after 440 * all. It is no problem to overestimate how much space will 441 * be needed here, but not to underestimate: that could cause 442 * applications that use select(2) and non-blocking sockets to 443 * end up in a busy-wait loop. 444 */ 445 if (!partial && (conn->uds_flags & UDSF_PASSCRED)) 446 credlen = 1 + UDS_MAXCREDLEN; 447 else 448 credlen = 0; 449 450 avail = UDS_BUF - conn->uds_len; 451 452 if (uds_get_type(uds) == SOCK_STREAM) { 453 /* 454 * Limit the low threshold to the maximum that can ever 455 * be sent at once. 456 */ 457 if (min > UDS_BUF - UDS_HDRLEN - credlen) 458 min = UDS_BUF - UDS_HDRLEN - credlen; 459 460 /* 461 * Suspend the call only if not even the low threshold 462 * is met. Otherwise we may make (partial) progress. 463 */ 464 if (len > min) 465 len = min; 466 467 /* 468 * If the receive buffer already has at least one 469 * segment, and there are certainly no file descriptors 470 * to transfer now, and we do not have to store 471 * credentials either, then this segment can be merged 472 * with the previous one. In that case, we need no 473 * space for a header. That is certainly the case if 474 * we are resuming an already partially completed send. 475 */ 476 hdrlen = (avail == UDS_BUF || ctl_len != 0 || 477 credlen > 0) ? UDS_HDRLEN : 0; 478 } else 479 hdrlen = UDS_HDRLEN; 480 481 if (avail < hdrlen + credlen + len) 482 return SUSPEND; 483 } 484 485 return OK; 486 } 487 488 /* 489 * Get the destination peer for a send request. The send test has already been 490 * performed first. 
On success, return OK, with a pointer to the peer socket 491 * stored in 'peerp'. On failure, return an appropriate error code. 492 */ 493 static int 494 uds_send_peer(struct udssock * uds, const struct sockaddr * addr, 495 socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp) 496 { 497 struct udssock *peer; 498 int r; 499 500 if (uds_get_type(uds) == SOCK_DGRAM) { 501 if (!uds_has_link(uds)) { 502 /* This was already checked in uds_pre_check(). */ 503 assert(addr != NULL); 504 505 /* 506 * Find the socket identified by the given address. 507 * If it exists at all, see if it is a proper match. 508 */ 509 if ((r = uds_lookup(uds, addr, addr_len, user_endpt, 510 &peer)) != OK) 511 return r; 512 513 /* 514 * If the peer socket is connected to a target, it 515 * must be this socket. Unfortunately, POSIX does not 516 * specify an error code for this. We borrow Linux's. 517 */ 518 if (uds_has_link(peer) && peer->uds_link != uds) 519 return EPERM; 520 } else 521 peer = uds->uds_link; 522 523 /* 524 * If the receiving end will never receive this packet, we 525 * might as well not send it, so drop it immeiately. Indicate 526 * as such to the caller, using NetBSD's chosen error code. 527 */ 528 if (uds_is_shutdown(peer, SFL_SHUT_RD)) 529 return ENOBUFS; 530 } else { 531 assert(uds_has_conn(uds)); 532 533 peer = uds->uds_conn; 534 } 535 536 *peerp = peer; 537 return OK; 538 } 539 540 /* 541 * Generate a new segment for the current send request, or arrange things such 542 * that new data can be merged with a previous segment. As part of this, 543 * decide whether we can merge data at all. The segment will be merged if, and 544 * only if, all of the following requirements are met: 545 * 546 * 1) the socket is of type SOCK_STREAM; 547 * 2) there is a previous segment in the receive buffer; 548 * 3) there is no ancillary data for the current send request. 
549 * 550 * Also copy in regular data (if any), retrieve the sender's credentials (if 551 * needed), and copy over the source path (if applicable). However, do not yet 552 * commit the segment (or the new part to be merged), because the send request 553 * may still fail for other reasons. 554 * 555 * On success, return the length of the new segment (or, when merging, the 556 * length to be added to the last segment), as well as a flag indicating 557 * whether we are merging into the last segment in 'mergep', the length of the 558 * (new) data in the segment in 'datalenp', and the new segment's flags in 559 * 'segflagsp' (always zero when merging). Note that a return value of zero 560 * implies that we are merging zero extra bytes into the last segment, which 561 * means that effectively nothing changes; in that case the send call will be 562 * cut short and return zero to the caller as well. On failure, return a 563 * negative error code. 564 */ 565 static int 566 uds_send_data(struct udssock * uds, struct udssock * peer, 567 const struct sockdriver_data * data, size_t len, size_t off, 568 endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep, 569 size_t * __restrict datalenp, unsigned int * __restrict segflagsp) 570 { 571 struct sockcred sockcred; 572 gid_t groups[NGROUPS_MAX]; 573 iovec_t iov[2]; 574 unsigned int iovcnt, segflags; 575 unsigned char lenbyte; 576 size_t credlen, pathlen, datalen, seglen; 577 size_t avail, pos, left; 578 int r, merge; 579 580 /* 581 * At this point we should add the data to the peer's receive buffer. 582 * In the case of SOCK_STREAM sockets, we should add as much of the 583 * data as possible and suspend the call to send the rest later, if 584 * applicable. In the case of SOCK_DGRAM sockets, we should drop the 585 * packet if it does not fit in the buffer. 586 * 587 * Due to the checks in uds_can_send(), we know for sure that we no 588 * longer have to suspend without making any progress at this point. 
589 */ 590 segflags = (nfds > 0) ? UDS_HAS_FDS : 0; 591 592 /* 593 * Obtain the credentials now. Doing so allows us to determine how 594 * much space we actually need for them. 595 */ 596 if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) { 597 memset(&sockcred, 0, sizeof(sockcred)); 598 599 if ((r = getsockcred(user_endpt, &sockcred, groups, 600 __arraycount(groups))) != OK) 601 return r; 602 603 credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups); 604 605 segflags |= UDS_HAS_CRED; 606 } else 607 credlen = 0; 608 609 /* For bound source datagram sockets, include the source path. */ 610 if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) { 611 pathlen = (size_t)uds->uds_pathlen + 1; 612 613 segflags |= UDS_HAS_PATH; 614 } else 615 pathlen = 0; 616 617 avail = UDS_BUF - peer->uds_len; 618 619 if (uds_get_type(uds) == SOCK_STREAM) { 620 /* 621 * Determine whether we can merge data into the previous 622 * segment. This is a more refined version of the test in 623 * uds_can_send(), as we now know whether there are actually 624 * any FDs to transfer. 625 */ 626 merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0); 627 628 /* Determine how much we can send at once. */ 629 if (!merge) { 630 assert(avail > UDS_HDRLEN + credlen); 631 datalen = avail - UDS_HDRLEN - credlen; 632 } else 633 datalen = avail; 634 635 if (datalen > len) 636 datalen = len; 637 638 /* If we cannot make progress, we should have suspended.. */ 639 assert(datalen != 0 || len == 0); 640 } else { 641 merge = FALSE; 642 643 datalen = len; 644 } 645 assert(datalen <= len); 646 assert(datalen <= UDS_BUF); 647 648 /* 649 * Compute the total amount of space we need for the segment in the 650 * receive buffer. Given that we have done will-it-fit tests in 651 * uds_can_send() for SOCK_STREAM and SOCK_SEQPACKET, there is only one 652 * case left where the result may not fit, and that is for SOCK_DGRAM 653 * packets. In that case, we drop the packet. 
POSIX says we should 654 * throw an error in that case, and that is also what NetBSD does. 655 */ 656 if (!merge) 657 seglen = UDS_HDRLEN + credlen + pathlen + datalen; 658 else 659 seglen = datalen; 660 661 if (seglen > avail) { 662 assert(uds_get_type(uds) == SOCK_DGRAM); 663 664 /* Drop the packet, borrowing NetBSD's chosen error code. */ 665 return ENOBUFS; 666 } 667 668 /* 669 * Generate the full segment, but do not yet update the buffer head. 670 * We may still run into an error (copying in file descriptors) or even 671 * decide that nothing gets sent after all (if there are no data or 672 * file descriptors). If we are merging the new data into the previous 673 * segment, do not generate a header. 674 */ 675 pos = uds_get_head(peer); 676 677 /* Generate the header, if needed. */ 678 if (!merge) 679 pos = uds_store_hdr(peer, pos, seglen, datalen, segflags); 680 else 681 assert(segflags == 0); 682 683 /* Copy in and store the sender's credentials, if desired. */ 684 if (credlen > 0) { 685 assert(credlen >= 1 + sizeof(sockcred)); 686 assert(credlen <= UCHAR_MAX); 687 688 lenbyte = credlen - 1; 689 pos = uds_store(peer, pos, &lenbyte, 1); 690 691 if (sockcred.sc_ngroups > 0) { 692 pos = uds_store(peer, pos, &sockcred, 693 offsetof(struct sockcred, sc_groups)); 694 pos = uds_store(peer, pos, groups, 695 sockcred.sc_ngroups * sizeof(gid_t)); 696 } else 697 pos = uds_store(peer, pos, &sockcred, 698 sizeof(sockcred)); 699 } 700 701 /* Store the sender's address if any. Datagram sockets only. */ 702 if (pathlen > 0) { 703 assert(pathlen > 1); 704 assert(pathlen <= UCHAR_MAX); 705 706 lenbyte = uds->uds_pathlen; 707 pos = uds_store(peer, pos, &lenbyte, 1); 708 pos = uds_store(peer, pos, uds->uds_path, pathlen - 1); 709 } 710 711 /* Lastly, copy in the actual data (if any) from the caller. 
*/ 712 if (datalen > 0) { 713 iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos]; 714 left = UDS_BUF - pos; 715 716 if (left < datalen) { 717 assert(left > 0); 718 iov[0].iov_size = left; 719 iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0]; 720 iov[1].iov_size = datalen - left; 721 iovcnt = 2; 722 } else { 723 iov[0].iov_size = datalen; 724 iovcnt = 1; 725 } 726 727 if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK) 728 return r; 729 } 730 731 *mergep = merge; 732 *datalenp = datalen; 733 *segflagsp = segflags; 734 return seglen; 735 } 736 737 /* 738 * Copy in control data for the current send request, and extract any file 739 * descriptors to be transferred. Do not yet duplicate the file descriptors, 740 * but rather store a list in a temporary buffer: the send request may still 741 * fail in which case we want to avoid having to undo the duplication. 742 * 743 * On success, return the number of (zero or more) file descriptors extracted 744 * from the request and stored in the temporary buffer. On failure, return a 745 * negative error code. 746 */ 747 static int 748 uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len, 749 endpoint_t user_endpt) 750 { 751 struct msghdr msghdr; 752 struct cmsghdr *cmsg; 753 socklen_t left; 754 unsigned int i, n, nfds; 755 int r; 756 757 /* 758 * Copy in the control data. We can spend a lot of effort copying in 759 * the data in small chunks, and change the receiving side to do the 760 * same, but it is really not worth it: applications never send a whole 761 * lot of file descriptors at once, and the buffer size is currently 762 * such that the UDS service itself will exhaust its OPEN_MAX limit 763 * anyway if they do. 
764 */ 765 if (ctl_len > sizeof(uds_ctlbuf)) 766 return ENOBUFS; 767 768 if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK) 769 return r; 770 771 if (ctl_len < sizeof(uds_ctlbuf)) 772 memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len); 773 774 /* 775 * Look for any file descriptors, and store their remote file 776 * descriptor numbers into a temporary array. 777 */ 778 memset(&msghdr, 0, sizeof(msghdr)); 779 msghdr.msg_control = uds_ctlbuf; 780 msghdr.msg_controllen = ctl_len; 781 782 nfds = 0; 783 r = OK; 784 785 /* 786 * The sender may provide file descriptors in multiple chunks. 787 * Currently we do not preserve these chunk boundaries, instead 788 * generating one single chunk with all file descriptors for the 789 * segment upon receipt. If needed, we can fairly easily adapt this 790 * later. 791 */ 792 for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; 793 cmsg = CMSG_NXTHDR(&msghdr, cmsg)) { 794 /* 795 * Check for bogus lengths. There is no excuse for this; 796 * either the caller does not know what they are doing or we 797 * are looking at a hacking attempt. 798 */ 799 assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len); 800 left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf); 801 assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */ 802 803 if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) { 804 printf("UDS: malformed control data from %u\n", 805 user_endpt); 806 r = EINVAL; 807 break; 808 } 809 810 if (cmsg->cmsg_level != SOL_SOCKET || 811 cmsg->cmsg_type != SCM_RIGHTS) 812 continue; 813 814 n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); 815 816 for (i = 0; i < n; i++) { 817 /* 818 * Copy the file descriptor to the temporary buffer, 819 * whose size is based on the control data buffer, so 820 * it is always large enough to contain all FDs. 
821 */ 822 assert(nfds < __arraycount(uds_ctlfds)); 823 824 memcpy(&uds_ctlfds[nfds], 825 &((int *)CMSG_DATA(cmsg))[i], sizeof(int)); 826 827 nfds++; 828 } 829 } 830 831 return nfds; 832 } 833 834 /* 835 * Actually duplicate any file descriptors that we extracted from the sender's 836 * control data and stored in our temporary buffer. On success, return OK, 837 * with all file descriptors stored in file descriptor objects that are 838 * appended to the socket's list of in-flight FD objects. Thus, on success, 839 * the send request may no longer fail. On failure, return a negative error 840 * code, with any partial duplication undone. 841 */ 842 static int 843 uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt) 844 { 845 SIMPLEQ_HEAD(, uds_fd) fds; 846 struct uds_fd *ufd; 847 unsigned int i; 848 int r; 849 850 SIMPLEQ_INIT(&fds); 851 852 for (i = 0; i < nfds; i++) { 853 if (SIMPLEQ_EMPTY(&uds_freefds)) { 854 /* UDS itself may already have OPEN_MAX FDs. */ 855 r = ENFILE; 856 break; 857 } 858 859 /* 860 * The caller may have given an invalid FD, or UDS itself may 861 * unexpectedly have run out of available file descriptors etc. 862 */ 863 if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0) 864 break; 865 866 ufd = SIMPLEQ_FIRST(&uds_freefds); 867 SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next); 868 869 ufd->ufd_fd = r; 870 ufd->ufd_count = 0; 871 872 SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next); 873 874 dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r)); 875 } 876 877 /* Did we experience an error while copying in the file descriptors? */ 878 if (r < 0) { 879 /* Revert the successful copyfd() calls made so far. */ 880 SIMPLEQ_FOREACH(ufd, &fds, ufd_next) { 881 dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); 882 883 closenb(ufd->ufd_fd); 884 } 885 886 SIMPLEQ_CONCAT(&uds_freefds, &fds); 887 888 return r; 889 } 890 891 /* 892 * Success. 
If there were any file descriptors at all, add them to the 893 * peer's list of in-flight file descriptors. Assign the number of 894 * file descriptors copied in to the first file descriptor object, so 895 * that we know how many to copy out (or discard) for this segment. 896 * Also set the UDS_HAS_FDS flag on the segment. 897 */ 898 ufd = SIMPLEQ_FIRST(&fds); 899 ufd->ufd_count = nfds; 900 901 SIMPLEQ_CONCAT(&peer->uds_fds, &fds); 902 903 return OK; 904 } 905 906 /* 907 * The current send request is successful or at least has made progress. 908 * Commit the new segment or, if we decided to merge the new data into the last 909 * segment, update the header of the last segment. Also wake up the receiving 910 * side, because there will now be new data to receive. 911 */ 912 static void 913 uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen, 914 int merge, size_t seglen, unsigned int segflags) 915 { 916 size_t pos, prevseglen, prevdatalen; 917 918 /* 919 * For non-datagram sockets, credentials are sent only once after 920 * setting the LOCAL_CREDS option. After that, the option is unset. 921 */ 922 if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM) 923 peer->uds_flags &= ~UDSF_PASSCRED; 924 925 if (merge) { 926 assert(segflags == 0); 927 928 pos = uds_get_last(peer); 929 930 (void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen, 931 &segflags); 932 933 peer->uds_len += seglen; 934 assert(peer->uds_len <= UDS_BUF); 935 936 seglen += prevseglen; 937 datalen += prevdatalen; 938 assert(seglen <= UDS_BUF); 939 940 uds_store_hdr(peer, pos, seglen, datalen, segflags); 941 } else { 942 peer->uds_last = peer->uds_len; 943 944 peer->uds_len += seglen; 945 assert(peer->uds_len <= UDS_BUF); 946 } 947 948 /* Now that there are new data, wake up the receiver side. */ 949 sockevent_raise(&peer->uds_sock, SEV_RECV); 950 } 951 952 /* 953 * Process a send request. 
Return OK if the send request has successfully 954 * completed, SUSPEND if it should be tried again later, or a negative error 955 * code on failure. In all cases, the values of 'off' and 'ctl_off' must be 956 * updated if any progress has been made; if either is non-zero, libsockevent 957 * will return the partial progress rather than an error code. 958 */ 959 int 960 uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len, 961 size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len, 962 socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len, 963 endpoint_t user_endpt, int flags __unused, size_t min) 964 { 965 struct udssock *uds = (struct udssock *)sock; 966 struct udssock *peer; 967 size_t seglen, datalen = 0 /*gcc*/; 968 unsigned int nfds, segflags = 0 /*gcc*/; 969 int r, partial, merge = 0 /*gcc*/; 970 971 dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n", 972 uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len, 973 (ctl_off != NULL) ? *ctl_off : 0, flags)); 974 975 partial = (off != NULL && *off > 0); 976 977 /* 978 * First see whether we can process this send call at all right now. 979 * Most importantly, for connected sockets, if the peer's receive 980 * buffer is full, we may have to suspend the call until some space has 981 * been freed up. 982 */ 983 if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK) 984 return r; 985 986 /* 987 * Then get the peer socket. For connected sockets, this is trivial. 988 * For unconnected sockets, it may involve a lookup of the given 989 * address. 990 */ 991 if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK) 992 return r; 993 994 /* 995 * We now know for sure that we will not suspend this call without 996 * making any progress. However, the call may still fail. Copy in 997 * control data first now, so that we know whether there are any file 998 * descriptors to transfer. 
This aspect may determine whether or not 999 * we can merge data with a previous segment. Do not actually copy in 1000 * the actual file descriptors yet, because that is much harder to undo 1001 * in case of a failure later on. 1002 */ 1003 if (ctl_len > 0) { 1004 /* We process control data once, in full. */ 1005 assert(*ctl_off == 0); 1006 1007 if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0) 1008 return r; 1009 nfds = (unsigned int)r; 1010 } else 1011 nfds = 0; 1012 1013 /* 1014 * Now generate a new segment, or (if possible) merge new data into the 1015 * last segment. Since the call may still fail, prepare the segment 1016 * but do not update the buffer head yet. Note that the segment 1017 * contains not just regular data (in fact it may contain no data at 1018 * all) but (also) certain ancillary data. 1019 */ 1020 if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds, 1021 &merge, &datalen, &segflags)) <= 0) 1022 return r; 1023 seglen = (size_t)r; 1024 1025 /* 1026 * If we extracted any file descriptors from the control data earlier, 1027 * copy them over to ourselves now. The resulting in-flight file 1028 * descriptors are stored in a separate data structure. This is the 1029 * last point where the send call may actually fail. 1030 */ 1031 if (nfds > 0) { 1032 if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK) 1033 return r; 1034 } 1035 1036 /* 1037 * The transmission is now known to be (partially) successful. Commit 1038 * the new work by moving the receive buffer head. 1039 */ 1040 uds_send_advance(uds, peer, datalen, merge, seglen, segflags); 1041 1042 /* 1043 * Register the result. For stream-type sockets, the expected behavior 1044 * is that all data be sent, and so we may still have to suspend the 1045 * call after partial progress. Otherwise, we are now done. Either 1046 * way, we are done with the control data, so mark it as consumed. 
1047 */ 1048 *off += datalen; 1049 *ctl_off += ctl_len; 1050 if (uds_get_type(uds) == SOCK_STREAM && datalen < len) 1051 return SUSPEND; 1052 else 1053 return OK; 1054 } 1055 1056 /* 1057 * Test whether a send request would block. The given 'min' parameter contains 1058 * the minimum number of bytes that should be possible to send without blocking 1059 * (the low send watermark). Return SUSPEND if the send request would block, 1060 * or any other error code if it would not. 1061 */ 1062 int 1063 uds_test_send(struct sock * sock, size_t min) 1064 { 1065 struct udssock *uds = (struct udssock *)sock; 1066 1067 return uds_send_test(uds, min, 0, min, FALSE /*partial*/); 1068 } 1069 1070 /* 1071 * Perform initial checks on a receive request, before it may potentially be 1072 * suspended. Return OK if this receive request is valid, or a negative error 1073 * code if it is not. 1074 */ 1075 int 1076 uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, 1077 int flags) 1078 { 1079 1080 /* 1081 * Reject calls with unknown flags. TODO: ensure that we should really 1082 * reject all other flags rather than ignore them. 1083 */ 1084 if ((flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC)) != 0) 1085 return EOPNOTSUPP; 1086 1087 return OK; 1088 } 1089 1090 /* 1091 * Determine whether the (real or pretend) receive request should be processed 1092 * now, suspended until later, or rejected based on the current socket state. 1093 * Return OK if the receive request should be processed now, along with a first 1094 * indication whether the call may still be suspended later in 'may_block'. 1095 * Return SUSPEND if the receive request should be retried later. Return an 1096 * appropriate negative error code if the receive request should fail. 
1097 */ 1098 static int 1099 uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial, 1100 int * may_block) 1101 { 1102 size_t seglen, datalen; 1103 unsigned int segflags; 1104 int r; 1105 1106 /* 1107 * If there are any pending data, those should always be received 1108 * first. However, if there is nothing to receive, then whether we 1109 * should suspend the receive call or fail immediately depends on other 1110 * conditions. We first look at these other conditions. 1111 */ 1112 r = OK; 1113 1114 if (uds_get_type(uds) != SOCK_DGRAM) { 1115 if (uds_is_connecting(uds)) 1116 r = SUSPEND; 1117 else if (!uds_is_connected(uds) && !uds_is_disconnected(uds)) 1118 r = ENOTCONN; 1119 else if (!uds_has_conn(uds) || 1120 uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR)) 1121 r = SOCKEVENT_EOF; 1122 } 1123 1124 if (uds->uds_len == 0) { 1125 /* 1126 * For stream-type sockets, we use the policy: if no regular 1127 * data is requested, then end the call without receiving 1128 * anything. For packet-type sockets, the request should block 1129 * until there is a packet to discard, though. 1130 */ 1131 if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0)) 1132 return r; 1133 1134 return SUSPEND; 1135 } 1136 1137 /* 1138 * For stream-type sockets, we should still suspend the call if fewer 1139 * than 'min' bytes are available right now, and there is a possibility 1140 * that more data may arrive later. More may arrive later iff 'r' is 1141 * OK (i.e., no EOF or error will follow) and, in case we already 1142 * received some partial results, there is not already a next segment 1143 * with ancillary data (i.e, nonzero segment flags), or in any case 1144 * there isn't more than one segment in the buffer. Limit 'min' to the 1145 * maximum that can ever be received, though. Since that is difficult 1146 * in our case, we check whether the buffer is entirely full instead. 
1147 */ 1148 if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 && 1149 uds->uds_len < UDS_BUF) { 1150 assert(uds->uds_len >= UDS_HDRLEN); 1151 1152 (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen, 1153 &segflags); 1154 1155 if (datalen < min && seglen == uds->uds_len && 1156 (!partial || segflags == 0)) 1157 return SUSPEND; 1158 } 1159 1160 /* 1161 * Also start the decision process as to whether we should suspend the 1162 * current call if MSG_WAITALL is given. Unfortunately there is no one 1163 * place where we can conveniently do all the required checks. 1164 */ 1165 if (may_block != NULL) 1166 *may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM); 1167 return OK; 1168 } 1169 1170 /* 1171 * Receive regular data, and possibly the source path, from the tail segment in 1172 * the receive buffer. On success, return the positive non-zero length of the 1173 * tail segment, with 'addr' and 'addr_len' modified to store the source 1174 * address if applicable, the result flags in 'rflags' updated as appropriate, 1175 * the tail segment's data length stored in 'datalen', the number of received 1176 * regular data bytes stored in 'reslen', the segment flags stored in 1177 * 'segflags', and the absolute receive buffer position of the credentials in 1178 * the segment stored in 'credpos' if applicable. Since the receive call may 1179 * still fail, this function must not yet update the tail or any other aspect 1180 * of the receive buffer. Return zero if the current receive call was already 1181 * partially successful (due to MSG_WAITALL) and can no longer make progress, 1182 * and thus should be ended. Return a negative error code on failure. 
1183 */ 1184 static int 1185 uds_recv_data(struct udssock * uds, const struct sockdriver_data * data, 1186 size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len, 1187 int * __restrict rflags, size_t * __restrict datalen, 1188 size_t * __restrict reslen, unsigned int * __restrict segflags, 1189 size_t * __restrict credpos) 1190 { 1191 iovec_t iov[2]; 1192 unsigned char lenbyte; 1193 unsigned int iovcnt; 1194 size_t pos, seglen, left; 1195 int r; 1196 1197 pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags); 1198 1199 /* 1200 * If a partially completed receive now runs into a segment that cannot 1201 * be logically merged with the previous one (because it has at least 1202 * one segment flag set, meaning it has ancillary data), then we must 1203 * shortcut the receive now. 1204 */ 1205 if (off != 0 && *segflags != 0) 1206 return OK; 1207 1208 /* 1209 * As stated, for stream-type sockets, we choose to ignore zero-size 1210 * receive calls. This has the consequence that reading a zero-sized 1211 * segment (with ancillary data) requires a receive request for at 1212 * least one regular data byte. Such a receive call would then return 1213 * zero. The problem with handling zero-data receive requests is that 1214 * we need to know whether the current segment is terminated (i.e., no 1215 * more data can possibly be merged into it later), which is a test 1216 * that we rather not perform, not in the least because we do not know 1217 * whether there is an error pending on the socket. 1218 * 1219 * For datagrams, we currently allow a zero-size receive call to 1220 * discard the next datagram. 1221 * 1222 * TODO: compare this against policies on other platforms. 1223 */ 1224 if (len == 0 && uds_get_type(uds) == SOCK_STREAM) 1225 return OK; 1226 1227 /* 1228 * We have to skip the credentials for now: these are copied out as 1229 * control data, and thus will (well, may) be looked at when dealing 1230 * with the control data. 
For the same reason, we do not even look at 1231 * UDS_HAS_FDS here. 1232 */ 1233 if (*segflags & UDS_HAS_CRED) { 1234 *credpos = pos; 1235 1236 pos = uds_fetch(uds, pos, &lenbyte, 1); 1237 pos = uds_advance(pos, (size_t)lenbyte); 1238 } 1239 1240 /* 1241 * Copy out the source address, but only if the (datagram) socket is 1242 * not connected. TODO: even when it is connected, it may still 1243 * receive packets sent to it from other sockets *before* being 1244 * connected, and the receiver has no way of knowing that those packets 1245 * did not come from its new peer. Ideally, the older packets should 1246 * be dropped.. 1247 */ 1248 if (*segflags & UDS_HAS_PATH) { 1249 pos = uds_fetch(uds, pos, &lenbyte, 1); 1250 1251 if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds)) 1252 uds_make_addr((const char *)&uds->uds_buf[pos], 1253 (size_t)lenbyte, addr, addr_len); 1254 1255 pos = uds_advance(pos, (size_t)lenbyte); 1256 } 1257 1258 /* 1259 * We can receive no more data than those that are present in the 1260 * segment, obviously. For stream-type sockets, any more data that 1261 * could have been received along with the current data would have been 1262 * merged in the current segment, so we need not search for any next 1263 * segments. 1264 * 1265 * For non-stream sockets, the caller may receive less than a whole 1266 * packet if it supplied a small buffer. In that case, the rest of the 1267 * packet will be discarded (but not here yet!) and the caller gets 1268 * the MSG_TRUNC flag in its result, if it was using sendmsg(2) anyway. 1269 */ 1270 if (len > *datalen) 1271 len = *datalen; 1272 else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM) 1273 *rflags |= MSG_TRUNC; 1274 1275 /* Copy out the data to the caller. 
*/ 1276 if (len > 0) { 1277 iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos]; 1278 left = UDS_BUF - pos; 1279 1280 if (left < len) { 1281 iov[0].iov_size = left; 1282 iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0]; 1283 iov[1].iov_size = len - left; 1284 iovcnt = 2; 1285 } else { 1286 iov[0].iov_size = len; 1287 iovcnt = 1; 1288 } 1289 1290 if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK) 1291 return r; 1292 } 1293 1294 *reslen = len; 1295 assert(seglen > 0 && seglen <= INT_MAX); 1296 return (int)seglen; 1297 } 1298 1299 /* 1300 * The current segment has associated file descriptors. If possible, copy out 1301 * all file descriptors to the receiver, and generate and copy out a chunk of 1302 * control data that contains their file descriptor numbers. If not all 1303 * file descriptors fit in the receiver's buffer, or if any error occurs, no 1304 * file descriptors are copied out. 1305 */ 1306 static int 1307 uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl, 1308 socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags) 1309 { 1310 struct msghdr msghdr; 1311 struct cmsghdr *cmsg; 1312 struct uds_fd *ufd; 1313 unsigned int i, nfds; 1314 socklen_t chunklen, chunkspace; 1315 int r, fd, what; 1316 1317 /* See how many file descriptors should be part of this chunk. */ 1318 assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); 1319 ufd = SIMPLEQ_FIRST(&uds->uds_fds); 1320 nfds = ufd->ufd_count; 1321 assert(nfds > 0); 1322 1323 /* 1324 * We produce and copy out potentially unaligned chunks, using 1325 * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE. 1326 * This may leave "gap" bytes unchanged in userland, but that should 1327 * not be a problem. By producing unaligned chunks, we eliminate a 1328 * potential boundary case where the unaligned chunk passed in (by the 1329 * sender) no longer fits in the same buffer after being aligned here. 
1330 */ 1331 chunklen = CMSG_LEN(sizeof(int) * nfds); 1332 chunkspace = CMSG_SPACE(sizeof(int) * nfds); 1333 assert(chunklen <= sizeof(uds_ctlbuf)); 1334 if (chunklen > ctl_len) 1335 return 0; /* chunk would not fit, so produce nothing instead */ 1336 if (chunkspace > ctl_len) 1337 chunkspace = ctl_len; 1338 1339 memset(&msghdr, 0, sizeof(msghdr)); 1340 msghdr.msg_control = uds_ctlbuf; 1341 msghdr.msg_controllen = sizeof(uds_ctlbuf); 1342 1343 memset(uds_ctlbuf, 0, chunklen); 1344 cmsg = CMSG_FIRSTHDR(&msghdr); 1345 cmsg->cmsg_len = chunklen; 1346 cmsg->cmsg_level = SOL_SOCKET; 1347 cmsg->cmsg_type = SCM_RIGHTS; 1348 1349 /* 1350 * Copy the group's local file descriptors to the target endpoint, and 1351 * store the resulting remote file descriptors in the chunk buffer. 1352 */ 1353 r = OK; 1354 1355 for (i = 0; i < nfds; i++) { 1356 assert(ufd != SIMPLEQ_END(&uds->uds_fds)); 1357 assert(i == 0 || ufd->ufd_count == 0); 1358 1359 what = COPYFD_TO; 1360 if (flags & MSG_CMSG_CLOEXEC) 1361 what |= COPYFD_CLOEXEC; 1362 1363 /* Failure may happen legitimately here (e.g., EMFILE). */ 1364 if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0) 1365 break; /* we keep our progress so far in 'i' */ 1366 1367 fd = r; 1368 1369 dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd)); 1370 1371 memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int)); 1372 1373 ufd = SIMPLEQ_NEXT(ufd, ufd_next); 1374 } 1375 1376 /* If everything went well so far, copy out the produced chunk. */ 1377 if (r >= 0) 1378 r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen); 1379 1380 /* 1381 * Handle errors. At this point, the 'i' variable contains the number 1382 * of file descriptors that have already been successfully copied out. 1383 */ 1384 if (r < 0) { 1385 /* Revert the successful copyfd() calls made so far. 
*/ 1386 while (i-- > 0) { 1387 memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int)); 1388 1389 (void)copyfd(user_endpt, fd, COPYFD_CLOSE); 1390 } 1391 1392 return r; 1393 } 1394 1395 /* 1396 * Success. Return the aligned size of the produced chunk, if the 1397 * given length permits it. From here on, the receive call may no 1398 * longer fail, as that would result in lost file descriptors. 1399 */ 1400 return chunkspace; 1401 } 1402 1403 /* 1404 * Generate and copy out a chunk of control data with the sender's credentials. 1405 * Return the aligned chunk size on success, or a negative error code on 1406 * failure. 1407 */ 1408 static int 1409 uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl, 1410 socklen_t ctl_len, socklen_t ctl_off, size_t credpos) 1411 { 1412 struct msghdr msghdr; 1413 struct cmsghdr *cmsg; 1414 socklen_t chunklen, chunkspace; 1415 unsigned char lenbyte; 1416 size_t credlen; 1417 int r; 1418 1419 /* 1420 * Since the sender side already did the hard work of producing the 1421 * (variable-size) sockcred structure as it should be received, there 1422 * is relatively little work to be done here. 
1423 */ 1424 credpos = uds_fetch(uds, credpos, &lenbyte, 1); 1425 credlen = (size_t)lenbyte; 1426 1427 chunklen = CMSG_LEN(credlen); 1428 chunkspace = CMSG_SPACE(credlen); 1429 assert(chunklen <= sizeof(uds_ctlbuf)); 1430 if (chunklen > ctl_len) 1431 return 0; /* chunk would not fit, so produce nothing instead */ 1432 if (chunkspace > ctl_len) 1433 chunkspace = ctl_len; 1434 1435 memset(&msghdr, 0, sizeof(msghdr)); 1436 msghdr.msg_control = uds_ctlbuf; 1437 msghdr.msg_controllen = sizeof(uds_ctlbuf); 1438 1439 memset(uds_ctlbuf, 0, chunklen); 1440 cmsg = CMSG_FIRSTHDR(&msghdr); 1441 cmsg->cmsg_len = chunklen; 1442 cmsg->cmsg_level = SOL_SOCKET; 1443 cmsg->cmsg_type = SCM_CREDS; 1444 1445 uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen); 1446 1447 if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK) 1448 return r; 1449 1450 return chunkspace; 1451 } 1452 1453 /* 1454 * Copy out control data for the ancillary data associated with the current 1455 * segment, if any. Return OK on success, at which point the current receive 1456 * call may no longer fail. 'rflags' may be updated with additional result 1457 * flags. Return a negative error code on failure. 1458 */ 1459 static int 1460 uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl, 1461 socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt, 1462 int flags, unsigned int segflags, size_t credpos, int * rflags) 1463 { 1464 int r; 1465 1466 /* 1467 * We first copy out all file descriptors, if any. We put them in one 1468 * SCM_RIGHTS chunk, even if the sender put them in separate SCM_RIGHTS 1469 * chunks. We believe that this should not cause application-level 1470 * issues, but if it does, we can change that later with some effort. 1471 * We then copy out credentials, if any. 1472 * 1473 * We copy out each control chunk independently of the others, and also 1474 * perform error recovery on a per-chunk basis. This implies the 1475 * following. 
If producing or copying out the first chunk fails, the 1476 * entire recvmsg(2) call will fail with an appropriate error. If 1477 * producing or copying out any subsequent chunk fails, the recvmsg(2) 1478 * call will still return the previously generated chunks (a "short 1479 * control read" if you will) as well as the MSG_CTRUNC flag. This 1480 * approach is simple and clean, and it guarantees that we can always 1481 * copy out at least as many file descriptors as we copied in for this 1482 * segment, even if credentials are present as well. However, the 1483 * approach does cause slightly more overhead when there are multiple 1484 * chunks per call, as those are copied out separately. 1485 * 1486 * Since the generated SCM_RIGHTS chunk is never larger than the 1487 * originally received SCM_RIGHTS chunk, the temporary "uds_ctlbuf" 1488 * buffer is always large enough to contain the chunk in its entirety. 1489 * SCM_CREDS chunks should always fit easily as well. 1490 * 1491 * The MSG_CTRUNC flag will be returned iff not the entire user-given 1492 * control buffer was filled and not all control chunks were delivered. 1493 * Our current implementation does not deliver partial chunks. NetBSD 1494 * does, except for SCM_RIGHTS chunks. 1495 * 1496 * TODO: get rid of the redundancy in processing return values. 1497 */ 1498 if (segflags & UDS_HAS_FDS) { 1499 r = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt, 1500 flags); 1501 1502 /* 1503 * At this point, 'r' contains one of the following: 1504 * 1505 * r > 0 a chunk of 'r' bytes was added successfully. 1506 * r == 0 not enough space left; the chunk was not added. 1507 * r < 0 an error occurred; the chunk was not added. 1508 */ 1509 if (r < 0 && *ctl_off == 0) 1510 return r; 1511 1512 if (r > 0) { 1513 ctl_len -= r; 1514 *ctl_off += r; 1515 } else 1516 *rflags |= MSG_CTRUNC; 1517 } 1518 1519 if (segflags & UDS_HAS_CRED) { 1520 r = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos); 1521 1522 /* As above. 
*/ 1523 if (r < 0 && *ctl_off == 0) 1524 return r; 1525 1526 if (r > 0) { 1527 ctl_len -= r; 1528 *ctl_off += r; 1529 } else 1530 *rflags |= MSG_CTRUNC; 1531 } 1532 1533 return OK; 1534 } 1535 1536 /* 1537 * The current receive request is successful or, in the case of MSG_WAITALL, 1538 * has made progress. Advance the receive buffer tail, either by discarding 1539 * the entire tail segment or by generating a new, smaller tail segment that 1540 * contains only the regular data left to be received from the original tail 1541 * segment. Also wake up the sending side for connection-oriented sockets if 1542 * applicable, because there may now be room for more data to be sent. Update 1543 * 'may_block' if we are now sure that the call may not block on MSG_WAITALL 1544 * after all. 1545 */ 1546 static void 1547 uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen, 1548 size_t reslen, unsigned int segflags, int * may_block) 1549 { 1550 struct udssock *conn; 1551 struct uds_fd *ufd; 1552 size_t delta, nseglen, advance; 1553 unsigned int nfds; 1554 1555 /* Note that 'reslen' may be legitimately zero. */ 1556 assert(reslen <= datalen); 1557 1558 if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen) 1559 reslen = datalen; 1560 1561 delta = datalen - reslen; 1562 1563 if (delta == 0) { 1564 /* 1565 * Fully consume the tail segment. We advance the tail by the 1566 * full segment length, thus moving up to either the next 1567 * segment in the receive buffer, or an empty receive buffer. 1568 */ 1569 advance = seglen; 1570 1571 uds->uds_tail = uds_advance(uds->uds_tail, advance); 1572 } else { 1573 /* 1574 * Partially consume the tail segment. We put a new segment 1575 * header right in front of the remaining data, which obviously 1576 * always fits. Since any ancillary data was consumed along 1577 * with the first data byte of the segment, the new segment has 1578 * no ancillary data anymore (and thus a zero flags field). 
1579 */ 1580 nseglen = UDS_HDRLEN + delta; 1581 assert(nseglen < seglen); 1582 1583 advance = seglen - nseglen; 1584 1585 uds->uds_tail = uds_advance(uds->uds_tail, advance); 1586 1587 uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0); 1588 } 1589 1590 /* 1591 * For datagram-oriented sockets, we always consume at least a header. 1592 * For stream-type sockets, we either consume a zero-data segment along 1593 * with its ancillary data, or we consume at least one byte from a 1594 * segment that does have regular data. In all other cases, the 1595 * receive call has already been ended by now. Thus, we always advance 1596 * the tail of the receive buffer here. 1597 */ 1598 assert(advance > 0); 1599 1600 /* 1601 * The receive buffer's used length (uds_len) and pointer to the 1602 * previous segment header (uds_last) are offsets from the tail. Now 1603 * that we have moved the tail, we need to adjust these accordingly. 1604 * If the buffer is now empty, reset the tail to the buffer start so as 1605 * to avoid splitting inter-process copies whenever possible. 1606 */ 1607 assert(uds->uds_len >= advance); 1608 uds->uds_len -= advance; 1609 1610 if (uds->uds_len == 0) 1611 uds->uds_tail = 0; 1612 1613 /* 1614 * If uds_last is zero here, it was pointing to the segment we just 1615 * (partially) consumed. By leaving it zero, it will still point to 1616 * the new or next segment. 1617 */ 1618 if (uds->uds_last > 0) { 1619 assert(uds->uds_len > 0); 1620 assert(uds->uds_last >= advance); 1621 uds->uds_last -= advance; 1622 } 1623 1624 /* 1625 * If there were any file descriptors associated with this segment, 1626 * close and free them now. 
1627 */ 1628 if (segflags & UDS_HAS_FDS) { 1629 assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); 1630 ufd = SIMPLEQ_FIRST(&uds->uds_fds); 1631 nfds = ufd->ufd_count; 1632 assert(nfds > 0); 1633 1634 while (nfds-- > 0) { 1635 assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); 1636 ufd = SIMPLEQ_FIRST(&uds->uds_fds); 1637 SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next); 1638 1639 dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); 1640 1641 closenb(ufd->ufd_fd); 1642 1643 SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next); 1644 } 1645 } 1646 1647 /* 1648 * If there is now any data left in the receive buffer, then there has 1649 * been a reason that we haven't received it. For stream sockets, that 1650 * reason is that the next segment has ancillary data. In any case, 1651 * this means we should never block the current receive operation 1652 * waiting for more data. Otherwise, we may block on MSG_WAITALL. 1653 */ 1654 if (uds->uds_len > 0) 1655 *may_block = FALSE; 1656 1657 /* 1658 * If the (non-datagram) socket has a peer that is not shut down for 1659 * writing, see if it can be woken up to send more data. Note that 1660 * the event will never be processed immediately. 1661 */ 1662 if (uds_is_connected(uds)) { 1663 assert(uds_get_type(uds) != SOCK_DGRAM); 1664 1665 conn = uds->uds_conn; 1666 1667 if (!uds_is_shutdown(conn, SFL_SHUT_WR)) 1668 sockevent_raise(&conn->uds_sock, SEV_SEND); 1669 } 1670 } 1671 1672 /* 1673 * Process a receive request. Return OK if the receive request has completed 1674 * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an 1675 * end-of-file condition is reached, or a negative error code on failure. In 1676 * all cases, the values of 'off' and 'ctl_off' must be updated if any progress 1677 * has been made; if either is non-zero, libsockevent will return the partial 1678 * progress rather than an error code or EOF. 
1679 */ 1680 int 1681 uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len, 1682 size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len, 1683 socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len, 1684 endpoint_t user_endpt, int flags, size_t min, int * rflags) 1685 { 1686 struct udssock *uds = (struct udssock *)sock; 1687 size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/; 1688 unsigned int segflags; 1689 int r, partial, may_block; 1690 1691 dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n", 1692 uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len, 1693 (ctl_off != NULL) ? *ctl_off : 0, flags)); 1694 1695 /* 1696 * Start by testing whether anything can be received at all, or whether 1697 * an error or EOF should be returned instead, or whether the receive 1698 * call should be suspended until later otherwise. If no (regular or 1699 * control) data can be received, or if this was a test for select, 1700 * we bail out right after. 1701 */ 1702 partial = (off != NULL && *off > 0); 1703 1704 if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK) 1705 return r; 1706 1707 /* 1708 * Copy out regular data, if any. Do this before copying out control 1709 * data, because the latter is harder to undo on failure. This data 1710 * copy function returns returns OK (0) if we are to return a result of 1711 * zero bytes (which is *not* EOF) to the caller without doing anything 1712 * else. The function returns a nonzero positive segment length if we 1713 * should carry on with the receive call (as it happens, all its other 1714 * returned values may in fact be zero). 1715 */ 1716 if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags, 1717 &datalen, &reslen, &segflags, &credpos)) <= 0) 1718 return r; 1719 seglen = (size_t)r; 1720 1721 /* 1722 * Copy out control data, if any: transfer and copy out records of file 1723 * descriptors, and/or copy out sender credentials. 
This is the last 1724 * part of the call that may fail. 1725 */ 1726 if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags, 1727 segflags, credpos, rflags)) != OK) 1728 return r; 1729 1730 /* 1731 * Now that the call has succeeded, move the tail of the receive 1732 * buffer, unless we were merely peeking. 1733 */ 1734 if (!(flags & MSG_PEEK)) 1735 uds_recv_advance(uds, seglen, datalen, reslen, segflags, 1736 &may_block); 1737 else 1738 may_block = FALSE; 1739 1740 /* 1741 * If the MSG_WAITALL flag was given, we may still have to suspend the 1742 * call after partial success. In particular, the receive call may 1743 * suspend after partial success if all of these conditions are met: 1744 * 1745 * 1) the socket is a stream-type socket; 1746 * 2) MSG_WAITALL is set; 1747 * 3) MSG_PEEK is not set; 1748 * 4) MSG_DONTWAIT is not set (tested upon return); 1749 * 5) the socket must not have a pending error (tested upon return); 1750 * 6) the socket must not be shut down for reading (tested later); 1751 * 7) the socket must still be connected to a peer (no EOF); 1752 * 8) the peer must not have been shut down for writing (no EOF); 1753 * 9) the next segment, if any, contains no ancillary data. 1754 * 1755 * Together, these points guarantee that the call could conceivably 1756 * receive more after being resumed. Points 4 to 6 are covered by 1757 * libsockevent, which will end the call even if we return SUSPEND 1758 * here. Due to segment merging, we cover point 9 by checking that 1759 * there is currently no next segment at all. Once a new segment 1760 * arrives, the ancillary-data test is done then. 1761 */ 1762 *off += reslen; 1763 if ((flags & MSG_WAITALL) && reslen < len && may_block) 1764 return SUSPEND; 1765 else 1766 return OK; 1767 } 1768 1769 /* 1770 * Test whether a receive request would block. The given 'min' parameter 1771 * contains the minimum number of bytes that should be possible to receive 1772 * without blocking (the low receive watermark). 
Return SUSPEND if the send 1773 * request would block. Otherwise, return any other error code (including OK 1774 * or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it should be filled 1775 * with the number of bytes available for receipt right now (if not zero). 1776 * Note that if 'size' is not NULL, 'min' will always be zero. 1777 */ 1778 int 1779 uds_test_recv(struct sock * sock, size_t min, size_t * size) 1780 { 1781 struct udssock *uds = (struct udssock *)sock; 1782 size_t seglen; 1783 unsigned int segflags; 1784 int r; 1785 1786 if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/, 1787 NULL /*may_block*/)) == SUSPEND) 1788 return r; 1789 1790 if (size != NULL && uds->uds_len > 0) 1791 (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size, 1792 &segflags); 1793 1794 return r; 1795 } 1796