xref: /minix/minix/net/uds/io.c (revision 27852ebe)
1 /* UNIX Domain Sockets - io.c - sending and receiving */
2 
3 #include "uds.h"
4 #include <sys/mman.h>
5 
6 /*
7  * Our UDS sockets do not have a send buffer.  They only have a receive buffer.
8  * This receive buffer, when not empty, is split up in segments.  Each segment
9  * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and
10  * SOCK_DGRAM) neither.  There are two types of ancillary data: in-flight file
11  * descriptors and sender credentials.  In addition, for SOCK_DGRAM sockets,
12  * the segment may contain the sender's socket path (if the sender's socket is
13  * bound).  Each segment has a header, containing the full segment size,
14  * the size of the actual data in the segment (if any), and a flags field that
15  * states which ancillary data are associated with the segment (if any).  For
16  * SOCK_STREAM type sockets, new data may be merged into a previous segment,
17  * but only if it has no ancillary data.  For the other two socket types, each
18  * packet has its own header.  The resulting behavior should be in line with
19  * the POSIX "Socket Receive Queue" specification.
20  *
21  * More specifically, each segment consists of the following parts:
22  * - always a five-byte header, containing a two-byte segment length (including
23  *   the header, so always non-zero), a two-byte regular data length (zero or
24  *   more), and a one-byte flags field which is a bitwise combination of
25  *   UDS_HAS_{FDS,CRED,PATH} flags;
26  * - next, if UDS_HAS_CRED is set in the segment header: a sockcred structure;
27  *   since this structure is variable-size, the structure is prepended by a
28  *   single byte that contains the length of the structure (excluding the byte
29  *   itself, thus ranging from sizeof(struct sockcred) to UDS_MAXCREDLEN);
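30  * - next, if UDS_HAS_PATH is set in the segment header: the sender's socket path, stored without null terminator and prepended by a single byte that contains the length of the path (excluding the byte itself);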
31  * - next, if the data length is non-zero, the actual regular data.
32  * If the segment is not the last in the receive buffer, it is followed by the
33  * next segment immediately afterward.  There is no alignment.
34  *
35  * It is the sender's responsibility to merge new data into the last segment
36  * whenever possible, so that the receiver side never needs to consider more
37  * than one segment at once.  In order to allow such merging, each receive
38  * buffer has not only a tail and in-use length (pointing to the head when
39  * combined) but also an offset from the tail to the last header, if any.  Note
40  * that the receiver may over time still look at multiple segments for a single
41  * request: this happens when a MSG_WAITALL request empties the buffer and then
42  * blocks - the next piece of arriving data can then obviously not be merged.
43  *
44  * If a segment has the UDS_HAS_FDS flag set, then one or more in-flight file
45  * descriptors are associated with the segment.  These are stored in a separate
46  * data structure, mainly to simplify cleaning up when the socket is shut down
47  * for reading or closed.  That structure also contains the number of file
48  * descriptors associated with the current segment, so this is not stored in
49  * the segment itself.  As mentioned later, this may be changed in the future.
50  *
51  * On the sender side, there is a trade-off between fully utilizing the receive
52  * buffer, and not repeatedly performing expensive actions for the same call:
53  * it may be costly to determine exactly how many in-flight file descriptors
54  * there will be (if any) and/or how much space is needed to store credentials.
55  * We currently use the policy that we rather block/reject a send request that
56  * may (just) have fit in the remaining part of the receive buffer, than obtain
57  * the same information multiple times or keep state between callbacks.  In
58  * practice this is not expected to make a difference, especially since
59  * transfer of ancillary data should be rare anyway.
60  */
61 /*
62  * The current layout of the segment header is as follows.
63  *
64  * The first byte contains the upper eight bits of the total segment length.
65  * The second byte contains the lower eight bits of the total segment length.
66  * The third byte contains the upper eight bits of the data length.
67  * The fourth byte contains the lower eight bits of the data length.
68  * The fifth byte is a bitmask for ancillary data associated with the segment.
69  */
/* Size in bytes of the header that starts every receive buffer segment. */
#define UDS_HDRLEN		5

/* Segment header flag bits: which ancillary parts the segment carries. */
#define UDS_HAS_FDS		0x01	/* in-flight file descriptors */
#define UDS_HAS_CRED		0x02	/* sender credentials */
#define UDS_HAS_PATH		0x04	/* source socket path */

/* Upper bound on the size of a sockcred structure stored in a segment. */
#define UDS_MAXCREDLEN		SOCKCREDSIZE(NGROUPS_MAX)

/* Absolute buffer position of the head: the tail plus the in-use length. */
#define uds_get_head(sock)	\
	((size_t)((sock)->uds_tail + (sock)->uds_len) % UDS_BUF)

/* Absolute buffer position of the header of the last segment. */
#define uds_get_last(sock)	\
	((size_t)((sock)->uds_tail + (sock)->uds_last) % UDS_BUF)

/* Move a buffer position forward by 'count' bytes, wrapping at UDS_BUF. */
#define uds_advance(from, count)	(((from) + (count)) % UDS_BUF)
83 
/*
 * All in-flight file descriptors are (co-)owned by the UDS driver itself, as
 * local open file descriptors.  Like any other process, the UDS driver can not
 * have more than OPEN_MAX open file descriptors at any time.  Thus, this is
 * also the inherent maximum number of in-flight file descriptors.  Therefore,
 * we maintain a single pool of in-flight FD structures, and we associate these
 * structures with sockets as needed.
 *
 * 'uds_fds' is the pool itself.  'uds_freefds' is the list of pool entries
 * that are not currently associated with any socket: uds_io_init() puts all
 * entries on it, uds_send_fds() takes entries off it, and uds_io_reset() gives
 * a socket's entries back.
 */
static struct uds_fd uds_fds[OPEN_MAX];
static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds;

/*
 * Temporary storage used while processing one send request.  'uds_ctlbuf'
 * receives the control data copied in from the sender by uds_send_ctl().
 * 'uds_ctlfds' receives the file descriptor numbers extracted from that
 * control data, and is later read by uds_send_fds().  Its size is derived from
 * the control buffer size so that it can hold all descriptors that fit in the
 * control buffer.
 */
static char uds_ctlbuf[UDS_CTL_MAX];
static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)];
97 
98 /*
99  * Initialize the input/output part of the UDS service.
100  */
101 void
102 uds_io_init(void)
103 {
104 	unsigned int slot;
105 
106 	SIMPLEQ_INIT(&uds_freefds);
107 
108 	for (slot = 0; slot < __arraycount(uds_fds); slot++)
109 		SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next);
110 }
111 
112 /*
113  * Set up all input/output state for the given socket, which has just been
114  * allocated.  As part of this, allocate memory for the receive buffer of the
115  * socket.  Return OK or a negative error code.
116  */
117 int
118 uds_io_setup(struct udssock * uds)
119 {
120 
121 	/* TODO: decide if we should preallocate the memory. */
122 	if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE,
123 	    MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
124 		return ENOMEM;
125 
126 	uds->uds_tail = 0;
127 	uds->uds_len = 0;
128 	uds->uds_last = 0;
129 
130 	SIMPLEQ_INIT(&uds->uds_fds);
131 
132 	return OK;
133 }
134 
135 /*
136  * Clean up the input/output state for the given socket, which is about to be
137  * freed.  As part of this, deallocate memory for the receive buffer and close
138  * any file descriptors still in flight on the socket.
139  */
140 void
141 uds_io_cleanup(struct udssock * uds)
142 {
143 
144 	/* Close any in-flight file descriptors. */
145 	uds_io_reset(uds);
146 
147 	/* Free the receive buffer memory. */
148 	if (munmap(uds->uds_buf, UDS_BUF) != 0)
149 		panic("UDS: munmap failed: %d", errno);
150 }
151 
152 /*
153  * The socket is being closed or shut down for reading.  If there are still any
154  * in-flight file descriptors, theey will never be received anymore, so close
155  * them now.
156  */
157 void
158 uds_io_reset(struct udssock * uds)
159 {
160 	struct uds_fd *ufd;
161 
162 	/*
163 	 * The UDS service may have the last and only reference to any of these
164 	 * file descriptors here.  For that reason, we currently disallow
165 	 * transfer of UDS file descriptors, because the close(2) here could
166 	 * block on a socket close operation back to us, leading to a deadlock.
167 	 * Also, we use a non-blocking variant of close(2), to prevent that we
168 	 * end up hanging on sockets with SO_LINGER turned on.
169 	 */
170 	SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) {
171 		dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
172 
173 		closenb(ufd->ufd_fd);
174 	}
175 
176 	SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds);
177 
178 	/*
179 	 * If this reset happens as part of a shutdown, it might be done
180 	 * again on close, so ensure that it will find a clean state.  The
181 	 * receive buffer should never be looked at again either way, but reset
182 	 * it too just to be sure.
183 	 */
184 	uds->uds_tail = 0;
185 	uds->uds_len = 0;
186 	uds->uds_last = 0;
187 
188 	SIMPLEQ_INIT(&uds->uds_fds);
189 }
190 
191 /*
192  * Return the maximum usable part of the receive buffer, in bytes.  The return
193  * value is used for the SO_SNDBUF and SO_RCVBUF socket options.
194  */
195 size_t
196 uds_io_buflen(void)
197 {
198 
199 	/*
200 	 * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we
201 	 * could use the full receive buffer for data.  This would require that
202 	 * we store up to one header in the socket object rather than in the
203 	 * receive buffer.
204 	 */
205 	return UDS_BUF - UDS_HDRLEN;
206 }
207 
208 /*
209  * Fetch 'len' bytes starting from absolute position 'pos' into the receive
210  * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'.
211  * Return the absolute position of the first byte after the fetched data in the
212  * receive buffer.
213  */
214 static size_t
215 uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len)
216 {
217 	size_t left;
218 
219 	assert(off < UDS_BUF);
220 
221 	left = UDS_BUF - off;
222 	if (len >= left) {
223 		memcpy(ptr, &uds->uds_buf[off], left);
224 
225 		if ((len -= left) > 0)
226 			memcpy((char *)ptr + left, &uds->uds_buf[0], len);
227 
228 		return len;
229 	} else {
230 		memcpy(ptr, &uds->uds_buf[off], len);
231 
232 		return off + len;
233 	}
234 }
235 
236 /*
237  * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive
238  * buffer of socket 'uds', starting at absolute position 'pos' into the receive
239  * buffer.  Return the absolute position of the first byte after the stored
240  * data in the receive buffer.
241  */
242 static size_t
243 uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len)
244 {
245 	size_t left;
246 
247 	assert(off < UDS_BUF);
248 
249 	left = UDS_BUF - off;
250 	if (len >= left) {
251 		memcpy(&uds->uds_buf[off], ptr, left);
252 
253 		if ((len -= left) > 0)
254 			memcpy(&uds->uds_buf[0], (const char *)ptr + left,
255 			    len);
256 
257 		return len;
258 	} else {
259 		memcpy(&uds->uds_buf[off], ptr, len);
260 
261 		return off + len;
262 	}
263 }
264 
265 /*
266  * Fetch a segment header previously stored in the receive buffer of socket
267  * 'uds' at absolute position 'off'.  Return the absolute position of the first
268  * byte after the header, as well as the entire segment length in 'seglen', the
269  * length of the data in the segment in 'datalen', and the segment flags in
270  * 'segflags'.
271  */
272 static size_t
273 uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen,
274 	size_t * datalen, unsigned int * segflags)
275 {
276 	unsigned char hdr[UDS_HDRLEN];
277 
278 	off = uds_fetch(uds, off, hdr, sizeof(hdr));
279 
280 	*seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1];
281 	*datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3];
282 	*segflags = hdr[4];
283 
284 	assert(*seglen >= UDS_HDRLEN);
285 	assert(*seglen <= uds->uds_len);
286 	assert(*datalen <= *seglen - UDS_HDRLEN);
287 	assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN);
288 	assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
289 
290 	return off;
291 }
292 
293 /*
294  * Store a segment header in the receive buffer of socket 'uds' at absolute
295  * position 'off', with the segment length 'seglen', the segment data length
296  * 'datalen', and the segment flags 'segflags'.  Return the absolute receive
297  * buffer position of the first data byte after the stored header.
298  */
299 static size_t
300 uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen,
301 	unsigned int segflags)
302 {
303 	unsigned char hdr[UDS_HDRLEN];
304 
305 	assert(seglen <= USHRT_MAX);
306 	assert(datalen <= seglen);
307 	assert(segflags <= UCHAR_MAX);
308 	assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
309 
310 	hdr[0] = (seglen >> 8) & 0xff;
311 	hdr[1] = seglen & 0xff;
312 	hdr[2] = (datalen >> 8) & 0xff;
313 	hdr[3] = datalen & 0xff;
314 	hdr[4] = segflags;
315 
316 	return uds_store(uds, off, hdr, sizeof(hdr));
317 }
318 
319 /*
320  * Perform initial checks on a send request, before it may potentially be
321  * suspended.  Return OK if this send request is valid, or a negative error
322  * code if it is not.
323  */
324 int
325 uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused,
326 	const struct sockaddr * addr, socklen_t addr_len __unused,
327 	endpoint_t user_endpt __unused, int flags)
328 {
329 	struct udssock *uds = (struct udssock *)sock;
330 	size_t pathlen;
331 
332 	/*
333 	 * Reject calls with unknown flags.  Besides the flags handled entirely
334 	 * by libsockevent (which are not part of 'flags' here), that is all of
335 	 * them.  TODO: ensure that we should really reject all other flags
336 	 * rather than ignore them.
337 	 */
338 	if (flags != 0)
339 		return EOPNOTSUPP;
340 
341 	/*
342 	 * Perform very basic address and message size checks on the send call.
343 	 * For non-stream sockets, we must reject packets that may never fit in
344 	 * the receive buffer, or otherwise (at least for SOCK_SEQPACKET) the
345 	 * send call may end up being suspended indefinitely.  Therefore, we
346 	 * assume the worst-case scenario, which is that a full set of
347 	 * credentials must be associated with the packet.  As a result, we may
348 	 * reject some large packets that could actually just fit.  Checking
349 	 * the peer's LOCAL_CREDS setting here is not safe: even if we know the
350 	 * peer already at all (for SOCK_DGRAM we do not), the send may still
351 	 * block and the option toggled before it unblocks.
352 	 */
353 	switch (uds_get_type(uds)) {
354 	case SOCK_STREAM:
355 		/* Nothing to check for this case. */
356 		break;
357 
358 	case SOCK_SEQPACKET:
359 		if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN)
360 			return EMSGSIZE;
361 
362 		break;
363 
364 	case SOCK_DGRAM:
365 		if (!uds_has_link(uds) && addr == NULL)
366 			return EDESTADDRREQ;
367 
368 		/*
369 		 * The path is stored without null terminator, but with leading
370 		 * byte containing the path length--if there is a path at all.
371 		 */
372 		pathlen = (size_t)uds->uds_pathlen;
373 		if (pathlen > 0)
374 			pathlen++;
375 
376 		if (len > UDS_BUF - UDS_HDRLEN - pathlen - 1 - UDS_MAXCREDLEN)
377 			return EMSGSIZE;
378 
379 		break;
380 
381 	default:
382 		assert(0);
383 	}
384 
385 	return OK;
386 }
387 
388 /*
389  * Determine whether the (real or pretend) send request should be processed
390  * now, suspended until later, or rejected based on the current socket state.
391  * Return OK if the send request should be processed now.  Return SUSPEND if
392  * the send request should be retried later.  Return an appropriate negative
393  * error code if the send request should fail.
394  */
395 static int
396 uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min,
397 	int partial)
398 {
399 	struct udssock *conn;
400 	size_t avail, hdrlen, credlen;
401 
402 	assert(!uds_is_shutdown(uds, SFL_SHUT_WR));
403 
404 	if (uds_get_type(uds) != SOCK_DGRAM) {
405 		if (uds_is_connecting(uds))
406 			return SUSPEND;
407 		if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
408 			return ENOTCONN;
409 		if (!uds_has_conn(uds))
410 			return EPIPE;
411 
412 		conn = uds->uds_conn;
413 
414 		if (uds_is_shutdown(conn, SFL_SHUT_RD))
415 			return EPIPE;
416 
417 		/*
418 		 * For connection-type sockets, we now have to check if there
419 		 * is enough room in the receive buffer.  For SOCK_STREAM
420 		 * sockets, we must check if at least 'min' bytes can be moved
421 		 * into the receive buffer, at least if that is a reasonable
422 		 * value for ever making any forward progress at all.  For
423 		 * SOCK_SEQPACKET sockets, we must check if the entire packet
424 		 * of size 'len' can be stored in the receive buffer.  In both
425 		 * cases, we must take into account any metadata to store along
426 		 * with the data.
427 		 *
428 		 * Unlike in uds_pre_send(), we can now check safely whether
429 		 * the peer is expecting credentials, but we still don't know
430 		 * the actual size of the credentials, so again we take the
431 		 * maximum possible size.  The same applies to file descriptors
432 		 * transferred via control data: all we have the control length
433 		 * right now, which if non-zero we assume to mean there might
434 		 * be file descriptors.
435 		 *
436 		 * In both cases, the reason of overestimating is that actually
437 		 * getting accurate sizes, by obtaining credentials or copying
438 		 * in control data, is very costly.  We want to do that only
439 		 * when we are sure we will not suspend the send call after
440 		 * all.  It is no problem to overestimate how much space will
441 		 * be needed here, but not to underestimate: that could cause
442 		 * applications that use select(2) and non-blocking sockets to
443 		 * end up in a busy-wait loop.
444 		 */
445 		if (!partial && (conn->uds_flags & UDSF_PASSCRED))
446 			credlen = 1 + UDS_MAXCREDLEN;
447 		else
448 			credlen = 0;
449 
450 		avail = UDS_BUF - conn->uds_len;
451 
452 		if (uds_get_type(uds) == SOCK_STREAM) {
453 			/*
454 			 * Limit the low threshold to the maximum that can ever
455 			 * be sent at once.
456 			 */
457 			if (min > UDS_BUF - UDS_HDRLEN - credlen)
458 				min = UDS_BUF - UDS_HDRLEN - credlen;
459 
460 			/*
461 			 * Suspend the call only if not even the low threshold
462 			 * is met.  Otherwise we may make (partial) progress.
463 			 */
464 			if (len > min)
465 				len = min;
466 
467 			/*
468 			 * If the receive buffer already has at least one
469 			 * segment, and there are certainly no file descriptors
470 			 * to transfer now, and we do not have to store
471 			 * credentials either, then this segment can be merged
472 			 * with the previous one.  In that case, we need no
473 			 * space for a header.  That is certainly the case if
474 			 * we are resuming an already partially completed send.
475 			 */
476 			hdrlen = (avail == UDS_BUF || ctl_len != 0 ||
477 			    credlen > 0) ? UDS_HDRLEN : 0;
478 		} else
479 			hdrlen = UDS_HDRLEN;
480 
481 		if (avail < hdrlen + credlen + len)
482 			return SUSPEND;
483 	}
484 
485 	return OK;
486 }
487 
488 /*
489  * Get the destination peer for a send request.  The send test has already been
490  * performed first.  On success, return OK, with a pointer to the peer socket
491  * stored in 'peerp'.  On failure, return an appropriate error code.
492  */
493 static int
494 uds_send_peer(struct udssock * uds, const struct sockaddr * addr,
495 	socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp)
496 {
497 	struct udssock *peer;
498 	int r;
499 
500 	if (uds_get_type(uds) == SOCK_DGRAM) {
501 		if (!uds_has_link(uds)) {
502 			/* This was already checked in uds_pre_check(). */
503 			assert(addr != NULL);
504 
505 			/*
506 			 * Find the socket identified by the given address.
507 			 * If it exists at all, see if it is a proper match.
508 			 */
509 			if ((r = uds_lookup(uds, addr, addr_len, user_endpt,
510 			    &peer)) != OK)
511 				return r;
512 
513 			/*
514 			 * If the peer socket is connected to a target, it
515 			 * must be this socket.  Unfortunately, POSIX does not
516 			 * specify an error code for this.  We borrow Linux's.
517 			 */
518 			if (uds_has_link(peer) && peer->uds_link != uds)
519 				return EPERM;
520 		} else
521 			peer = uds->uds_link;
522 
523 		/*
524 		 * If the receiving end will never receive this packet, we
525 		 * might as well not send it, so drop it immeiately.  Indicate
526 		 * as such to the caller, using NetBSD's chosen error code.
527 		 */
528 		if (uds_is_shutdown(peer, SFL_SHUT_RD))
529 			return ENOBUFS;
530 	} else {
531 		assert(uds_has_conn(uds));
532 
533 		peer = uds->uds_conn;
534 	}
535 
536 	*peerp = peer;
537 	return OK;
538 }
539 
540 /*
541  * Generate a new segment for the current send request, or arrange things such
542  * that new data can be merged with a previous segment.  As part of this,
543  * decide whether we can merge data at all.  The segment will be merged if, and
544  * only if, all of the following requirements are met:
545  *
546  *   1) the socket is of type SOCK_STREAM;
547  *   2) there is a previous segment in the receive buffer;
548  *   3) there is no ancillary data for the current send request.
549  *
550  * Also copy in regular data (if any), retrieve the sender's credentials (if
551  * needed), and copy over the source path (if applicable).  However, do not yet
552  * commit the segment (or the new part to be merged), because the send request
553  * may still fail for other reasons.
554  *
555  * On success, return the length of the new segment (or, when merging, the
556  * length to be added to the last segment), as well as a flag indicating
557  * whether we are merging into the last segment in 'mergep', the length of the
558  * (new) data in the segment in 'datalenp', and the new segment's flags in
559  * 'segflagsp' (always zero when merging).  Note that a return value of zero
560  * implies that we are merging zero extra bytes into the last segment, which
561  * means that effectively nothing changes; in that case the send call will be
562  * cut short and return zero to the caller as well.  On failure, return a
563  * negative error code.
564  */
565 static int
566 uds_send_data(struct udssock * uds, struct udssock * peer,
567 	const struct sockdriver_data * data, size_t len, size_t off,
568 	endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep,
569 	size_t * __restrict datalenp, unsigned int * __restrict segflagsp)
570 {
571 	struct sockcred sockcred;
572 	gid_t groups[NGROUPS_MAX];
573 	iovec_t iov[2];
574 	unsigned int iovcnt, segflags;
575 	unsigned char lenbyte;
576 	size_t credlen, pathlen, datalen, seglen;
577 	size_t avail, pos, left;
578 	int r, merge;
579 
580 	/*
581 	 * At this point we should add the data to the peer's receive buffer.
582 	 * In the case of SOCK_STREAM sockets, we should add as much of the
583 	 * data as possible and suspend the call to send the rest later, if
584 	 * applicable.  In the case of SOCK_DGRAM sockets, we should drop the
585 	 * packet if it does not fit in the buffer.
586 	 *
587 	 * Due to the checks in uds_can_send(), we know for sure that we no
588 	 * longer have to suspend without making any progress at this point.
589 	 */
590 	segflags = (nfds > 0) ? UDS_HAS_FDS : 0;
591 
592 	/*
593 	 * Obtain the credentials now.  Doing so allows us to determine how
594 	 * much space we actually need for them.
595 	 */
596 	if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) {
597 		memset(&sockcred, 0, sizeof(sockcred));
598 
599 		if ((r = getsockcred(user_endpt, &sockcred, groups,
600 		    __arraycount(groups))) != OK)
601 			return r;
602 
603 		credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups);
604 
605 		segflags |= UDS_HAS_CRED;
606 	} else
607 		credlen = 0;
608 
609 	/* For bound source datagram sockets, include the source path. */
610 	if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) {
611 		pathlen = (size_t)uds->uds_pathlen + 1;
612 
613 		segflags |= UDS_HAS_PATH;
614 	} else
615 		pathlen = 0;
616 
617 	avail = UDS_BUF - peer->uds_len;
618 
619 	if (uds_get_type(uds) == SOCK_STREAM) {
620 		/*
621 		 * Determine whether we can merge data into the previous
622 		 * segment.  This is a more refined version of the test in
623 		 * uds_can_send(), as we now know whether there are actually
624 		 * any FDs to transfer.
625 		 */
626 		merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0);
627 
628 		/* Determine how much we can send at once. */
629 		if (!merge) {
630 			assert(avail > UDS_HDRLEN + credlen);
631 			datalen = avail - UDS_HDRLEN - credlen;
632 		} else
633 			datalen = avail;
634 
635 		if (datalen > len)
636 			datalen = len;
637 
638 		/* If we cannot make progress, we should have suspended.. */
639 		assert(datalen != 0 || len == 0);
640 	} else {
641 		merge = FALSE;
642 
643 		datalen = len;
644 	}
645 	assert(datalen <= len);
646 	assert(datalen <= UDS_BUF);
647 
648 	/*
649 	 * Compute the total amount of space we need for the segment in the
650 	 * receive buffer.  Given that we have done will-it-fit tests in
651 	 * uds_can_send() for SOCK_STREAM and SOCK_SEQPACKET, there is only one
652 	 * case left where the result may not fit, and that is for SOCK_DGRAM
653 	 * packets.  In that case, we drop the packet.  POSIX says we should
654 	 * throw an error in that case, and that is also what NetBSD does.
655 	 */
656 	if (!merge)
657 		seglen = UDS_HDRLEN + credlen + pathlen + datalen;
658 	else
659 		seglen = datalen;
660 
661 	if (seglen > avail) {
662 		assert(uds_get_type(uds) == SOCK_DGRAM);
663 
664 		/* Drop the packet, borrowing NetBSD's chosen error code. */
665 		return ENOBUFS;
666 	}
667 
668 	/*
669 	 * Generate the full segment, but do not yet update the buffer head.
670 	 * We may still run into an error (copying in file descriptors) or even
671 	 * decide that nothing gets sent after all (if there are no data or
672 	 * file descriptors).  If we are merging the new data into the previous
673 	 * segment, do not generate a header.
674 	 */
675 	pos = uds_get_head(peer);
676 
677 	/* Generate the header, if needed. */
678 	if (!merge)
679 		pos = uds_store_hdr(peer, pos, seglen, datalen, segflags);
680 	else
681 		assert(segflags == 0);
682 
683 	/* Copy in and store the sender's credentials, if desired. */
684 	if (credlen > 0) {
685 		assert(credlen >= 1 + sizeof(sockcred));
686 		assert(credlen <= UCHAR_MAX);
687 
688 		lenbyte = credlen - 1;
689 		pos = uds_store(peer, pos, &lenbyte, 1);
690 
691 		if (sockcred.sc_ngroups > 0) {
692 			pos = uds_store(peer, pos, &sockcred,
693 			    offsetof(struct sockcred, sc_groups));
694 			pos = uds_store(peer, pos, groups,
695 			    sockcred.sc_ngroups * sizeof(gid_t));
696 		} else
697 			pos = uds_store(peer, pos, &sockcred,
698 			    sizeof(sockcred));
699 	}
700 
701 	/* Store the sender's address if any.  Datagram sockets only. */
702 	if (pathlen > 0) {
703 		assert(pathlen > 1);
704 		assert(pathlen <= UCHAR_MAX);
705 
706 		lenbyte = uds->uds_pathlen;
707 		pos = uds_store(peer, pos, &lenbyte, 1);
708 		pos = uds_store(peer, pos, uds->uds_path, pathlen - 1);
709 	}
710 
711 	/* Lastly, copy in the actual data (if any) from the caller. */
712 	if (datalen > 0) {
713 		iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos];
714 		left = UDS_BUF - pos;
715 
716 		if (left < datalen) {
717 			assert(left > 0);
718 			iov[0].iov_size = left;
719 			iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0];
720 			iov[1].iov_size = datalen - left;
721 			iovcnt = 2;
722 		} else {
723 			iov[0].iov_size = datalen;
724 			iovcnt = 1;
725 		}
726 
727 		if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK)
728 			return r;
729 	}
730 
731 	*mergep = merge;
732 	*datalenp = datalen;
733 	*segflagsp = segflags;
734 	return seglen;
735 }
736 
737 /*
738  * Copy in control data for the current send request, and extract any file
739  * descriptors to be transferred.  Do not yet duplicate the file descriptors,
740  * but rather store a list in a temporary buffer: the send request may still
741  * fail in which case we want to avoid having to undo the duplication.
742  *
743  * On success, return the number of (zero or more) file descriptors extracted
744  * from the request and stored in the temporary buffer.  On failure, return a
745  * negative error code.
746  */
747 static int
748 uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len,
749 	endpoint_t user_endpt)
750 {
751 	struct msghdr msghdr;
752 	struct cmsghdr *cmsg;
753 	socklen_t left;
754 	unsigned int i, n, nfds;
755 	int r;
756 
757 	/*
758 	 * Copy in the control data.  We can spend a lot of effort copying in
759 	 * the data in small chunks, and change the receiving side to do the
760 	 * same, but it is really not worth it: applications never send a whole
761 	 * lot of file descriptors at once, and the buffer size is currently
762 	 * such that the UDS service itself will exhaust its OPEN_MAX limit
763 	 * anyway if they do.
764 	 */
765 	if (ctl_len > sizeof(uds_ctlbuf))
766 		return ENOBUFS;
767 
768 	if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK)
769 		return r;
770 
771 	if (ctl_len < sizeof(uds_ctlbuf))
772 		memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len);
773 
774 	/*
775 	 * Look for any file descriptors, and store their remote file
776 	 * descriptor numbers into a temporary array.
777 	 */
778 	memset(&msghdr, 0, sizeof(msghdr));
779 	msghdr.msg_control = uds_ctlbuf;
780 	msghdr.msg_controllen = ctl_len;
781 
782 	nfds = 0;
783 	r = OK;
784 
785 	/*
786 	 * The sender may provide file descriptors in multiple chunks.
787 	 * Currently we do not preserve these chunk boundaries, instead
788 	 * generating one single chunk with all file descriptors for the
789 	 * segment upon receipt.  If needed, we can fairly easily adapt this
790 	 * later.
791 	 */
792 	for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
793 	    cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
794 		/*
795 		 * Check for bogus lengths.  There is no excuse for this;
796 		 * either the caller does not know what they are doing or we
797 		 * are looking at a hacking attempt.
798 		 */
799 		assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len);
800 		left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf);
801 		assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */
802 
803 		if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) {
804 			printf("UDS: malformed control data from %u\n",
805 			    user_endpt);
806 			r = EINVAL;
807 			break;
808 		}
809 
810 		if (cmsg->cmsg_level != SOL_SOCKET ||
811 		    cmsg->cmsg_type != SCM_RIGHTS)
812 			continue;
813 
814 		n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
815 
816 		for (i = 0; i < n; i++) {
817 			/*
818 			 * Copy the file descriptor to the temporary buffer,
819 			 * whose size is based on the control data buffer, so
820 			 * it is always large enough to contain all FDs.
821 			 */
822 			assert(nfds < __arraycount(uds_ctlfds));
823 
824 			memcpy(&uds_ctlfds[nfds],
825 			    &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
826 
827 			nfds++;
828 		}
829 	}
830 
831 	return nfds;
832 }
833 
834 /*
835  * Actually duplicate any file descriptors that we extracted from the sender's
836  * control data and stored in our temporary buffer.  On success, return OK,
837  * with all file descriptors stored in file descriptor objects that are
838  * appended to the socket's list of in-flight FD objects.  Thus, on success,
839  * the send request may no longer fail.  On failure, return a negative error
840  * code, with any partial duplication undone.
841  */
842 static int
843 uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt)
844 {
845 	SIMPLEQ_HEAD(, uds_fd) fds;
846 	struct uds_fd *ufd;
847 	unsigned int i;
848 	int r;
849 
850 	SIMPLEQ_INIT(&fds);
851 
852 	for (i = 0; i < nfds; i++) {
853 		if (SIMPLEQ_EMPTY(&uds_freefds)) {
854 			/* UDS itself may already have OPEN_MAX FDs. */
855 			r = ENFILE;
856 			break;
857 		}
858 
859 		/*
860 		 * The caller may have given an invalid FD, or UDS itself may
861 		 * unexpectedly have run out of available file descriptors etc.
862 		 */
863 		if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0)
864 			break;
865 
866 		ufd = SIMPLEQ_FIRST(&uds_freefds);
867 		SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next);
868 
869 		ufd->ufd_fd = r;
870 		ufd->ufd_count = 0;
871 
872 		SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next);
873 
874 		dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r));
875 	}
876 
877 	/* Did we experience an error while copying in the file descriptors? */
878 	if (r < 0) {
879 		/* Revert the successful copyfd() calls made so far. */
880 		SIMPLEQ_FOREACH(ufd, &fds, ufd_next) {
881 			dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
882 
883 			closenb(ufd->ufd_fd);
884 		}
885 
886 		SIMPLEQ_CONCAT(&uds_freefds, &fds);
887 
888 		return r;
889 	}
890 
891 	/*
892 	 * Success.  If there were any file descriptors at all, add them to the
893 	 * peer's list of in-flight file descriptors.  Assign the number of
894 	 * file descriptors copied in to the first file descriptor object, so
895 	 * that we know how many to copy out (or discard) for this segment.
896 	 * Also set the UDS_HAS_FDS flag on the segment.
897 	 */
898 	ufd = SIMPLEQ_FIRST(&fds);
899 	ufd->ufd_count = nfds;
900 
901 	SIMPLEQ_CONCAT(&peer->uds_fds, &fds);
902 
903 	return OK;
904 }
905 
906 /*
907  * The current send request is successful or at least has made progress.
908  * Commit the new segment or, if we decided to merge the new data into the last
909  * segment, update the header of the last segment.  Also wake up the receiving
910  * side, because there will now be new data to receive.
911  */
912 static void
913 uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen,
914 	int merge, size_t seglen, unsigned int segflags)
915 {
916 	size_t pos, prevseglen, prevdatalen;
917 
918 	/*
919 	 * For non-datagram sockets, credentials are sent only once after
920 	 * setting the LOCAL_CREDS option.  After that, the option is unset.
921 	 */
922 	if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM)
923 		peer->uds_flags &= ~UDSF_PASSCRED;
924 
925 	if (merge) {
926 		assert(segflags == 0);
927 
928 		pos = uds_get_last(peer);
929 
930 		(void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen,
931 		    &segflags);
932 
933 		peer->uds_len += seglen;
934 		assert(peer->uds_len <= UDS_BUF);
935 
936 		seglen += prevseglen;
937 		datalen += prevdatalen;
938 		assert(seglen <= UDS_BUF);
939 
940 		uds_store_hdr(peer, pos, seglen, datalen, segflags);
941 	} else {
942 		peer->uds_last = peer->uds_len;
943 
944 		peer->uds_len += seglen;
945 		assert(peer->uds_len <= UDS_BUF);
946 	}
947 
948 	/* Now that there are new data, wake up the receiver side. */
949 	sockevent_raise(&peer->uds_sock, SEV_RECV);
950 }
951 
952 /*
953  * Process a send request.  Return OK if the send request has successfully
954  * completed, SUSPEND if it should be tried again later, or a negative error
955  * code on failure.  In all cases, the values of 'off' and 'ctl_off' must be
956  * updated if any progress has been made; if either is non-zero, libsockevent
957  * will return the partial progress rather than an error code.
958  */
959 int
960 uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len,
961 	size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
962 	socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len,
963 	endpoint_t user_endpt, int flags __unused, size_t min)
964 {
965 	struct udssock *uds = (struct udssock *)sock;
966 	struct udssock *peer;
967 	size_t seglen, datalen = 0 /*gcc*/;
968 	unsigned int nfds, segflags = 0 /*gcc*/;
969 	int r, partial, merge = 0 /*gcc*/;
970 
971 	dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n",
972 	    uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
973 	    (ctl_off != NULL) ? *ctl_off : 0, flags));
974 
975 	partial = (off != NULL && *off > 0);
976 
977 	/*
978 	 * First see whether we can process this send call at all right now.
979 	 * Most importantly, for connected sockets, if the peer's receive
980 	 * buffer is full, we may have to suspend the call until some space has
981 	 * been freed up.
982 	 */
983 	if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK)
984 		return r;
985 
986 	/*
987 	 * Then get the peer socket.  For connected sockets, this is trivial.
988 	 * For unconnected sockets, it may involve a lookup of the given
989 	 * address.
990 	 */
991 	if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK)
992 		return r;
993 
994 	/*
995 	 * We now know for sure that we will not suspend this call without
996 	 * making any progress.  However, the call may still fail.  Copy in
997 	 * control data first now, so that we know whether there are any file
998 	 * descriptors to transfer.  This aspect may determine whether or not
999 	 * we can merge data with a previous segment.  Do not actually copy in
1000 	 * the actual file descriptors yet, because that is much harder to undo
1001 	 * in case of a failure later on.
1002 	 */
1003 	if (ctl_len > 0) {
1004 		/* We process control data once, in full. */
1005 		assert(*ctl_off == 0);
1006 
1007 		if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0)
1008 			return r;
1009 		nfds = (unsigned int)r;
1010 	} else
1011 		nfds = 0;
1012 
1013 	/*
1014 	 * Now generate a new segment, or (if possible) merge new data into the
1015 	 * last segment.  Since the call may still fail, prepare the segment
1016 	 * but do not update the buffer head yet.  Note that the segment
1017 	 * contains not just regular data (in fact it may contain no data at
1018 	 * all) but (also) certain ancillary data.
1019 	 */
1020 	if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds,
1021 	    &merge, &datalen, &segflags)) <= 0)
1022 		return r;
1023 	seglen = (size_t)r;
1024 
1025 	/*
1026 	 * If we extracted any file descriptors from the control data earlier,
1027 	 * copy them over to ourselves now.  The resulting in-flight file
1028 	 * descriptors are stored in a separate data structure.  This is the
1029 	 * last point where the send call may actually fail.
1030 	 */
1031 	if (nfds > 0) {
1032 		if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK)
1033 			return r;
1034 	}
1035 
1036 	/*
1037 	 * The transmission is now known to be (partially) successful.  Commit
1038 	 * the new work by moving the receive buffer head.
1039 	 */
1040 	uds_send_advance(uds, peer, datalen, merge, seglen, segflags);
1041 
1042 	/*
1043 	 * Register the result.  For stream-type sockets, the expected behavior
1044 	 * is that all data be sent, and so we may still have to suspend the
1045 	 * call after partial progress.  Otherwise, we are now done.  Either
1046 	 * way, we are done with the control data, so mark it as consumed.
1047 	 */
1048 	*off += datalen;
1049 	*ctl_off += ctl_len;
1050 	if (uds_get_type(uds) == SOCK_STREAM && datalen < len)
1051 		return SUSPEND;
1052 	else
1053 		return OK;
1054 }
1055 
1056 /*
1057  * Test whether a send request would block.  The given 'min' parameter contains
1058  * the minimum number of bytes that should be possible to send without blocking
1059  * (the low send watermark).  Return SUSPEND if the send request would block,
1060  * or any other error code if it would not.
1061  */
1062 int
1063 uds_test_send(struct sock * sock, size_t min)
1064 {
1065 	struct udssock *uds = (struct udssock *)sock;
1066 
1067 	return uds_send_test(uds, min, 0, min, FALSE /*partial*/);
1068 }
1069 
1070 /*
1071  * Perform initial checks on a receive request, before it may potentially be
1072  * suspended.  Return OK if this receive request is valid, or a negative error
1073  * code if it is not.
1074  */
1075 int
1076 uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
1077 	int flags)
1078 {
1079 
1080 	/*
1081 	 * Reject calls with unknown flags.  TODO: ensure that we should really
1082 	 * reject all other flags rather than ignore them.
1083 	 */
1084 	if ((flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC)) != 0)
1085 		return EOPNOTSUPP;
1086 
1087 	return OK;
1088 }
1089 
1090 /*
1091  * Determine whether the (real or pretend) receive request should be processed
1092  * now, suspended until later, or rejected based on the current socket state.
1093  * Return OK if the receive request should be processed now, along with a first
1094  * indication whether the call may still be suspended later in 'may_block'.
1095  * Return SUSPEND if the receive request should be retried later.  Return an
1096  * appropriate negative error code if the receive request should fail.
1097  */
1098 static int
1099 uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial,
1100 	int * may_block)
1101 {
1102 	size_t seglen, datalen;
1103 	unsigned int segflags;
1104 	int r;
1105 
1106 	/*
1107 	 * If there are any pending data, those should always be received
1108 	 * first.  However, if there is nothing to receive, then whether we
1109 	 * should suspend the receive call or fail immediately depends on other
1110 	 * conditions.  We first look at these other conditions.
1111 	 */
1112 	r = OK;
1113 
1114 	if (uds_get_type(uds) != SOCK_DGRAM) {
1115 		if (uds_is_connecting(uds))
1116 			r = SUSPEND;
1117 		else if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
1118 			r = ENOTCONN;
1119 		else if (!uds_has_conn(uds) ||
1120 		    uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR))
1121 			r = SOCKEVENT_EOF;
1122 	}
1123 
1124 	if (uds->uds_len == 0) {
1125 		/*
1126 		 * For stream-type sockets, we use the policy: if no regular
1127 		 * data is requested, then end the call without receiving
1128 		 * anything.  For packet-type sockets, the request should block
1129 		 * until there is a packet to discard, though.
1130 		 */
1131 		if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0))
1132 			return r;
1133 
1134 		return SUSPEND;
1135 	}
1136 
1137 	/*
1138 	 * For stream-type sockets, we should still suspend the call if fewer
1139 	 * than 'min' bytes are available right now, and there is a possibility
1140 	 * that more data may arrive later.  More may arrive later iff 'r' is
1141 	 * OK (i.e., no EOF or error will follow) and, in case we already
1142 	 * received some partial results, there is not already a next segment
1143 	 * with ancillary data (i.e, nonzero segment flags), or in any case
1144 	 * there isn't more than one segment in the buffer.  Limit 'min' to the
1145 	 * maximum that can ever be received, though.  Since that is difficult
1146 	 * in our case, we check whether the buffer is entirely full instead.
1147 	 */
1148 	if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 &&
1149 	    uds->uds_len < UDS_BUF) {
1150 		assert(uds->uds_len >= UDS_HDRLEN);
1151 
1152 		(void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen,
1153 		    &segflags);
1154 
1155 		if (datalen < min && seglen == uds->uds_len &&
1156 		    (!partial || segflags == 0))
1157 			return SUSPEND;
1158 	}
1159 
1160 	/*
1161 	 * Also start the decision process as to whether we should suspend the
1162 	 * current call if MSG_WAITALL is given.  Unfortunately there is no one
1163 	 * place where we can conveniently do all the required checks.
1164 	 */
1165 	if (may_block != NULL)
1166 		*may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM);
1167 	return OK;
1168 }
1169 
1170 /*
1171  * Receive regular data, and possibly the source path, from the tail segment in
1172  * the receive buffer.  On success, return the positive non-zero length of the
1173  * tail segment, with 'addr' and 'addr_len' modified to store the source
1174  * address if applicable, the result flags in 'rflags' updated as appropriate,
1175  * the tail segment's data length stored in 'datalen', the number of received
1176  * regular data bytes stored in 'reslen', the segment flags stored in
1177  * 'segflags', and the absolute receive buffer position of the credentials in
1178  * the segment stored in 'credpos' if applicable.  Since the receive call may
1179  * still fail, this function must not yet update the tail or any other aspect
1180  * of the receive buffer.  Return zero if the current receive call was already
1181  * partially successful (due to MSG_WAITALL) and can no longer make progress,
1182  * and thus should be ended.  Return a negative error code on failure.
1183  */
1184 static int
1185 uds_recv_data(struct udssock * uds, const struct sockdriver_data * data,
1186 	size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len,
1187 	int * __restrict rflags, size_t * __restrict datalen,
1188 	size_t * __restrict reslen, unsigned int * __restrict segflags,
1189 	size_t * __restrict credpos)
1190 {
1191 	iovec_t iov[2];
1192 	unsigned char lenbyte;
1193 	unsigned int iovcnt;
1194 	size_t pos, seglen, left;
1195 	int r;
1196 
1197 	pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags);
1198 
1199 	/*
1200 	 * If a partially completed receive now runs into a segment that cannot
1201 	 * be logically merged with the previous one (because it has at least
1202 	 * one segment flag set, meaning it has ancillary data), then we must
1203 	 * shortcut the receive now.
1204 	 */
1205 	if (off != 0 && *segflags != 0)
1206 		return OK;
1207 
1208 	/*
1209 	 * As stated, for stream-type sockets, we choose to ignore zero-size
1210 	 * receive calls.  This has the consequence that reading a zero-sized
1211 	 * segment (with ancillary data) requires a receive request for at
1212 	 * least one regular data byte.  Such a receive call would then return
1213 	 * zero.  The problem with handling zero-data receive requests is that
1214 	 * we need to know whether the current segment is terminated (i.e., no
1215 	 * more data can possibly be merged into it later), which is a test
1216 	 * that we rather not perform, not in the least because we do not know
1217 	 * whether there is an error pending on the socket.
1218 	 *
1219 	 * For datagrams, we currently allow a zero-size receive call to
1220 	 * discard the next datagram.
1221 	 *
1222 	 * TODO: compare this against policies on other platforms.
1223 	 */
1224 	if (len == 0 && uds_get_type(uds) == SOCK_STREAM)
1225 		return OK;
1226 
1227 	/*
1228 	 * We have to skip the credentials for now: these are copied out as
1229 	 * control data, and thus will (well, may) be looked at when dealing
1230 	 * with the control data.  For the same reason, we do not even look at
1231 	 * UDS_HAS_FDS here.
1232 	 */
1233 	if (*segflags & UDS_HAS_CRED) {
1234 		*credpos = pos;
1235 
1236 		pos = uds_fetch(uds, pos, &lenbyte, 1);
1237 		pos = uds_advance(pos, (size_t)lenbyte);
1238 	}
1239 
1240 	/*
1241 	 * Copy out the source address, but only if the (datagram) socket is
1242 	 * not connected.  TODO: even when it is connected, it may still
1243 	 * receive packets sent to it from other sockets *before* being
1244 	 * connected, and the receiver has no way of knowing that those packets
1245 	 * did not come from its new peer.  Ideally, the older packets should
1246 	 * be dropped..
1247 	 */
1248 	if (*segflags & UDS_HAS_PATH) {
1249 		pos = uds_fetch(uds, pos, &lenbyte, 1);
1250 
1251 		if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds))
1252 			uds_make_addr((const char *)&uds->uds_buf[pos],
1253 			    (size_t)lenbyte, addr, addr_len);
1254 
1255 		pos = uds_advance(pos, (size_t)lenbyte);
1256 	}
1257 
1258 	/*
1259 	 * We can receive no more data than those that are present in the
1260 	 * segment, obviously.  For stream-type sockets, any more data that
1261 	 * could have been received along with the current data would have been
1262 	 * merged in the current segment, so we need not search for any next
1263 	 * segments.
1264 	 *
1265 	 * For non-stream sockets, the caller may receive less than a whole
1266 	 * packet if it supplied a small buffer.  In that case, the rest of the
1267 	 * packet will be discarded (but not here yet!) and the caller gets
1268 	 * the MSG_TRUNC flag in its result, if it was using sendmsg(2) anyway.
1269 	 */
1270 	if (len > *datalen)
1271 		len = *datalen;
1272 	else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM)
1273 		*rflags |= MSG_TRUNC;
1274 
1275 	/* Copy out the data to the caller. */
1276 	if (len > 0) {
1277 		iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos];
1278 		left = UDS_BUF - pos;
1279 
1280 		if (left < len) {
1281 			iov[0].iov_size = left;
1282 			iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0];
1283 			iov[1].iov_size = len - left;
1284 			iovcnt = 2;
1285 		} else {
1286 			iov[0].iov_size = len;
1287 			iovcnt = 1;
1288 		}
1289 
1290 		if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK)
1291 			return r;
1292 	}
1293 
1294 	*reslen = len;
1295 	assert(seglen > 0 && seglen <= INT_MAX);
1296 	return (int)seglen;
1297 }
1298 
1299 /*
1300  * The current segment has associated file descriptors.  If possible, copy out
1301  * all file descriptors to the receiver, and generate and copy out a chunk of
1302  * control data that contains their file descriptor numbers.  If not all
1303  * file descriptors fit in the receiver's buffer, or if any error occurs, no
1304  * file descriptors are copied out.
1305  */
1306 static int
1307 uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl,
1308 	socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags)
1309 {
1310 	struct msghdr msghdr;
1311 	struct cmsghdr *cmsg;
1312 	struct uds_fd *ufd;
1313 	unsigned int i, nfds;
1314 	socklen_t chunklen, chunkspace;
1315 	int r, fd, what;
1316 
1317 	/* See how many file descriptors should be part of this chunk. */
1318 	assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
1319 	ufd = SIMPLEQ_FIRST(&uds->uds_fds);
1320 	nfds = ufd->ufd_count;
1321 	assert(nfds > 0);
1322 
1323 	/*
1324 	 * We produce and copy out potentially unaligned chunks, using
1325 	 * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE.
1326 	 * This may leave "gap" bytes unchanged in userland, but that should
1327 	 * not be a problem.  By producing unaligned chunks, we eliminate a
1328 	 * potential boundary case where the unaligned chunk passed in (by the
1329 	 * sender) no longer fits in the same buffer after being aligned here.
1330 	 */
1331 	chunklen = CMSG_LEN(sizeof(int) * nfds);
1332 	chunkspace = CMSG_SPACE(sizeof(int) * nfds);
1333 	assert(chunklen <= sizeof(uds_ctlbuf));
1334 	if (chunklen > ctl_len)
1335 		return 0; /* chunk would not fit, so produce nothing instead */
1336 	if (chunkspace > ctl_len)
1337 		chunkspace = ctl_len;
1338 
1339 	memset(&msghdr, 0, sizeof(msghdr));
1340 	msghdr.msg_control = uds_ctlbuf;
1341 	msghdr.msg_controllen = sizeof(uds_ctlbuf);
1342 
1343 	memset(uds_ctlbuf, 0, chunklen);
1344 	cmsg = CMSG_FIRSTHDR(&msghdr);
1345 	cmsg->cmsg_len = chunklen;
1346 	cmsg->cmsg_level = SOL_SOCKET;
1347 	cmsg->cmsg_type = SCM_RIGHTS;
1348 
1349 	/*
1350 	 * Copy the group's local file descriptors to the target endpoint, and
1351 	 * store the resulting remote file descriptors in the chunk buffer.
1352 	 */
1353 	r = OK;
1354 
1355 	for (i = 0; i < nfds; i++) {
1356 		assert(ufd != SIMPLEQ_END(&uds->uds_fds));
1357 		assert(i == 0 || ufd->ufd_count == 0);
1358 
1359 		what = COPYFD_TO;
1360 		if (flags & MSG_CMSG_CLOEXEC)
1361 			what |= COPYFD_CLOEXEC;
1362 
1363 		/* Failure may happen legitimately here (e.g., EMFILE). */
1364 		if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0)
1365 			break; /* we keep our progress so far in 'i' */
1366 
1367 		fd = r;
1368 
1369 		dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd));
1370 
1371 		memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int));
1372 
1373 		ufd = SIMPLEQ_NEXT(ufd, ufd_next);
1374 	}
1375 
1376 	/* If everything went well so far, copy out the produced chunk. */
1377 	if (r >= 0)
1378 		r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen);
1379 
1380 	/*
1381 	 * Handle errors.  At this point, the 'i' variable contains the number
1382 	 * of file descriptors that have already been successfully copied out.
1383 	 */
1384 	if (r < 0) {
1385 		/* Revert the successful copyfd() calls made so far. */
1386 		while (i-- > 0) {
1387 			memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
1388 
1389 			(void)copyfd(user_endpt, fd, COPYFD_CLOSE);
1390 		}
1391 
1392 		return r;
1393 	}
1394 
1395 	/*
1396 	 * Success.  Return the aligned size of the produced chunk, if the
1397 	 * given length permits it.  From here on, the receive call may no
1398 	 * longer fail, as that would result in lost file descriptors.
1399 	 */
1400 	return chunkspace;
1401 }
1402 
1403 /*
1404  * Generate and copy out a chunk of control data with the sender's credentials.
1405  * Return the aligned chunk size on success, or a negative error code on
1406  * failure.
1407  */
1408 static int
1409 uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl,
1410 	socklen_t ctl_len, socklen_t ctl_off, size_t credpos)
1411 {
1412 	struct msghdr msghdr;
1413 	struct cmsghdr *cmsg;
1414 	socklen_t chunklen, chunkspace;
1415 	unsigned char lenbyte;
1416 	size_t credlen;
1417 	int r;
1418 
1419 	/*
1420 	 * Since the sender side already did the hard work of producing the
1421 	 * (variable-size) sockcred structure as it should be received, there
1422 	 * is relatively little work to be done here.
1423 	 */
1424 	credpos = uds_fetch(uds, credpos, &lenbyte, 1);
1425 	credlen = (size_t)lenbyte;
1426 
1427 	chunklen = CMSG_LEN(credlen);
1428 	chunkspace = CMSG_SPACE(credlen);
1429 	assert(chunklen <= sizeof(uds_ctlbuf));
1430 	if (chunklen > ctl_len)
1431 		return 0; /* chunk would not fit, so produce nothing instead */
1432 	if (chunkspace > ctl_len)
1433 		chunkspace = ctl_len;
1434 
1435 	memset(&msghdr, 0, sizeof(msghdr));
1436 	msghdr.msg_control = uds_ctlbuf;
1437 	msghdr.msg_controllen = sizeof(uds_ctlbuf);
1438 
1439 	memset(uds_ctlbuf, 0, chunklen);
1440 	cmsg = CMSG_FIRSTHDR(&msghdr);
1441 	cmsg->cmsg_len = chunklen;
1442 	cmsg->cmsg_level = SOL_SOCKET;
1443 	cmsg->cmsg_type = SCM_CREDS;
1444 
1445 	uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen);
1446 
1447 	if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK)
1448 		return r;
1449 
1450 	return chunkspace;
1451 }
1452 
1453 /*
1454  * Copy out control data for the ancillary data associated with the current
1455  * segment, if any.  Return OK on success, at which point the current receive
1456  * call may no longer fail.  'rflags' may be updated with additional result
1457  * flags.  Return a negative error code on failure.
1458  */
1459 static int
1460 uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl,
1461 	socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt,
1462 	int flags, unsigned int segflags, size_t credpos, int * rflags)
1463 {
1464 	int r;
1465 
1466 	/*
1467 	 * We first copy out all file descriptors, if any.  We put them in one
1468 	 * SCM_RIGHTS chunk, even if the sender put them in separate SCM_RIGHTS
1469 	 * chunks.  We believe that this should not cause application-level
1470 	 * issues, but if it does, we can change that later with some effort.
1471 	 * We then copy out credentials, if any.
1472 	 *
1473 	 * We copy out each control chunk independently of the others, and also
1474 	 * perform error recovery on a per-chunk basis.  This implies the
1475 	 * following.  If producing or copying out the first chunk fails, the
1476 	 * entire recvmsg(2) call will fail with an appropriate error.  If
1477 	 * producing or copying out any subsequent chunk fails, the recvmsg(2)
1478 	 * call will still return the previously generated chunks (a "short
1479 	 * control read" if you will) as well as the MSG_CTRUNC flag.  This
1480 	 * approach is simple and clean, and it guarantees that we can always
1481 	 * copy out at least as many file descriptors as we copied in for this
1482 	 * segment, even if credentials are present as well.  However, the
1483 	 * approach does cause slightly more overhead when there are multiple
1484 	 * chunks per call, as those are copied out separately.
1485 	 *
1486 	 * Since the generated SCM_RIGHTS chunk is never larger than the
1487 	 * originally received SCM_RIGHTS chunk, the temporary "uds_ctlbuf"
1488 	 * buffer is always large enough to contain the chunk in its entirety.
1489 	 * SCM_CREDS chunks should always fit easily as well.
1490 	 *
1491 	 * The MSG_CTRUNC flag will be returned iff not the entire user-given
1492 	 * control buffer was filled and not all control chunks were delivered.
1493 	 * Our current implementation does not deliver partial chunks.  NetBSD
1494 	 * does, except for SCM_RIGHTS chunks.
1495 	 *
1496 	 * TODO: get rid of the redundancy in processing return values.
1497 	 */
1498 	if (segflags & UDS_HAS_FDS) {
1499 		r = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt,
1500 		    flags);
1501 
1502 		/*
1503 		 * At this point, 'r' contains one of the following:
1504 		 *
1505 		 *   r > 0	a chunk of 'r' bytes was added successfully.
1506 		 *   r == 0	not enough space left; the chunk was not added.
1507 		 *   r < 0	an error occurred; the chunk was not added.
1508 		 */
1509 		if (r < 0 && *ctl_off == 0)
1510 			return r;
1511 
1512 		if (r > 0) {
1513 			ctl_len -= r;
1514 			*ctl_off += r;
1515 		} else
1516 			*rflags |= MSG_CTRUNC;
1517 	}
1518 
1519 	if (segflags & UDS_HAS_CRED) {
1520 		r = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos);
1521 
1522 		/* As above. */
1523 		if (r < 0 && *ctl_off == 0)
1524 			return r;
1525 
1526 		if (r > 0) {
1527 			ctl_len -= r;
1528 			*ctl_off += r;
1529 		} else
1530 			*rflags |= MSG_CTRUNC;
1531 	}
1532 
1533 	return OK;
1534 }
1535 
1536 /*
1537  * The current receive request is successful or, in the case of MSG_WAITALL,
1538  * has made progress.  Advance the receive buffer tail, either by discarding
1539  * the entire tail segment or by generating a new, smaller tail segment that
1540  * contains only the regular data left to be received from the original tail
1541  * segment.  Also wake up the sending side for connection-oriented sockets if
1542  * applicable, because there may now be room for more data to be sent.  Update
1543  * 'may_block' if we are now sure that the call may not block on MSG_WAITALL
1544  * after all.
1545  */
1546 static void
1547 uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen,
1548 	size_t reslen, unsigned int segflags, int * may_block)
1549 {
1550 	struct udssock *conn;
1551 	struct uds_fd *ufd;
1552 	size_t delta, nseglen, advance;
1553 	unsigned int nfds;
1554 
1555 	/* Note that 'reslen' may be legitimately zero. */
1556 	assert(reslen <= datalen);
1557 
1558 	if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen)
1559 		reslen = datalen;
1560 
1561 	delta = datalen - reslen;
1562 
1563 	if (delta == 0) {
1564 		/*
1565 		 * Fully consume the tail segment.  We advance the tail by the
1566 		 * full segment length, thus moving up to either the next
1567 		 * segment in the receive buffer, or an empty receive buffer.
1568 		 */
1569 		advance = seglen;
1570 
1571 		uds->uds_tail = uds_advance(uds->uds_tail, advance);
1572 	} else {
1573 		/*
1574 		 * Partially consume the tail segment.  We put a new segment
1575 		 * header right in front of the remaining data, which obviously
1576 		 * always fits.  Since any ancillary data was consumed along
1577 		 * with the first data byte of the segment, the new segment has
1578 		 * no ancillary data anymore (and thus a zero flags field).
1579 		 */
1580 		nseglen = UDS_HDRLEN + delta;
1581 		assert(nseglen < seglen);
1582 
1583 		advance = seglen - nseglen;
1584 
1585 		uds->uds_tail = uds_advance(uds->uds_tail, advance);
1586 
1587 		uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0);
1588 	}
1589 
1590 	/*
1591 	 * For datagram-oriented sockets, we always consume at least a header.
1592 	 * For stream-type sockets, we either consume a zero-data segment along
1593 	 * with its ancillary data, or we consume at least one byte from a
1594 	 * segment that does have regular data.  In all other cases, the
1595 	 * receive call has already been ended by now.  Thus, we always advance
1596 	 * the tail of the receive buffer here.
1597 	 */
1598 	assert(advance > 0);
1599 
1600 	/*
1601 	 * The receive buffer's used length (uds_len) and pointer to the
1602 	 * previous segment header (uds_last) are offsets from the tail.  Now
1603 	 * that we have moved the tail, we need to adjust these accordingly.
1604 	 * If the buffer is now empty, reset the tail to the buffer start so as
1605 	 * to avoid splitting inter-process copies whenever possible.
1606 	 */
1607 	assert(uds->uds_len >= advance);
1608 	uds->uds_len -= advance;
1609 
1610 	if (uds->uds_len == 0)
1611 		uds->uds_tail = 0;
1612 
1613 	/*
1614 	 * If uds_last is zero here, it was pointing to the segment we just
1615 	 * (partially) consumed.  By leaving it zero, it will still point to
1616 	 * the new or next segment.
1617 	 */
1618 	if (uds->uds_last > 0) {
1619 		assert(uds->uds_len > 0);
1620 		assert(uds->uds_last >= advance);
1621 		uds->uds_last -= advance;
1622 	}
1623 
1624 	/*
1625 	 * If there were any file descriptors associated with this segment,
1626 	 * close and free them now.
1627 	 */
1628 	if (segflags & UDS_HAS_FDS) {
1629 		assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
1630 		ufd = SIMPLEQ_FIRST(&uds->uds_fds);
1631 		nfds = ufd->ufd_count;
1632 		assert(nfds > 0);
1633 
1634 		while (nfds-- > 0) {
1635 			assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
1636 			ufd = SIMPLEQ_FIRST(&uds->uds_fds);
1637 			SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next);
1638 
1639 			dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
1640 
1641 			closenb(ufd->ufd_fd);
1642 
1643 			SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next);
1644 		}
1645 	}
1646 
1647 	/*
1648 	 * If there is now any data left in the receive buffer, then there has
1649 	 * been a reason that we haven't received it.  For stream sockets, that
1650 	 * reason is that the next segment has ancillary data.  In any case,
1651 	 * this means we should never block the current receive operation
1652 	 * waiting for more data.  Otherwise, we may block on MSG_WAITALL.
1653 	 */
1654 	if (uds->uds_len > 0)
1655 		*may_block = FALSE;
1656 
1657 	/*
1658 	 * If the (non-datagram) socket has a peer that is not shut down for
1659 	 * writing, see if it can be woken up to send more data.  Note that
1660 	 * the event will never be processed immediately.
1661 	 */
1662 	if (uds_is_connected(uds)) {
1663 		assert(uds_get_type(uds) != SOCK_DGRAM);
1664 
1665 		conn = uds->uds_conn;
1666 
1667 		if (!uds_is_shutdown(conn, SFL_SHUT_WR))
1668 			sockevent_raise(&conn->uds_sock, SEV_SEND);
1669 	}
1670 }
1671 
1672 /*
1673  * Process a receive request.  Return OK if the receive request has completed
1674  * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an
1675  * end-of-file condition is reached, or a negative error code on failure.  In
1676  * all cases, the values of 'off' and 'ctl_off' must be updated if any progress
1677  * has been made; if either is non-zero, libsockevent will return the partial
1678  * progress rather than an error code or EOF.
1679  */
1680 int
1681 uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len,
1682 	size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
1683 	socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len,
1684 	endpoint_t user_endpt, int flags, size_t min, int * rflags)
1685 {
1686 	struct udssock *uds = (struct udssock *)sock;
1687 	size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/;
1688 	unsigned int segflags;
1689 	int r, partial, may_block;
1690 
1691 	dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n",
1692 	    uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
1693 	    (ctl_off != NULL) ? *ctl_off : 0, flags));
1694 
1695 	/*
1696 	 * Start by testing whether anything can be received at all, or whether
1697 	 * an error or EOF should be returned instead, or whether the receive
1698 	 * call should be suspended until later otherwise.  If no (regular or
1699 	 * control) data can be received, or if this was a test for select,
1700 	 * we bail out right after.
1701 	 */
1702 	partial = (off != NULL && *off > 0);
1703 
1704 	if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK)
1705 		return r;
1706 
1707 	/*
1708 	 * Copy out regular data, if any.  Do this before copying out control
1709 	 * data, because the latter is harder to undo on failure.  This data
1710 	 * copy function returns returns OK (0) if we are to return a result of
1711 	 * zero bytes (which is *not* EOF) to the caller without doing anything
1712 	 * else.  The function returns a nonzero positive segment length if we
1713 	 * should carry on with the receive call (as it happens, all its other
1714 	 * returned values may in fact be zero).
1715 	 */
1716 	if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags,
1717 	    &datalen, &reslen, &segflags, &credpos)) <= 0)
1718 		return r;
1719 	seglen = (size_t)r;
1720 
1721 	/*
1722 	 * Copy out control data, if any: transfer and copy out records of file
1723 	 * descriptors, and/or copy out sender credentials.  This is the last
1724 	 * part of the call that may fail.
1725 	 */
1726 	if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags,
1727 	    segflags, credpos, rflags)) != OK)
1728 		return r;
1729 
1730 	/*
1731 	 * Now that the call has succeeded, move the tail of the receive
1732 	 * buffer, unless we were merely peeking.
1733 	 */
1734 	if (!(flags & MSG_PEEK))
1735 		uds_recv_advance(uds, seglen, datalen, reslen, segflags,
1736 		    &may_block);
1737 	else
1738 		may_block = FALSE;
1739 
1740 	/*
1741 	 * If the MSG_WAITALL flag was given, we may still have to suspend the
1742 	 * call after partial success.  In particular, the receive call may
1743 	 * suspend after partial success if all of these conditions are met:
1744 	 *
1745 	 *   1) the socket is a stream-type socket;
1746 	 *   2) MSG_WAITALL is set;
1747 	 *   3) MSG_PEEK is not set;
1748 	 *   4) MSG_DONTWAIT is not set (tested upon return);
1749 	 *   5) the socket must not have a pending error (tested upon return);
1750 	 *   6) the socket must not be shut down for reading (tested later);
1751 	 *   7) the socket must still be connected to a peer (no EOF);
1752 	 *   8) the peer must not have been shut down for writing (no EOF);
1753 	 *   9) the next segment, if any, contains no ancillary data.
1754 	 *
1755 	 * Together, these points guarantee that the call could conceivably
1756 	 * receive more after being resumed.  Points 4 to 6 are covered by
1757 	 * libsockevent, which will end the call even if we return SUSPEND
1758 	 * here.  Due to segment merging, we cover point 9 by checking that
1759 	 * there is currently no next segment at all.  Once a new segment
1760 	 * arrives, the ancillary-data test is done then.
1761 	 */
1762 	*off += reslen;
1763 	if ((flags & MSG_WAITALL) && reslen < len && may_block)
1764 		return SUSPEND;
1765 	else
1766 		return OK;
1767 }
1768 
1769 /*
1770  * Test whether a receive request would block.  The given 'min' parameter
1771  * contains the minimum number of bytes that should be possible to receive
1772  * without blocking (the low receive watermark).  Return SUSPEND if the send
1773  * request would block.  Otherwise, return any other error code (including OK
1774  * or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it should be filled
1775  * with the number of bytes available for receipt right now (if not zero).
1776  * Note that if 'size' is not NULL, 'min' will always be zero.
1777  */
1778 int
1779 uds_test_recv(struct sock * sock, size_t min, size_t * size)
1780 {
1781 	struct udssock *uds = (struct udssock *)sock;
1782 	size_t seglen;
1783 	unsigned int segflags;
1784 	int r;
1785 
1786 	if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/,
1787 	    NULL /*may_block*/)) == SUSPEND)
1788 		return r;
1789 
1790 	if (size != NULL && uds->uds_len > 0)
1791 		(void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size,
1792 		    &segflags);
1793 
1794 	return r;
1795 }
1796