xref: /minix/minix/lib/libsockevent/sockevent.c (revision 03ac74ed)
1 /* Socket event dispatching library - by D.C. van Moolenbroek */
2 
3 #include <minix/drivers.h>
4 #include <minix/sockdriver.h>
5 #include <minix/sockevent.h>
6 #include <sys/ioctl.h>
7 
8 #include "sockevent_proc.h"
9 
10 #define US		1000000UL	/* microseconds per second */
11 
12 #define SOCKHASH_SLOTS	256		/* # slots in ID-to-sock hash table */
13 
14 static SLIST_HEAD(, sock) sockhash[SOCKHASH_SLOTS];
15 
16 static SLIST_HEAD(, sock) socktimer;
17 
18 static minix_timer_t sockevent_timer;
19 
20 static SIMPLEQ_HEAD(, sock) sockevent_pending;
21 
22 static sockevent_socket_cb_t sockevent_socket_cb = NULL;
23 
24 static int sockevent_working;
25 
26 static void socktimer_del(struct sock * sock);
27 static void sockevent_cancel_send(struct sock * sock,
28 	struct sockevent_proc * spr, int err);
29 static void sockevent_cancel_recv(struct sock * sock,
30 	struct sockevent_proc * spr, int err);
31 
32 /*
33  * Initialize the hash table of sock objects.
34  */
35 static void
36 sockhash_init(void)
37 {
38 	unsigned int slot;
39 
40 	for (slot = 0; slot < __arraycount(sockhash); slot++)
41 		SLIST_INIT(&sockhash[slot]);
42 }
43 
44 /*
45  * Given a socket identifier, return a hash table slot number.
46  */
47 static unsigned int
48 sockhash_slot(sockid_t id)
49 {
50 
51 	/*
52 	 * The idea of the shift is that a socket driver may offer multiple
53 	 * classes of sockets, and put the class in the higher bits.  The shift
54 	 * aims to prevent that all classes' first sockets end up in the same
55 	 * hash slot.
56 	 */
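	/*
	 * As a worked example of the formula below: with a 16-bit class
	 * shift, the first sockets of classes 0, 1, and 2 have IDs
	 * 0x00000000, 0x00010000, and 0x00020000, which map to slots 0, 1,
	 * and 2 respectively, rather than all to slot 0.
	 */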
57 	return (id + (id >> 16)) % SOCKHASH_SLOTS;
58 }
59 
60 /*
61  * Obtain a sock object from the hash table using its unique identifier.
62  * Return a pointer to the object if found, or NULL otherwise.
63  */
64 static struct sock *
65 sockhash_get(sockid_t id)
66 {
67 	struct sock *sock;
68 	unsigned int slot;
69 
70 	slot = sockhash_slot(id);
71 
72 	SLIST_FOREACH(sock, &sockhash[slot], sock_hash) {
73 		if (sock->sock_id == id)
74 			return sock;
75 	}
76 
77 	return NULL;
78 }
79 
80 /*
81  * Add a sock object to the hash table.  The sock object must have a valid ID
82  * in its 'sock_id' field, and must not be in the hash table already.
83  */
84 static void
85 sockhash_add(struct sock * sock)
86 {
87 	unsigned int slot;
88 
89 	slot = sockhash_slot(sock->sock_id);
90 
91 	SLIST_INSERT_HEAD(&sockhash[slot], sock, sock_hash);
92 }
93 
94 /*
95  * Remove a sock object from the hash table.  The sock object must be in the
96  * hash table.
97  */
98 static void
99 sockhash_del(struct sock * sock)
100 {
101 	unsigned int slot;
102 
103 	slot = sockhash_slot(sock->sock_id);
104 
105 	/* This macro is O(n). */
106 	SLIST_REMOVE(&sockhash[slot], sock, sock, sock_hash);
107 }
108 
109 /*
110  * Reset a socket object to a proper initial state, with a particular socket
111  * identifier, a SOCK_ type, and a socket operations table.  The socket is
112  * added to the ID-to-object hash table.  This function always succeeds.
113  */
114 static void
115 sockevent_reset(struct sock * sock, sockid_t id, int domain, int type,
116 	const struct sockevent_ops * ops)
117 {
118 
119 	assert(sock != NULL);
120 
121 	memset(sock, 0, sizeof(*sock));
122 
123 	sock->sock_id = id;
124 	sock->sock_domain = domain;
125 	sock->sock_type = type;
126 
127 	sock->sock_slowat = 1;
128 	sock->sock_rlowat = 1;
129 
130 	sock->sock_ops = ops;
131 	sock->sock_proc = NULL;
132 	sock->sock_select.ss_endpt = NONE;
133 
134 	sockhash_add(sock);
135 }
136 
137 /*
138  * Initialize a new socket that will serve as an accepted socket on the given
139  * listening socket 'sock'.  The new socket is given as 'newsock', and its new
140  * socket identifier is given as 'newid'.  This function always succeeds.
141  */
142 void
143 sockevent_clone(struct sock * sock, struct sock * newsock, sockid_t newid)
144 {
145 
146 	sockevent_reset(newsock, newid, (int)sock->sock_domain,
147 	    sock->sock_type, sock->sock_ops);
148 
149 	/* These are the settings that are currently inherited. */
150 	newsock->sock_opt = sock->sock_opt & ~SO_ACCEPTCONN;
151 	newsock->sock_linger = sock->sock_linger;
152 	newsock->sock_stimeo = sock->sock_stimeo;
153 	newsock->sock_rtimeo = sock->sock_rtimeo;
154 	newsock->sock_slowat = sock->sock_slowat;
155 	newsock->sock_rlowat = sock->sock_rlowat;
156 
157 	newsock->sock_flags |= SFL_CLONED;
158 }
159 
160 /*
161  * A new socket has just been accepted.  The corresponding listening socket is
162  * given as 'sock'.  The new socket has ID 'newid', and if it had not already
163  * been added to the hash table through sockevent_clone() before, 'newsock' is
164  * a non-NULL pointer which identifies the socket object to clone into.
165  */
166 static void
167 sockevent_accepted(struct sock * sock, struct sock * newsock, sockid_t newid)
168 {
169 
170 	if (newsock == NULL) {
171 		if ((newsock = sockhash_get(newid)) == NULL)
172 			panic("libsockevent: socket driver returned unknown "
173 			    "ID %d from accept callback", newid);
174 	} else
175 		sockevent_clone(sock, newsock, newid);
176 
177 	assert(newsock->sock_flags & SFL_CLONED);
178 	newsock->sock_flags &= ~SFL_CLONED;
179 }
180 
181 /*
182  * Allocate a sock object, by asking the socket driver for one.  On success,
183  * return OK, with a pointer to the new object stored in 'sockp'.  This new
184  * object has all its fields set to initial values, in part based on the given
185  * parameters.  On failure, return an error code.  Failure has two typical
186  * cause: either the given domain, type, protocol combination is not supported,
187  * or the socket driver is out of sockets (globally or for this combination).
188  */
189 static int
190 sockevent_alloc(int domain, int type, int protocol, endpoint_t user_endpt,
191 	struct sock ** sockp)
192 {
193 	struct sock *sock;
194 	const struct sockevent_ops *ops;
195 	sockid_t r;
196 
197 	/*
198 	 * Verify that the given domain is sane.  Unlike the type and protocol,
199 	 * the domain is already verified by VFS, so we do not limit ourselves to
200 	 * the known domains here.  The range check lets us store the domain in a byte.
201 	 */
202 	if (domain < 0 || domain > UINT8_MAX)
203 		return EAFNOSUPPORT;
204 
205 	/* Make sure that the library has actually been initialized. */
206 	if (sockevent_socket_cb == NULL)
207 		panic("libsockevent: not initialized");
208 
209 	sock = NULL;
210 	ops = NULL;
211 
212 	/*
213 	 * Ask the socket driver to create a socket for the given combination
214 	 * of domain, type, and protocol.  If it can, let it return a new sock
215 	 * object, a unique socket identifier for that object, and an
216 	 * operations table for it.
217 	 */
218 	if ((r = sockevent_socket_cb(domain, type, protocol, user_endpt, &sock,
219 	    &ops)) < 0)
220 		return r;
221 
222 	assert(sock != NULL);
223 	assert(ops != NULL);
224 
225 	sockevent_reset(sock, r, domain, type, ops);
226 
227 	*sockp = sock;
228 	return OK;
229 }
230 
231 /*
232  * Free a previously allocated sock object.
233  */
234 static void
235 sockevent_free(struct sock * sock)
236 {
237 	const struct sockevent_ops *ops;
238 
239 	assert(sock->sock_proc == NULL);
240 
241 	socktimer_del(sock);
242 
243 	sockhash_del(sock);
244 
245 	/*
246 	 * Invalidate the operations table on the socket, before freeing the
247 	 * socket.  This allows us to detect cases where sockevent functions
248 	 * are called on sockets that have already been freed.
249 	 */
250 	ops = sock->sock_ops;
251 	sock->sock_ops = NULL;
252 
253 	assert(ops != NULL);
254 	assert(ops->sop_free != NULL);
255 
256 	ops->sop_free(sock);
257 }
258 
259 /*
260  * Create a new socket.
261  */
262 static sockid_t
263 sockevent_socket(int domain, int type, int protocol, endpoint_t user_endpt)
264 {
265 	struct sock *sock;
266 	int r;
267 
268 	if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
269 	    &sock)) != OK)
270 		return r;
271 
272 	return sock->sock_id;
273 }
274 
275 /*
276  * Create a pair of connected sockets.
277  */
278 static int
279 sockevent_socketpair(int domain, int type, int protocol, endpoint_t user_endpt,
280 	sockid_t id[2])
281 {
282 	struct sock *sock1, *sock2;
283 	int r;
284 
285 	if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
286 	    &sock1)) != OK)
287 		return r;
288 
289 	/* Creating socket pairs is not always supported. */
290 	if (sock1->sock_ops->sop_pair == NULL) {
291 		sockevent_free(sock1);
292 
293 		return EOPNOTSUPP;
294 	}
295 
296 	if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
297 	    &sock2)) != OK) {
298 		sockevent_free(sock1);
299 
300 		return r;
301 	}
302 
303 	assert(sock1->sock_ops == sock2->sock_ops);
304 
305 	r = sock1->sock_ops->sop_pair(sock1, sock2, user_endpt);
306 
307 	if (r != OK) {
308 		sockevent_free(sock2);
309 		sockevent_free(sock1);
310 
311 		return r;
312 	}
313 
314 	id[0] = sock1->sock_id;
315 	id[1] = sock2->sock_id;
316 	return OK;
317 }
318 
319 /*
320  * A send request returned EPIPE.  If desired, send a SIGPIPE signal to the
321  * user process that issued the request.
322  */
323 static void
324 sockevent_sigpipe(struct sock * sock, endpoint_t user_endpt, int flags)
325 {
326 
327 	/*
328 	 * POSIX says that pipe signals should be generated for SOCK_STREAM
329 	 * sockets.  Linux does just this; NetBSD raises signals for all socket
330 	 * types.
331 	 */
332 	if (sock->sock_type != SOCK_STREAM)
333 		return;
334 
335 	/*
336 	 * Why would there be fewer than four ways to do the same thing?
337 	 * O_NOSIGPIPE, MSG_NOSIGNAL, SO_NOSIGPIPE, and of course blocking
338 	 * SIGPIPE.  VFS already sets MSG_NOSIGNAL for calls on sockets with
339 	 * O_NOSIGPIPE.  The fact that SO_NOSIGPIPE is a thing is also the
340 	 * reason why we cannot let VFS handle signal generation altogether.
341 	 */
342 	if (flags & MSG_NOSIGNAL)
343 		return;
344 	if (sock->sock_opt & SO_NOSIGPIPE)
345 		return;
346 
347 	/*
348 	 * Send a SIGPIPE signal to the user process.  Unfortunately we cannot
349 	 * guarantee that the SIGPIPE reaches the user process before the send
350 	 * call returns.  Usually, the scheduling priorities of system services
351 	 * are such that the signal is likely to arrive first anyway, but if
352 	 * timely arrival of the signal is required, a more fundamental change
353 	 * to the system would be needed.
354 	 */
355 	sys_kill(user_endpt, SIGPIPE);
356 }
357 
358 /*
359  * Suspend a request without data, that is, a bind, connect, accept, or close
360  * request.
361  */
362 static void
363 sockevent_suspend(struct sock * sock, unsigned int event,
364 	const struct sockdriver_call * __restrict call, endpoint_t user_endpt)
365 {
366 	struct sockevent_proc *spr, **sprp;
367 
368 	/* There is one slot for each process, so this should never fail. */
369 	if ((spr = sockevent_proc_alloc()) == NULL)
370 		panic("libsockevent: too many suspended processes");
371 
372 	spr->spr_next = NULL;
373 	spr->spr_event = event;
374 	spr->spr_timer = FALSE;
375 	spr->spr_call = *call;
376 	spr->spr_endpt = user_endpt;
377 
378 	/*
379 	 * Add the request to the tail of the queue.  This operation is O(n),
380 	 * but the number of suspended requests per socket is expected to be
381 	 * low at all times.
382 	 */
383 	for (sprp = &sock->sock_proc; *sprp != NULL;
384 	     sprp = &(*sprp)->spr_next);
385 	*sprp = spr;
386 }
387 
388 /*
389  * Suspend a request with data, that is, a send or receive request.
390  */
391 static void
392 sockevent_suspend_data(struct sock * sock, unsigned int event, int timer,
393 	const struct sockdriver_call * __restrict call, endpoint_t user_endpt,
394 	const struct sockdriver_data * __restrict data, size_t len, size_t off,
395 	const struct sockdriver_data * __restrict ctl, socklen_t ctl_len,
396 	socklen_t ctl_off, int flags, int rflags, clock_t time)
397 {
398 	struct sockevent_proc *spr, **sprp;
399 
400 	/* There is one slot for each process, so this should never fail. */
401 	if ((spr = sockevent_proc_alloc()) == NULL)
402 		panic("libsockevent: too many suspended processes");
403 
404 	spr->spr_next = NULL;
405 	spr->spr_event = event;
406 	spr->spr_timer = timer;
407 	spr->spr_call = *call;
408 	spr->spr_endpt = user_endpt;
409 	sockdriver_pack_data(&spr->spr_data, call, data, len);
410 	spr->spr_datalen = len;
411 	spr->spr_dataoff = off;
412 	sockdriver_pack_data(&spr->spr_ctl, call, ctl, ctl_len);
413 	spr->spr_ctllen = ctl_len;
414 	spr->spr_ctloff = ctl_off;
415 	spr->spr_flags = flags;
416 	spr->spr_rflags = rflags;
417 	spr->spr_time = time;
418 
419 	/*
420 	 * Add the request to the tail of the queue.  This operation is O(n),
421 	 * but the number of suspended requests per socket is expected to be
422 	 * low at all times.
423 	 */
424 	for (sprp = &sock->sock_proc; *sprp != NULL;
425 	     sprp = &(*sprp)->spr_next);
426 	*sprp = spr;
427 }
428 
429 /*
430  * Return TRUE if there are any suspended requests on the given socket's queue
431  * that match any of the events in the given event mask, or FALSE otherwise.
432  */
433 static int
434 sockevent_has_suspended(struct sock * sock, unsigned int mask)
435 {
436 	struct sockevent_proc *spr;
437 
438 	for (spr = sock->sock_proc; spr != NULL; spr = spr->spr_next)
439 		if (spr->spr_event & mask)
440 			return TRUE;
441 
442 	return FALSE;
443 }
444 
445 /*
446  * Check whether the given call is on the given socket's queue of suspended
447  * requests.  If so, remove it from the queue and return a pointer to the
448  * suspension data structure.  The caller is then responsible for freeing that
449  * data structure using sockevent_proc_free().  If the call was not found, the
450  * function returns NULL.
451  */
452 static struct sockevent_proc *
453 sockevent_unsuspend(struct sock * sock, const struct sockdriver_call * call)
454 {
455 	struct sockevent_proc *spr, **sprp;
456 
457 	/* Find the suspended request being canceled. */
458 	for (sprp = &sock->sock_proc; (spr = *sprp) != NULL;
459 	    sprp = &spr->spr_next) {
460 		if (spr->spr_call.sc_endpt == call->sc_endpt &&
461 		    spr->spr_call.sc_req == call->sc_req) {
462 			/* Found; remove and return it. */
463 			*sprp = spr->spr_next;
464 
465 			return spr;
466 		}
467 	}
468 
469 	return NULL;
470 }
471 
472 /*
473  * Attempt to resume the given suspended request for the given socket object.
474  * Return TRUE if the suspended request has been fully resumed and can be
475  * removed from the queue of suspended requests, or FALSE if it has not been
476  * fully resumed and should stay on the queue.  In the latter case, no
477  * resumption will be attempted for other suspended requests of the same type.
478  */
479 static int
480 sockevent_resume(struct sock * sock, struct sockevent_proc * spr)
481 {
482 	struct sock *newsock;
483 	struct sockdriver_data data, ctl;
484 	char addr[SOCKADDR_MAX];
485 	socklen_t addr_len;
486 	size_t len, min;
487 	sockid_t r;
488 
489 	switch (spr->spr_event) {
490 	case SEV_CONNECT:
491 		/*
492 		 * If the connect call was suspended for the purpose of
493 		 * intercepting resumption, simply remove it from the queue.
494 		 */
495 		if (spr->spr_call.sc_endpt == NONE)
496 			return TRUE;
497 
498 		/* FALLTHROUGH */
499 	case SEV_BIND:
500 		if ((r = sock->sock_err) != OK)
501 			sock->sock_err = OK;
502 
503 		sockdriver_reply_generic(&spr->spr_call, r);
504 
505 		return TRUE;
506 
507 	case SEV_ACCEPT:
508 		/*
509 		 * An accept call can only have been suspended on a socket that
510 		 * was in listening mode.
511 		 */
512 		assert(sock->sock_opt & SO_ACCEPTCONN);
513 
514 		addr_len = 0;
515 		newsock = NULL;
516 
517 		/*
518 		 * This call is suspended, which implies that the call table
519 		 * pointer has already been tested to be non-NULL.
520 		 */
521 		if ((r = sock->sock_ops->sop_accept(sock,
522 		    (struct sockaddr *)&addr, &addr_len, spr->spr_endpt,
523 		    &newsock)) == SUSPEND)
524 			return FALSE;
525 
526 		if (r >= 0) {
527 			assert(addr_len <= sizeof(addr));
528 
529 			sockevent_accepted(sock, newsock, r);
530 		}
531 
532 		sockdriver_reply_accept(&spr->spr_call, r,
533 		    (struct sockaddr *)&addr, addr_len);
534 
535 		return TRUE;
536 
537 	case SEV_SEND:
538 		if (sock->sock_err != OK || (sock->sock_flags & SFL_SHUT_WR)) {
539 			if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
540 				r = (int)spr->spr_dataoff;
541 			else if ((r = sock->sock_err) != OK)
542 				sock->sock_err = OK;
543 			else
544 				r = EPIPE;
545 		} else {
546 			sockdriver_unpack_data(&data, &spr->spr_call,
547 			    &spr->spr_data, spr->spr_datalen);
548 			sockdriver_unpack_data(&ctl, &spr->spr_call,
549 			    &spr->spr_ctl, spr->spr_ctllen);
550 
551 			len = spr->spr_datalen - spr->spr_dataoff;
552 
553 			min = sock->sock_slowat;
554 			if (min > len)
555 				min = len;
556 
557 			/*
558 			 * As mentioned elsewhere, we do not save the address
559 			 * upon suspension so we cannot supply it anymore here.
560 			 */
561 			r = sock->sock_ops->sop_send(sock, &data, len,
562 			    &spr->spr_dataoff, &ctl,
563 			    spr->spr_ctllen - spr->spr_ctloff,
564 			    &spr->spr_ctloff, NULL, 0, spr->spr_endpt,
565 			    spr->spr_flags, min);
566 
567 			assert(r <= 0);
568 
569 			if (r == SUSPEND)
570 				return FALSE;
571 
572 			/*
573 			 * If an error occurred but some data were already
574 			 * sent, return the progress rather than the error.
575 			 * Note that if the socket driver detects an
576 			 * asynchronous error during the send, it itself must
577 			 * perform this check and call sockevent_set_error() as
578 			 * needed, to make sure the error does not get lost.
579 			 */
580 			if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
581 				r = spr->spr_dataoff;
582 		}
583 
584 		if (r == EPIPE)
585 			sockevent_sigpipe(sock, spr->spr_endpt,
586 			    spr->spr_flags);
587 
588 		sockdriver_reply_generic(&spr->spr_call, r);
589 
590 		return TRUE;
591 
592 	case SEV_RECV:
593 		addr_len = 0;
594 
595 		if (sock->sock_flags & SFL_SHUT_RD)
596 			r = SOCKEVENT_EOF;
597 		else {
598 			len = spr->spr_datalen - spr->spr_dataoff;
599 
600 			if (sock->sock_err == OK) {
601 				min = sock->sock_rlowat;
602 				if (min > len)
603 					min = len;
604 			} else
605 				min = 0;
606 
607 			sockdriver_unpack_data(&data, &spr->spr_call,
608 			    &spr->spr_data, spr->spr_datalen);
609 			sockdriver_unpack_data(&ctl, &spr->spr_call,
610 			    &spr->spr_ctl, spr->spr_ctllen);
611 
612 			r = sock->sock_ops->sop_recv(sock, &data, len,
613 			    &spr->spr_dataoff, &ctl,
614 			    spr->spr_ctllen - spr->spr_ctloff,
615 			    &spr->spr_ctloff, (struct sockaddr *)&addr,
616 			    &addr_len, spr->spr_endpt, spr->spr_flags, min,
617 			    &spr->spr_rflags);
618 
619 			/*
620 			 * If the call remains suspended but a socket error is
621 			 * pending, return the pending socket error instead.
622 			 */
623 			if (r == SUSPEND) {
624 				if (sock->sock_err == OK)
625 					return FALSE;
626 
627 				r = SOCKEVENT_EOF;
628 			}
629 
630 			assert(addr_len <= sizeof(addr));
631 		}
632 
633 		/*
634 		 * If the receive call reported success, or if some data were
635 		 * already received, return the (partial) result.  Otherwise,
636 		 * return a pending error if any, or otherwise a regular error
637 		 * or 0 for EOF.
638 		 */
639 		if (r == OK || spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
640 			r = (int)spr->spr_dataoff;
641 		else if (sock->sock_err != OK) {
642 			r = sock->sock_err;
643 
644 			sock->sock_err = OK;
645 		} else if (r == SOCKEVENT_EOF)
646 			r = 0; /* EOF */
647 
648 		sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff,
649 		    (struct sockaddr *)&addr, addr_len, spr->spr_rflags);
650 
651 		return TRUE;
652 
653 	case SEV_CLOSE:
654 		sockdriver_reply_generic(&spr->spr_call, OK);
655 
656 		return TRUE;
657 
658 	default:
659 		panic("libsockevent: process suspended on unknown event 0x%x",
660 		    spr->spr_event);
661 	}
662 }
663 
664 /*
665  * Return TRUE if the given socket is ready for reading for a select call, or
666  * FALSE otherwise.
667  */
668 static int
669 sockevent_test_readable(struct sock * sock)
670 {
671 	int r;
672 
673 	/*
674 	 * The meaning of "ready-to-read" depends on whether the socket is a
675 	 * listening socket or not.  For the former, it is a test on whether
676 	 * there are any new sockets to accept.  However, shutdown flags take
677 	 * precedence in both cases.
678 	 */
679 	if (sock->sock_flags & SFL_SHUT_RD)
680 		return TRUE;
681 
682 	if (sock->sock_err != OK)
683 		return TRUE;
684 
685 	/*
686 	 * Depending on whether this is a listening-mode socket, test whether
687 	 * either accepts or receives would block.
688 	 */
689 	if (sock->sock_opt & SO_ACCEPTCONN) {
690 		if (sock->sock_ops->sop_test_accept == NULL)
691 			return TRUE;
692 
693 		r = sock->sock_ops->sop_test_accept(sock);
694 	} else {
695 		if (sock->sock_ops->sop_test_recv == NULL)
696 			return TRUE;
697 
698 		r = sock->sock_ops->sop_test_recv(sock, sock->sock_rlowat,
699 		    NULL);
700 	}
701 
702 	return (r != SUSPEND);
703 }
704 
705 /*
706  * Return TRUE if the given socket is ready for writing for a select call, or
707  * FALSE otherwise.
708  */
709 static int
710 sockevent_test_writable(struct sock * sock)
711 {
712 	int r;
713 
714 	if (sock->sock_err != OK)
715 		return TRUE;
716 
717 	if (sock->sock_flags & SFL_SHUT_WR)
718 		return TRUE;
719 
720 	if (sock->sock_ops->sop_test_send == NULL)
721 		return TRUE;
722 
723 	/*
724 	 * Test whether sends would block.  The low send watermark is relevant
725 	 * for stream-type sockets only.
726 	 */
727 	r = sock->sock_ops->sop_test_send(sock, sock->sock_slowat);
728 
729 	return (r != SUSPEND);
730 }
731 
732 /*
733  * Test whether any of the given select operations are ready on the given
734  * socket.  Return the subset of ready operations; zero if none.
735  */
736 static unsigned int
737 sockevent_test_select(struct sock * sock, unsigned int ops)
738 {
739 	unsigned int ready_ops;
740 
741 	assert(!(ops & ~(SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR)));
742 
743 	/*
744 	 * We do not support the "bind in progress" case here.  If a blocking
745 	 * bind call is in progress, the file descriptor should not be ready
746 	 * for either reading or writing.  Currently, socket drivers will have
747 	 * to cover this case themselves.  Otherwise we would have to check the
748 	 * queue of suspended calls, or create a custom flag for this.
749 	 */
750 
751 	ready_ops = 0;
752 
753 	if ((ops & SDEV_OP_RD) && sockevent_test_readable(sock))
754 		ready_ops |= SDEV_OP_RD;
755 
756 	if ((ops & SDEV_OP_WR) && sockevent_test_writable(sock))
757 		ready_ops |= SDEV_OP_WR;
758 
759 	/* TODO: OOB receive support. */
760 
761 	return ready_ops;
762 }
763 
764 /*
765  * Fire the given mask of events on the given socket object now.
766  */
767 static void
768 sockevent_fire(struct sock * sock, unsigned int mask)
769 {
770 	struct sockevent_proc *spr, **sprp;
771 	unsigned int r, flag, ops;
772 
773 	/*
774 	 * A completed connection attempt (successful or not) also always
775 	 * implies that the socket becomes writable.  For convenience we
776 	 * enforce this rule here, because it is easy to forget.  Note that in
777 	 * any case, a suspended connect request should be the first in the
778 	 * list, so we do not risk returning 0 from a connect call as a result
779 	 * of sock_err getting eaten by another resumed call.
780 	 */
781 	if (mask & SEV_CONNECT)
782 		mask |= SEV_SEND;
783 
784 	/*
785 	 * First try resuming regular system calls.
786 	 */
787 	for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) {
788 		flag = spr->spr_event;
789 
790 		if ((mask & flag) && sockevent_resume(sock, spr)) {
791 			*sprp = spr->spr_next;
792 
793 			sockevent_proc_free(spr);
794 		} else {
795 			mask &= ~flag;
796 
797 			sprp = &spr->spr_next;
798 		}
799 	}
800 
801 	/*
802 	 * Then see if we can satisfy pending select queries.
803 	 */
804 	if ((mask & (SEV_ACCEPT | SEV_SEND | SEV_RECV)) &&
805 	    sock->sock_select.ss_endpt != NONE) {
806 		assert(sock->sock_selops != 0);
807 
808 		/*
809 		 * Only retest select operations that, based on the given event
810 		 * mask, could possibly be satisfied now.
811 		 */
812 		ops = sock->sock_selops;
813 		if (!(mask & (SEV_ACCEPT | SEV_RECV)))
814 			ops &= ~SDEV_OP_RD;
815 		if (!(mask & SEV_SEND))
816 			ops &= ~SDEV_OP_WR;
817 		if (!(0))			/* TODO: OOB receive support */
818 			ops &= ~SDEV_OP_ERR;
819 
820 		/* Are there any operations to test? */
821 		if (ops != 0) {
822 			/* Test those operations. */
823 			r = sockevent_test_select(sock, ops);
824 
825 			/* Were any satisfied? */
826 			if (r != 0) {
827 				/* Let the caller know. */
828 				sockdriver_reply_select(&sock->sock_select,
829 				    sock->sock_id, r);
830 
831 				sock->sock_selops &= ~r;
832 
833 				/* Are there any saved operations left now? */
834 				if (sock->sock_selops == 0)
835 					sock->sock_select.ss_endpt = NONE;
836 			}
837 		}
838 	}
839 
840 	/*
841 	 * Finally, a SEV_CLOSE event unconditionally frees the sock object.
842 	 * This event should be fired only for sockets that are either not yet,
843 	 * or not anymore, in use by userland.
844 	 */
845 	if (mask & SEV_CLOSE) {
846 		assert(sock->sock_flags & (SFL_CLONED | SFL_CLOSING));
847 
848 		sockevent_free(sock);
849 	}
850 }
851 
852 /*
853  * Process all pending events.  Events must still be blocked, so that if
854  * handling one event generates a new event, that event is handled from here
855  * rather than immediately.
856  */
857 static void
858 sockevent_pump(void)
859 {
860 	struct sock *sock;
861 	unsigned int mask;
862 
863 	assert(sockevent_working);
864 
865 	while (!SIMPLEQ_EMPTY(&sockevent_pending)) {
866 		sock = SIMPLEQ_FIRST(&sockevent_pending);
867 		SIMPLEQ_REMOVE_HEAD(&sockevent_pending, sock_next);
868 
869 		mask = sock->sock_events;
870 		assert(mask != 0);
871 		sock->sock_events = 0;
872 
873 		sockevent_fire(sock, mask);
874 		/*
875 		 * At this point, the sock object may already have been added back
876 		 * to the event list, or even be deallocated altogether.
877 		 */
878 	}
879 }
880 
881 /*
882  * Return TRUE if any events are pending on any sockets, or FALSE otherwise.
883  */
884 static int
885 sockevent_has_events(void)
886 {
887 
888 	return (!SIMPLEQ_EMPTY(&sockevent_pending));
889 }
890 
891 /*
892  * Raise the given bitwise-OR'ed set of events on the given socket object.
893  * Depending on the context of the call, the events may or may not be
894  * processed immediately.
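 *
 * As a usage sketch (hypothetical driver code, not part of this library): a
 * driver that has just queued newly received data on 'sock' would call
 * sockevent_raise(sock, SEV_RECV), resuming any suspended receive calls and
 * retesting any pending select queries on that socket.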
895  */
896 void
897 sockevent_raise(struct sock * sock, unsigned int mask)
898 {
899 
900 	assert(sock->sock_ops != NULL);
901 
902 	/*
903 	 * Handle SEV_CLOSE first.  This event must not be deferred, so as to
904 	 * let socket drivers recycle sock objects as they are needed.  For
905 	 * example, a user-closed TCP socket may stay open to transmit the
906 	 * remainder of its send buffer, until the TCP driver runs out of
907 	 * sockets, in which case the connection is aborted.  The driver would
908 	 * then raise SEV_CLOSE on the sock object so as to clean it up, and
909 	 * immediately reuse it afterward.  If the close event were to be
910 	 * deferred, this immediate reuse would not be possible.
911 	 *
912 	 * The sop_free() callback routine may not raise new events, and thus,
913 	 * the state of 'sockevent_working' need not be checked or set here.
914 	 */
915 	if (mask & SEV_CLOSE) {
916 		assert(mask == SEV_CLOSE);
917 
918 		sockevent_fire(sock, mask);
919 
920 		return;
921 	}
922 
923 	/*
924 	 * If we are currently processing a socket message, store the event for
925 	 * later.  If not, this call is not coming from inside libsockevent,
926 	 * and we must handle the event immediately.
927 	 */
928 	if (sockevent_working) {
929 		assert(mask != 0);
930 		assert(mask <= UCHAR_MAX); /* sock_events field size check */
931 
932 		if (sock->sock_events == 0)
933 			SIMPLEQ_INSERT_TAIL(&sockevent_pending, sock,
934 			    sock_next);
935 
936 		sock->sock_events |= mask;
937 	} else {
938 		sockevent_working = TRUE;
939 
940 		sockevent_fire(sock, mask);
941 
942 		if (sockevent_has_events())
943 			sockevent_pump();
944 
945 		sockevent_working = FALSE;
946 	}
947 }
948 
949 /*
950  * Set a pending error on the socket object, and wake up any suspended
951  * operations that are affected by this.
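 *
 * For example, a driver detecting an asynchronous connection reset might call
 * sockevent_set_error(sock, ECONNRESET); any blocked bind, connect, send, and
 * receive calls on the socket are then resumed with that pending error.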
952  */
953 void
954 sockevent_set_error(struct sock * sock, int err)
955 {
956 
957 	assert(err < 0);
958 	assert(sock->sock_ops != NULL);
959 
960 	/* If an error was set already, it will be overridden. */
961 	sock->sock_err = err;
962 
963 	sockevent_raise(sock, SEV_BIND | SEV_CONNECT | SEV_SEND | SEV_RECV);
964 }
965 
966 /*
967  * Initialize timer-related data structures.
968  */
969 static void
970 socktimer_init(void)
971 {
972 
973 	SLIST_INIT(&socktimer);
974 
975 	init_timer(&sockevent_timer);
976 }
977 
978 /*
979  * Check whether the given socket object has any suspended requests that have
980  * now expired.  If so, cancel them.  Also, if the socket object has any
981  * suspended requests with a timeout that has not yet expired, return the
982  * earliest (relative) timeout of all of them, or TMR_NEVER if no such requests
983  * are present.
984  */
985 static clock_t
986 sockevent_expire(struct sock * sock, clock_t now)
987 {
988 	struct sockevent_proc *spr, **sprp;
989 	clock_t lowest, left;
990 	int r;
991 
992 	/*
993 	 * First handle the case that the socket is closed.  In this case,
994 	 * there may be a linger timer, although the socket may also simply
995 	 * still be on the timer list because of a request that did not time
996 	 * out right before the socket was closed.
997 	 */
998 	if (sock->sock_flags & SFL_CLOSING) {
999 		/* Was there a linger timer and has it expired? */
1000 		if ((sock->sock_opt & SO_LINGER) &&
1001 		    tmr_is_first(sock->sock_linger, now)) {
1002 			assert(sock->sock_ops->sop_close != NULL);
1003 
1004 			/*
1005 			 * Whatever happens next, we must now resume the
1006 			 * pending close operation, if it was not canceled
1007 			 * earlier.  As before, we return OK rather than the
1008 			 * standardized EWOULDBLOCK, to ensure that the user
1009 			 * process knows the file descriptor has been closed.
1010 			 */
1011 			if ((spr = sock->sock_proc) != NULL) {
1012 				assert(spr->spr_event == SEV_CLOSE);
1013 				assert(spr->spr_next == NULL);
1014 
1015 				sock->sock_proc = NULL;
1016 
1017 				sockdriver_reply_generic(&spr->spr_call, OK);
1018 
1019 				sockevent_proc_free(spr);
1020 			}
1021 
1022 			/*
1023 			 * Tell the socket driver that closing the socket is
1024 			 * now a bit more desired than the last time we asked.
1025 			 */
1026 			r = sock->sock_ops->sop_close(sock, TRUE /*force*/);
1027 
1028 			assert(r == OK || r == SUSPEND);
1029 
1030 			/*
1031 			 * The linger timer fires once.  After that, the socket
1032 			 * driver is free to decide that it still will not
1033 			 * close the socket.  If it does, do not fire the
1034 			 * linger timer again.
1035 			 */
1036 			if (r == SUSPEND)
1037 				sock->sock_opt &= ~SO_LINGER;
1038 			else
1039 				sockevent_free(sock);
1040 		}
1041 
1042 		return TMR_NEVER;
1043 	}
1044 
1045 	/*
1046 	 * Then see if any send and/or receive requests have expired.  Also see
1047 	 * if there are any send and/or receive requests left that have not yet
1048 	 * expired but do have a timeout, so that we can return the lowest of
1049 	 * those timeouts.
1050 	 */
1051 	lowest = TMR_NEVER;
1052 
1053 	for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) {
1054 		/* Skip requests without a timeout. */
1055 		if (spr->spr_timer == 0) {
1056 			sprp = &spr->spr_next;
1057 
1058 			continue;
1059 		}
1060 
1061 		assert(spr->spr_event == SEV_SEND ||
1062 		    spr->spr_event == SEV_RECV);
1063 
1064 		/*
1065 		 * If the request has expired, cancel it and remove it from the
1066 		 * list.  Otherwise, see if the request has the lowest number
1067 		 * of ticks until its timeout so far.
1068 		 */
1069 		if (tmr_is_first(spr->spr_time, now)) {
1070 			*sprp = spr->spr_next;
1071 
1072 			if (spr->spr_event == SEV_SEND)
1073 				sockevent_cancel_send(sock, spr, EWOULDBLOCK);
1074 			else
1075 				sockevent_cancel_recv(sock, spr, EWOULDBLOCK);
1076 
1077 			sockevent_proc_free(spr);
1078 		} else {
1079 			left = spr->spr_time - now;
1080 
1081 			if (lowest == TMR_NEVER || lowest > left)
1082 				lowest = left;
1083 
1084 			sprp = &spr->spr_next;
1085 		}
1086 	}
1087 
1088 	return lowest;
1089 }
1090 
1091 /*
1092  * The socket event alarm went off.  Go through the set of socket objects with
1093  * timers, and see if any of their requests have now expired.  Set a new alarm
1094  * as necessary.
1095  */
1096 static void
1097 socktimer_expire(int arg __unused)
1098 {
1099 	SLIST_HEAD(, sock) oldtimer;
1100 	struct sock *sock, *tsock;
1101 	clock_t now, lowest, left;
1102 	int working;
1103 
1104 	/*
1105 	 * This function may or may not be called from a context where we are
1106 	 * already deferring events, so we have to cover both cases here.
1107 	 */
1108 	if ((working = sockevent_working) == FALSE)
1109 		sockevent_working = TRUE;
1110 
1111 	/* Start a new list. */
1112 	memcpy(&oldtimer, &socktimer, sizeof(oldtimer));
1113 	SLIST_INIT(&socktimer);
1114 
1115 	now = getticks();
1116 	lowest = TMR_NEVER;
1117 
1118 	/*
1119 	 * Go through all sockets that have or had a request with a timeout,
1120 	 * canceling any expired requests and building a new list of sockets
1121 	 * that still have requests with timeouts as we go.
1122 	 */
1123 	SLIST_FOREACH_SAFE(sock, &oldtimer, sock_timer, tsock) {
1124 		assert(sock->sock_flags & SFL_TIMER);
1125 		sock->sock_flags &= ~SFL_TIMER;
1126 
1127 		left = sockevent_expire(sock, now);
1128 		/*
1129 		 * The sock object may already have been deallocated now.
1130 		 * If 'next' is TMR_NEVER, do not touch 'sock' anymore.
1131 		 */
1132 
1133 		if (left != TMR_NEVER) {
1134 			if (lowest == TMR_NEVER || lowest > left)
1135 				lowest = left;
1136 
1137 			SLIST_INSERT_HEAD(&socktimer, sock, sock_timer);
1138 
1139 			sock->sock_flags |= SFL_TIMER;
1140 		}
1141 	}
1142 
1143 	/* If there is a new lowest timeout at all, set a new timer. */
1144 	if (lowest != TMR_NEVER)
1145 		set_timer(&sockevent_timer, lowest, socktimer_expire, 0);
1146 
1147 	if (!working) {
1148 		/* If any new events were raised, process them now. */
1149 		if (sockevent_has_events())
1150 			sockevent_pump();
1151 
1152 		sockevent_working = FALSE;
1153 	}
1154 }
1155 
1156 /*
1157  * Set a timer for the given (relative) number of clock ticks, adding the
1158  * associated socket object to the set of socket objects with timers, if it was
1159  * not already in that set.  Set a new alarm if necessary, and return the
1160  * absolute timeout for the timer.  Since the timers list is maintained lazily,
1161  * the caller need not take the object off the set if the call was canceled
1162  * later; see also socktimer_del().
1163  */
1164 static clock_t
1165 socktimer_add(struct sock * sock, clock_t ticks)
1166 {
1167 	clock_t now;
1168 
1169 	/*
1170 	 * Relative time comparisons require that any two times are no more
1171 	 * than half the comparison space (clock_t, unsigned long) apart.
1172 	 */
1173 	assert(ticks <= TMRDIFF_MAX);
1174 
1175 	/* If the socket was not already on the timers list, put it on. */
1176 	if (!(sock->sock_flags & SFL_TIMER)) {
1177 		SLIST_INSERT_HEAD(&socktimer, sock, sock_timer);
1178 
1179 		sock->sock_flags |= SFL_TIMER;
1180 	}
1181 
1182 	/*
1183 	 * (Re)set the timer if either it was not running at all or this new
1184 	 * timeout will occur sooner than the currently scheduled alarm.  Note
1185 	 * that setting a timer that was already set is allowed.
1186 	 */
1187 	now = getticks();
1188 
1189 	if (!tmr_is_set(&sockevent_timer) ||
1190 	    tmr_is_first(now + ticks, tmr_exp_time(&sockevent_timer)))
1191 		set_timer(&sockevent_timer, ticks, socktimer_expire, 0);
1192 
1193 	/* Return the absolute timeout. */
1194 	return now + ticks;
1195 }
1196 
1197 /*
1198  * Remove a socket object from the set of socket objects with timers.  Since
1199  * the timer list is maintained lazily, this needs to be done only right before
1200  * the socket object is freed.
1201  */
1202 static void
1203 socktimer_del(struct sock * sock)
1204 {
1205 
1206 	if (sock->sock_flags & SFL_TIMER) {
1207 		/* This macro is O(n). */
1208 		SLIST_REMOVE(&socktimer, sock, sock, sock_timer);
1209 
1210 		sock->sock_flags &= ~SFL_TIMER;
1211 	}
1212 }
1213 
1214 /*
1215  * Bind a socket to a local address.
1216  */
1217 static int
1218 sockevent_bind(sockid_t id, const struct sockaddr * __restrict addr,
1219 	socklen_t addr_len, endpoint_t user_endpt,
1220 	const struct sockdriver_call * __restrict call)
1221 {
1222 	struct sock *sock;
1223 	int r;
1224 
1225 	if ((sock = sockhash_get(id)) == NULL)
1226 		return EINVAL;
1227 
1228 	if (sock->sock_ops->sop_bind == NULL)
1229 		return EOPNOTSUPP;
1230 
1231 	/* Binding a socket in listening mode is never supported. */
1232 	if (sock->sock_opt & SO_ACCEPTCONN)
1233 		return EINVAL;
1234 
1235 	r = sock->sock_ops->sop_bind(sock, addr, addr_len, user_endpt);
1236 
1237 	if (r == SUSPEND) {
1238 		if (call == NULL)
1239 			return EINPROGRESS;
1240 
1241 		sockevent_suspend(sock, SEV_BIND, call, user_endpt);
1242 	}
1243 
1244 	return r;
1245 }
1246 
1247 /*
1248  * Connect a socket to a remote address.
1249  */
1250 static int
1251 sockevent_connect(sockid_t id, const struct sockaddr * __restrict addr,
1252 	socklen_t addr_len, endpoint_t user_endpt,
1253 	const struct sockdriver_call * call)
1254 {
1255 	struct sockdriver_call fakecall;
1256 	struct sockevent_proc *spr;
1257 	struct sock *sock;
1258 	int r;
1259 
1260 	if ((sock = sockhash_get(id)) == NULL)
1261 		return EINVAL;
1262 
1263 	if (sock->sock_ops->sop_connect == NULL)
1264 		return EOPNOTSUPP;
1265 
1266 	/* Connecting a socket in listening mode is never supported. */
1267 	if (sock->sock_opt & SO_ACCEPTCONN)
1268 		return EOPNOTSUPP;
1269 
1270 	/*
1271 	 * The upcoming connect call may fire an accept event for which the
1272 	 * handler may in turn fire a connect event on this socket.  Since we
1273 	 * delay event processing until after processing calls, this would
1274 	 * create the problem that even if the connection is accepted right
1275 	 * away, non-blocking connect requests would return EINPROGRESS.  For
1276 	 * UDS, this is undesirable behavior.  To remedy this, we use a hack:
1277 	 * we temporarily suspend the connect even if non-blocking, then
1278 	 * process events, and then cancel the connect request again.  If the
1279 	 * connection was accepted immediately, the cancellation will have no
1280 	 * effect, since the request has already been replied to.  In order not
1281 	 * to violate libsockdriver rules with this hack, we fabricate a fake
1282 	 * 'call' object.
1283 	 */
1284 	r = sock->sock_ops->sop_connect(sock, addr, addr_len, user_endpt);
1285 
1286 	if (r == SUSPEND) {
1287 		if (call != NULL || sockevent_has_events()) {
1288 			if (call == NULL) {
1289 				fakecall.sc_endpt = NONE;
1290 
1291 				call = &fakecall;
1292 			}
1293 
1294 			assert(!sockevent_has_suspended(sock,
1295 			    SEV_SEND | SEV_RECV));
1296 
1297 			sockevent_suspend(sock, SEV_CONNECT, call, user_endpt);
1298 
1299 			if (call == &fakecall) {
1300 				/* Process any pending events first now. */
1301 				sockevent_pump();
1302 
1303 				/*
1304 				 * If the connect request has not been resumed
1305 				 * yet now, we must remove it from the queue
1306 				 * again, and return EINPROGRESS ourselves.
1307 				 * Otherwise, return OK or a pending error.
1308 				 */
1309 				spr = sockevent_unsuspend(sock, call);
1310 				if (spr != NULL) {
1311 					sockevent_proc_free(spr);
1312 
1313 					r = EINPROGRESS;
1314 				} else if ((r = sock->sock_err) != OK)
1315 					sock->sock_err = OK;
1316 			}
1317 		} else
1318 			r = EINPROGRESS;
1319 	}
1320 
1321 	if (r == OK) {
1322 		/*
1323 		 * A completed connection attempt also always implies that the
1324 		 * socket becomes writable.  For convenience we enforce this
1325 		 * rule here, because it is easy to forget.
1326 		 */
1327 		sockevent_raise(sock, SEV_SEND);
1328 	}
1329 
1330 	return r;
1331 }
1332 
1333 /*
1334  * Put a socket in listening mode.
1335  */
1336 static int
1337 sockevent_listen(sockid_t id, int backlog)
1338 {
1339 	struct sock *sock;
1340 	int r;
1341 
1342 	if ((sock = sockhash_get(id)) == NULL)
1343 		return EINVAL;
1344 
1345 	if (sock->sock_ops->sop_listen == NULL)
1346 		return EOPNOTSUPP;
1347 
1348 	/*
1349 	 * Perform a general adjustment on the backlog value, applying the
1350 	 * customary BSD "fudge factor" of 1.5x.  Keep the value within bounds
1351 	 * though.  POSIX imposes that a negative backlog value is equal to a
1352 	 * backlog value of zero.  A backlog value of zero, in turn, may mean
1353 	 * anything; we take it to be one.  POSIX also imposes that all socket
1354 	 * drivers accept up to at least SOMAXCONN connections on the queue.
1355 	 */
1356 	if (backlog < 0)
1357 		backlog = 0;
1358 	if (backlog < SOMAXCONN)
1359 		backlog += 1 + ((unsigned int)backlog >> 1);
1360 	if (backlog > SOMAXCONN)
1361 		backlog = SOMAXCONN;
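	/*
	 * As a worked example of the above adjustment: a backlog of 0 becomes
	 * 1, a backlog of 4 becomes 4 + 1 + 2 = 7, and any result beyond
	 * SOMAXCONN is capped at SOMAXCONN.
	 */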
1362 
1363 	r = sock->sock_ops->sop_listen(sock, backlog);
1364 
1365 	/*
1366 	 * On success, the socket is now in listening mode.  As part of that,
1367 	 * a select(2) ready-to-read condition now indicates that a connection
1368 	 * may be accepted on the socket, rather than that data may be read.
1369 	 * Since libsockevent is responsible for this distinction, we keep
1370 	 * track of the listening mode at this level.  Conveniently, there is a
1371 	 * socket option for this, which we support out of the box as a result.
1372 	 */
1373 	if (r == OK) {
1374 		sock->sock_opt |= SO_ACCEPTCONN;
1375 
1376 		/*
1377 		 * For the extremely unlikely case that right after the socket
1378 		 * is put into listening mode, it has a connection ready to
1379 		 * accept, we retest blocked ready-to-read select queries now.
1380 		 */
1381 		sockevent_raise(sock, SEV_ACCEPT);
1382 	}
1383 
1384 	return r;
1385 }
1386 
1387 /*
1388  * Accept a connection on a listening socket, creating a new socket.
1389  */
1390 static sockid_t
1391 sockevent_accept(sockid_t id, struct sockaddr * __restrict addr,
1392 	socklen_t * __restrict addr_len, endpoint_t user_endpt,
1393 	const struct sockdriver_call * __restrict call)
1394 {
1395 	struct sock *sock, *newsock;
1396 	sockid_t r;
1397 
1398 	if ((sock = sockhash_get(id)) == NULL)
1399 		return EINVAL;
1400 
1401 	if (sock->sock_ops->sop_accept == NULL)
1402 		return EOPNOTSUPP;
1403 
1404 	/*
1405 	 * Attempt to accept a connection.  The socket driver is responsible
1406 	 * for allocating a sock object (and identifier) on success.  It may
1407 	 * already have done so before, in which case it should leave newsock
1408 	 * filled with NULL; otherwise, the returned sock object is cloned from
1409 	 * the listening socket.  The socket driver is also responsible for
1410 	 * failing the call if the socket is not in listening mode, because it
1411 	 * must specify the error to return: EOPNOTSUPP or EINVAL.
1412 	 */
1413 	newsock = NULL;
1414 
1415 	if ((r = sock->sock_ops->sop_accept(sock, addr, addr_len, user_endpt,
1416 	    &newsock)) == SUSPEND) {
1417 		assert(sock->sock_opt & SO_ACCEPTCONN);
1418 
1419 		if (call == NULL)
1420 			return EWOULDBLOCK;
1421 
1422 		sockevent_suspend(sock, SEV_ACCEPT, call, user_endpt);
1423 
1424 		return SUSPEND;
1425 	}
1426 
1427 	if (r >= 0)
1428 		sockevent_accepted(sock, newsock, r);
1429 
1430 	return r;
1431 }
1432 
1433 /*
1434  * Send regular and/or control data.
1435  */
1436 static int
1437 sockevent_send(sockid_t id, const struct sockdriver_data * __restrict data,
1438 	size_t len, const struct sockdriver_data * __restrict ctl_data,
1439 	socklen_t ctl_len, const struct sockaddr * __restrict addr,
1440 	socklen_t addr_len, endpoint_t user_endpt, int flags,
1441 	const struct sockdriver_call * __restrict call)
1442 {
1443 	struct sock *sock;
1444 	clock_t time;
1445 	size_t min, off;
1446 	socklen_t ctl_off;
1447 	int r, timer;
1448 
1449 	if ((sock = sockhash_get(id)) == NULL)
1450 		return EINVAL;
1451 
1452 	/*
1453 	 * The order of the following checks is not necessarily fixed, and may
1454 	 * be changed later.  As far as applicable, they should match the order
1455 	 * of the checks during call resumption, though.
1456 	 */
1457 	if ((r = sock->sock_err) != OK) {
1458 		sock->sock_err = OK;
1459 
1460 		return r;
1461 	}
1462 
1463 	if (sock->sock_flags & SFL_SHUT_WR) {
1464 		sockevent_sigpipe(sock, user_endpt, flags);
1465 
1466 		return EPIPE;
1467 	}
1468 
1469 	/*
1470 	 * Translate the sticky SO_DONTROUTE option to a per-request
1471 	 * MSG_DONTROUTE flag.  This achieves two purposes: socket drivers have
1472 	 * to check only one flag, and socket drivers that do not support the
1473 	 * flag will fail send requests in a consistent way.
1474 	 */
1475 	if (sock->sock_opt & SO_DONTROUTE)
1476 		flags |= MSG_DONTROUTE;
1477 
1478 	/*
1479 	 * Check if this is a valid send request as far as the socket driver is
1480 	 * concerned.  We do this separately from sop_send for the reason that
1481 	 * this send request may immediately be queued behind other pending
1482 	 * send requests (without a call to sop_send), which means even invalid
1483 	 * requests would be queued and not return failure until much later.
1484 	 */
1485 	if (sock->sock_ops->sop_pre_send != NULL &&
1486 	    (r = sock->sock_ops->sop_pre_send(sock, len, ctl_len, addr,
1487 	    addr_len, user_endpt,
1488 	    flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK)
1489 		return r;
1490 
1491 	if (sock->sock_ops->sop_send == NULL)
1492 		return EOPNOTSUPP;
1493 
1494 	off = 0;
1495 	ctl_off = 0;
1496 
1497 	/*
1498 	 * Sending out-of-band data is treated differently from regular data:
1499 	 *
1500 	 * - sop_send is called immediately, even if a partial non-OOB send
1501 	 *   operation is currently suspended (TODO: it may have to be aborted
1502 	 *   in order to maintain atomicity guarantees - that should be easy);
1503 	 * - sop_send must not return SUSPEND; instead, if it cannot process
1504 	 *   the OOB data immediately, it must return an appropriate error;
1505 	 * - the send low watermark is ignored.
1506 	 *
1507 	 * Given that none of the current socket drivers support OOB data at
1508 	 * all, more sophisticated approaches would have no added value now.
1509 	 */
1510 	if (flags & MSG_OOB) {
1511 		r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data,
1512 		    ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, 0);
1513 
1514 		if (r == SUSPEND)
1515 			panic("libsockevent: MSG_OOB send calls may not be "
1516 			    "suspended");
1517 
1518 		return (r == OK) ? (int)off : r;
1519 	}
1520 
1521 	/*
1522 	 * Only call the actual sop_send function now if no other send calls
1523 	 * are suspended already.
1524 	 *
1525 	 * Call sop_send with 'min' set to the minimum of the request size and
1526 	 * the socket's send low water mark.  For stream-oriented sockets, this
1527 	 * should have the effect that non-blocking calls fail with EWOULDBLOCK
1528 	 * if not at least that much can be sent immediately.  For consistency,
1529 	 * we choose to apply the same threshold to blocking calls.  For
1530 	 * datagram-oriented sockets, the minimum is not a factor to be
1531 	 * considered.
1532 	 */
1533 	if (!sockevent_has_suspended(sock, SEV_SEND)) {
1534 		min = sock->sock_slowat;
1535 		if (min > len)
1536 			min = len;
1537 
1538 		r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data,
1539 		    ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, min);
1540 	} else
1541 		r = SUSPEND;
1542 
1543 	if (r == SUSPEND) {
1544 		/*
1545 		 * We do not store the target's address on suspension, because
1546 		 * that would add significantly to the per-process suspension
1547 		 * state.  As a result, we disallow socket drivers from
1548 		 * suspending send calls with addresses, because we would no
1549 		 * longer have the address for proper call resumption.
1550 		 * However, we do not know here whether the socket is in
1551 		 * connection-oriented mode; if it is, the address is to be
1552 		 * ignored altogether.  Therefore, there is no test on 'addr'
1553 		 * here.  Resumed calls will get a NULL address pointer, and
1554 		 * the socket driver is expected to do the right thing.
1555 		 */
1556 
1557 		/*
1558 		 * For non-blocking socket calls, return an error only if we
1559 		 * were not able to send anything at all.  If only control data
1560 		 * were sent, the return value is therefore zero.
1561 		 */
1562 		if (call != NULL) {
1563 			if (sock->sock_stimeo != 0) {
1564 				timer = TRUE;
1565 				time = socktimer_add(sock, sock->sock_stimeo);
1566 			} else {
1567 				timer = FALSE;
1568 				time = 0;
1569 			}
1570 
1571 			sockevent_suspend_data(sock, SEV_SEND, timer, call,
1572 			    user_endpt, data, len, off, ctl_data, ctl_len,
1573 			    ctl_off, flags, 0, time);
1574 		} else
1575 			r = (off > 0 || ctl_off > 0) ? OK : EWOULDBLOCK;
1576 	} else if (r == EPIPE)
1577 		sockevent_sigpipe(sock, user_endpt, flags);
1578 
1579 	return (r == OK) ? (int)off : r;
1580 }
1581 
1582 /*
1583  * The inner part of the receive request handler.  An error returned from here
1584  * may be overridden by an error pending on the socket, although data returned
1585  * from here trumps such pending errors.
1586  */
1587 static int
1588 sockevent_recv_inner(struct sock * sock,
1589 	const struct sockdriver_data * __restrict data,
1590 	size_t len, size_t * __restrict off,
1591 	const struct sockdriver_data * __restrict ctl_data,
1592 	socklen_t ctl_len, socklen_t * __restrict ctl_off,
1593 	struct sockaddr * __restrict addr,
1594 	socklen_t * __restrict addr_len, endpoint_t user_endpt,
1595 	int * __restrict flags, const struct sockdriver_call * __restrict call)
1596 {
1597 	clock_t time;
1598 	size_t min;
1599 	int r, oob, inflags, timer;
1600 
1601 	/*
1602 	 * Check if this is a valid receive request as far as the socket driver
1603 	 * is concerned.  We do this separately from sop_recv for the reason
1604 	 * that this receive request may immediately be queued behind other
1605 	 * pending receive requests (without a call to sop_recv), which means
1606 	 * even invalid requests would be queued and not return failure until
1607 	 * much later.
1608 	 */
1609 	inflags = *flags;
1610 	*flags = 0;
1611 
1612 	if (sock->sock_ops->sop_pre_recv != NULL &&
1613 	    (r = sock->sock_ops->sop_pre_recv(sock, user_endpt,
1614 	    inflags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK)
1615 		return r;
1616 
1617 	/*
1618 	 * The order of the following checks is not necessarily fixed, and may
1619 	 * be changed later.  As far as applicable, they should match the order
1620 	 * of the checks during call resumption, though.
1621 	 */
1622 	if (sock->sock_flags & SFL_SHUT_RD)
1623 		return SOCKEVENT_EOF;
1624 
1625 	if (sock->sock_ops->sop_recv == NULL)
1626 		return EOPNOTSUPP;
1627 
1628 	/*
1629 	 * Receiving out-of-band data is treated differently from regular data:
1630 	 *
1631 	 * - sop_recv is called immediately, even if a partial non-OOB receive
1632 	 *   operation is currently suspended (TODO: it may have to be aborted
1633 	 *   in order to maintain atomicity guarantees - that should be easy);
1634 	 * - sop_recv must not return SUSPEND; instead, if it cannot return any
1635 	 *   of the OOB data immediately, it must return an appropriate error;
1636 	 * - the receive low watermark is ignored.
1637 	 *
1638 	 * Given that none of the current socket drivers support OOB data at
1639 	 * all, more sophisticated approaches would have no added value now.
1640 	 */
1641 	oob = (inflags & MSG_OOB);
1642 
1643 	if (oob && (sock->sock_opt & SO_OOBINLINE))
1644 		return EINVAL;
1645 
1646 	/*
1647 	 * Only call the actual sop_recv function now if no other receive
1648 	 * calls are suspended already.
1649 	 *
1650 	 * Call sop_recv with 'min' set to the minimum of the request size and
1651 	 * the socket's receive low water mark, unless there is a pending
1652 	 * error.  As a result, blocking calls will block, and non-blocking
1653 	 * calls will yield EWOULDBLOCK, if not at least that much can be received,
1654 	 * unless another condition (EOF or that pending error) prevents more
1655 	 * from being received anyway.  For datagram-oriented sockets, the
1656 	 * minimum is not a factor to be considered.
1657 	 */
1658 	if (oob || !sockevent_has_suspended(sock, SEV_RECV)) {
1659 		if (!oob && sock->sock_err == OK) {
1660 			min = sock->sock_rlowat;
1661 			if (min > len)
1662 				min = len;
1663 		} else
1664 			min = 0; /* receive even no-data segments */
1665 
1666 		r = sock->sock_ops->sop_recv(sock, data, len, off, ctl_data,
1667 		    ctl_len, ctl_off, addr, addr_len, user_endpt, inflags, min,
1668 		    flags);
1669 	} else
1670 		r = SUSPEND;
1671 
1672 	assert(r <= 0 || r == SOCKEVENT_EOF);
1673 
1674 	if (r == SUSPEND) {
1675 		if (oob)
1676 			panic("libsockevent: MSG_OOB receive calls may not be "
1677 			    "suspended");
1678 
1679 		/*
1680 		 * For non-blocking socket calls, return EWOULDBLOCK only if we
1681 		 * did not receive anything at all.  If only control data were
1682 		 * received, the return value is therefore zero.  Suspension
1683 		 * implies that there is nothing to read.  For the purpose of
1684 		 * the calling wrapper function, never suspend a call when
1685 		 * there is a pending error.
1686 		 */
1687 		if (call != NULL && sock->sock_err == OK) {
1688 			if (sock->sock_rtimeo != 0) {
1689 				timer = TRUE;
1690 				time = socktimer_add(sock, sock->sock_rtimeo);
1691 			} else {
1692 				timer = FALSE;
1693 				time = 0;
1694 			}
1695 
1696 			sockevent_suspend_data(sock, SEV_RECV, timer, call,
1697 			    user_endpt, data, len, *off, ctl_data,
1698 			    ctl_len, *ctl_off, inflags, *flags, time);
1699 		} else
1700 			r = EWOULDBLOCK;
1701 	}
1702 
1703 	return r;
1704 }
1705 
1706 /*
1707  * Receive regular and/or control data.
1708  */
1709 static int
1710 sockevent_recv(sockid_t id, const struct sockdriver_data * __restrict data,
1711 	size_t len, const struct sockdriver_data * __restrict ctl_data,
1712 	socklen_t * __restrict ctl_len, struct sockaddr * __restrict addr,
1713 	socklen_t * __restrict addr_len, endpoint_t user_endpt,
1714 	int * __restrict flags, const struct sockdriver_call * __restrict call)
1715 {
1716 	struct sock *sock;
1717 	size_t off;
1718 	socklen_t ctl_inlen;
1719 	int r;
1720 
1721 	if ((sock = sockhash_get(id)) == NULL)
1722 		return EINVAL;
1723 
1724 	/*
1725 	 * This function is a wrapper around the actual receive functionality.
1726 	 * The reason for this is that receiving data should take precedence
1727 	 * over a pending socket error, while a pending socket error should
1728 	 * take precedence over both regular errors and EOF.  In other
1729 	 * words: if there is a pending error, we must try to receive anything
1730 	 * at all; if receiving does not work, we must fail the call with the
1731 	 * pending error.  However, until we call the receive callback, we have
1732 	 * no way of telling whether any data can be received.  So we must try
1733 	 * that before we can decide whether to return a pending error.
1734 	 */
1735 	off = 0;
1736 	ctl_inlen = *ctl_len;
1737 	*ctl_len = 0;
1738 
1739 	/*
1740 	 * Attempt to perform the actual receive call.
1741 	 */
1742 	r = sockevent_recv_inner(sock, data, len, &off, ctl_data, ctl_inlen,
1743 	    ctl_len, addr, addr_len, user_endpt, flags, call);
1744 
1745 	/*
1746 	 * If the receive request succeeded, or it failed but yielded a partial
1747 	 * result, then return the (partial) result.  Otherwise, if an error is
1748 	 * pending, return that error.  Otherwise, return either a regular
1749 	 * error or 0 for EOF.
1750 	 */
1751 	if (r == OK || (r != SUSPEND && (off > 0 || *ctl_len > 0)))
1752 		r = (int)off;
1753 	else if (sock->sock_err != OK) {
1754 		assert(r != SUSPEND);
1755 
1756 		r = sock->sock_err;
1757 
1758 		sock->sock_err = OK;
1759 	} else if (r == SOCKEVENT_EOF)
1760 		r = 0;
1761 
1762 	return r;
1763 }
1764 
1765 /*
1766  * Process an I/O control call.
1767  */
1768 static int
1769 sockevent_ioctl(sockid_t id, unsigned long request,
1770 	const struct sockdriver_data * __restrict data, endpoint_t user_endpt,
1771 	const struct sockdriver_call * __restrict call __unused)
1772 {
1773 	struct sock *sock;
1774 	size_t size;
1775 	int r, val;
1776 
1777 	if ((sock = sockhash_get(id)) == NULL)
1778 		return EINVAL;
1779 
1780 	/* We handle a very small subset of generic IOCTLs here. */
1781 	switch (request) {
1782 	case FIONREAD:
1783 		size = 0;
1784 		if (!(sock->sock_flags & SFL_SHUT_RD) &&
1785 		    sock->sock_ops->sop_test_recv != NULL)
1786 			(void)sock->sock_ops->sop_test_recv(sock, 0, &size);
1787 
1788 		val = (int)size;
1789 
1790 		return sockdriver_copyout(data, 0, &val, sizeof(val));
1791 	}
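
	/*
	 * For illustration (a sketch, not code in this library): the value
	 * copied out above is what a user process obtains through the
	 * standard interface, e.g.:
	 *
	 *	int avail;
	 *	if (ioctl(fd, FIONREAD, &avail) == 0)
	 *		printf("%d bytes available\n", avail);
	 */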
1792 
1793 	if (sock->sock_ops->sop_ioctl == NULL)
1794 		return ENOTTY;
1795 
1796 	r = sock->sock_ops->sop_ioctl(sock, request, data, user_endpt);
1797 
1798 	/*
1799 	 * Suspending IOCTL requests is not currently supported by this
1800 	 * library, even though the VFS protocol and libsockdriver do support
1801 	 * it.  The reason is that IOCTLs do not match our process suspension
1802 	 * model: they can be neither queued nor repeated.  For now, it seems
1803 	 * that this feature is not needed by the socket drivers either.  Thus,
1804 	 * even though there are possible solutions, we defer implementing them
1805 	 * until we know what exactly is needed.
1806 	 */
1807 	if (r == SUSPEND)
1808 		panic("libsockevent: socket driver suspended IOCTL 0x%lx",
1809 		    request);
1810 
1811 	return r;
1812 }
1813 
1814 /*
1815  * Set socket options.
1816  */
1817 static int
1818 sockevent_setsockopt(sockid_t id, int level, int name,
1819 	const struct sockdriver_data * data, socklen_t len)
1820 {
1821 	struct sock *sock;
1822 	struct linger linger;
1823 	struct timeval tv;
1824 	clock_t secs, ticks;
1825 	int r, val;
1826 
1827 	if ((sock = sockhash_get(id)) == NULL)
1828 		return EINVAL;
1829 
1830 	if (level == SOL_SOCKET) {
1831 		/*
1832 		 * Handle a subset of the socket-level options here.  For most
1833 		 * of them, this means that the socket driver itself need not
1834 		 * handle changing or returning the options, but still needs to
1835 		 * implement the correct behavior based on them where needed.
1836 		 * A few of them are handled exclusively in this library:
1837 		 * SO_ACCEPTCONN, SO_NOSIGPIPE, SO_ERROR, SO_TYPE, SO_LINGER,
1838 		 * SO_SNDLOWAT, SO_RCVLOWAT, SO_SNDTIMEO, and SO_RCVTIMEO.
1839 		 * The SO_USELOOPBACK option is explicitly absent, as it is
1840 		 * valid for routing sockets only and is set by default there.
1841 		 */
1842 		switch (name) {
1843 		case SO_DEBUG:
1844 		case SO_REUSEADDR:
1845 		case SO_KEEPALIVE:
1846 		case SO_DONTROUTE:
1847 		case SO_BROADCAST:
1848 		case SO_OOBINLINE:
1849 		case SO_REUSEPORT:
1850 		case SO_NOSIGPIPE:
1851 		case SO_TIMESTAMP:
1852 			/*
1853 			 * Simple on-off options.  Changing them does not
1854 			 * involve the socket driver.
1855 			 */
1856 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
1857 			    len)) != OK)
1858 				return r;
1859 
1860 			if (val)
1861 				sock->sock_opt |= (unsigned int)name;
1862 			else
1863 				sock->sock_opt &= ~(unsigned int)name;
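
			/*
			 * Storing the option name directly as a bit in
			 * 'sock_opt' above assumes that the SOL_SOCKET
			 * option names are distinct single-bit flags, as in
			 * NetBSD-derived headers (e.g., SO_DEBUG is 0x0001
			 * and SO_KEEPALIVE is 0x0008), so that multiple
			 * options can be OR'ed together.
			 */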
1864 
1865 			/*
1866 			 * In principle these on-off options are maintained in
1867 			 * this library, but some socket drivers may need to
1868 			 * apply the options elsewhere, so we notify them that
1869 			 * something has changed.  Using the sop_setsockopt
1870 			 * callback for this would be inconvenient, for two
1871 			 * reasons: multiple value copy-ins and default errors.
1872 			 */
1873 			if (sock->sock_ops->sop_setsockmask != NULL)
1874 				sock->sock_ops->sop_setsockmask(sock,
1875 				    sock->sock_opt);
1876 
1877 			/*
1878 			 * The inlining of OOB data may make new data available
1879 			 * through regular receive calls.  Thus, see if we can
1880 			 * wake up any suspended receive calls now.
1881 			 */
1882 			if (name == SO_OOBINLINE && val)
1883 				sockevent_raise(sock, SEV_RECV);
1884 
1885 			return OK;
1886 
1887 		case SO_LINGER:
1888 			/* The only on-off option with an associated value. */
1889 			if ((r = sockdriver_copyin_opt(data, &linger,
1890 			    sizeof(linger), len)) != OK)
1891 				return r;
1892 
1893 			if (linger.l_onoff) {
1894 				if (linger.l_linger < 0)
1895 					return EINVAL;
1896 				/* EDOM is the closest applicable error. */
1897 				secs = (clock_t)linger.l_linger;
1898 				if (secs >= TMRDIFF_MAX / sys_hz())
1899 					return EDOM;
1900 
1901 				sock->sock_opt |= SO_LINGER;
1902 				sock->sock_linger = secs * sys_hz();
1903 			} else {
1904 				sock->sock_opt &= ~SO_LINGER;
1905 				sock->sock_linger = 0;
1906 			}
1907 
1908 			return OK;
1909 
1910 		case SO_SNDLOWAT:
1911 		case SO_RCVLOWAT:
1912 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
1913 			    len)) != OK)
1914 				return r;
1915 
1916 			if (val <= 0)
1917 				return EINVAL;
1918 
1919 			/*
1920 			 * Setting these values may allow suspended operations
1921 			 * (send, recv, select) to be resumed, so recheck.
1922 			 */
1923 			if (name == SO_SNDLOWAT) {
1924 				sock->sock_slowat = (size_t)val;
1925 
1926 				sockevent_raise(sock, SEV_SEND);
1927 			} else {
1928 				sock->sock_rlowat = (size_t)val;
1929 
1930 				sockevent_raise(sock, SEV_RECV);
1931 			}
1932 
1933 			return OK;
1934 
1935 		case SO_SNDTIMEO:
1936 		case SO_RCVTIMEO:
1937 			if ((r = sockdriver_copyin_opt(data, &tv, sizeof(tv),
1938 			    len)) != OK)
1939 				return r;
1940 
1941 			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
1942 			    (unsigned long)tv.tv_usec >= US)
1943 				return EINVAL;
1944 			if (tv.tv_sec >= TMRDIFF_MAX / sys_hz())
1945 				return EDOM;
1946 
1947 			ticks = tv.tv_sec * sys_hz() +
1948 			    (tv.tv_usec * sys_hz() + US - 1) / US;
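
			/*
			 * Worked example (assuming sys_hz() returns 100):
			 * for tv = { 2, 500000 }, this computes 2 * 100 +
			 * (500000 * 100 + 999999) / 1000000 = 250 ticks.
			 * The "+ US - 1" rounds upward, so any nonzero
			 * microsecond value costs at least one full tick.
			 */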
1949 
1950 			if (name == SO_SNDTIMEO)
1951 				sock->sock_stimeo = ticks;
1952 			else
1953 				sock->sock_rtimeo = ticks;
1954 
1955 			/*
1956 			 * The timeouts for any calls already in progress for
1957 			 * this socket are left as is.
1958 			 */
1959 			return OK;
1960 
1961 		case SO_ACCEPTCONN:
1962 		case SO_ERROR:
1963 		case SO_TYPE:
1964 			/* These options may be retrieved but not set. */
1965 			return ENOPROTOOPT;
1966 
1967 		default:
1968 			/*
1969 			 * The remaining options either cannot be handled in a
1970 			 * generic way, or are not recognized altogether.  Pass
1971 			 * them to the socket driver, which should handle what
1972 			 * it knows and reject the rest.
1973 			 */
1974 			break;
1975 		}
1976 	}
1977 
1978 	if (sock->sock_ops->sop_setsockopt == NULL)
1979 		return ENOPROTOOPT;
1980 
1981 	/*
1982 	 * The socket driver must return ENOPROTOOPT for all options it does
1983 	 * not recognize.
1984 	 */
1985 	return sock->sock_ops->sop_setsockopt(sock, level, name, data, len);
1986 }
1987 
1988 /*
1989  * Retrieve socket options.
1990  */
1991 static int
1992 sockevent_getsockopt(sockid_t id, int level, int name,
1993 	const struct sockdriver_data * __restrict data,
1994 	socklen_t * __restrict len)
1995 {
1996 	struct sock *sock;
1997 	struct linger linger;
1998 	struct timeval tv;
1999 	clock_t ticks;
2000 	int val;
2001 
2002 	if ((sock = sockhash_get(id)) == NULL)
2003 		return EINVAL;
2004 
2005 	if (level == SOL_SOCKET) {
2006 		/*
2007 		 * As with setting, handle a subset of the socket-level options
2008 		 * here.  The rest is to be taken care of by the socket driver.
2009 		 */
2010 		switch (name) {
2011 		case SO_DEBUG:
2012 		case SO_ACCEPTCONN:
2013 		case SO_REUSEADDR:
2014 		case SO_KEEPALIVE:
2015 		case SO_DONTROUTE:
2016 		case SO_BROADCAST:
2017 		case SO_OOBINLINE:
2018 		case SO_REUSEPORT:
2019 		case SO_NOSIGPIPE:
2020 		case SO_TIMESTAMP:
2021 			val = !!(sock->sock_opt & (unsigned int)name);
2022 
2023 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2024 			    len);
2025 
2026 		case SO_LINGER:
2027 			linger.l_onoff = !!(sock->sock_opt & SO_LINGER);
2028 			linger.l_linger = sock->sock_linger / sys_hz();
2029 
2030 			return sockdriver_copyout_opt(data, &linger,
2031 			   sizeof(linger), len);
2032 
2033 		case SO_ERROR:
2034 			if ((val = -sock->sock_err) != OK)
2035 				sock->sock_err = OK;
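
			/*
			 * The negation above assumes the convention that
			 * system-level MINIX error codes are negative,
			 * whereas SO_ERROR must report a positive errno
			 * value.  Note that retrieving SO_ERROR also clears
			 * the pending error.
			 */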
2036 
2037 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2038 			    len);
2039 
2040 		case SO_TYPE:
2041 			val = sock->sock_type;
2042 
2043 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2044 			    len);
2045 
2046 		case SO_SNDLOWAT:
2047 			val = (int)sock->sock_slowat;
2048 
2049 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2050 			    len);
2051 
2052 		case SO_RCVLOWAT:
2053 			val = (int)sock->sock_rlowat;
2054 
2055 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2056 			    len);
2057 
2058 		case SO_SNDTIMEO:
2059 		case SO_RCVTIMEO:
2060 			if (name == SO_SNDTIMEO)
2061 				ticks = sock->sock_stimeo;
2062 			else
2063 				ticks = sock->sock_rtimeo;
2064 
2065 			tv.tv_sec = ticks / sys_hz();
2066 			tv.tv_usec = (ticks % sys_hz()) * US / sys_hz();
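
			/*
			 * This is the inverse of the conversion performed in
			 * sockevent_setsockopt(): with sys_hz() at 100, 250
			 * ticks map back to tv = { 2, 500000 }.
			 */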
2067 
2068 			return sockdriver_copyout_opt(data, &tv, sizeof(tv),
2069 			    len);
2070 
2071 		default:
2072 			break;
2073 		}
2074 	}
2075 
2076 	if (sock->sock_ops->sop_getsockopt == NULL)
2077 		return ENOPROTOOPT;
2078 
2079 	/*
2080 	 * The socket driver must return ENOPROTOOPT for all options it does
2081 	 * not recognize.
2082 	 */
2083 	return sock->sock_ops->sop_getsockopt(sock, level, name, data, len);
2084 }
2085 
2086 /*
2087  * Retrieve a socket's local address.
2088  */
2089 static int
2090 sockevent_getsockname(sockid_t id, struct sockaddr * __restrict addr,
2091 	socklen_t * __restrict addr_len)
2092 {
2093 	struct sock *sock;
2094 
2095 	if ((sock = sockhash_get(id)) == NULL)
2096 		return EINVAL;
2097 
2098 	if (sock->sock_ops->sop_getsockname == NULL)
2099 		return EOPNOTSUPP;
2100 
2101 	return sock->sock_ops->sop_getsockname(sock, addr, addr_len);
2102 }
2103 
2104 /*
2105  * Retrieve a socket's remote address.
2106  */
2107 static int
2108 sockevent_getpeername(sockid_t id, struct sockaddr * __restrict addr,
2109 	socklen_t * __restrict addr_len)
2110 {
2111 	struct sock *sock;
2112 
2113 	if ((sock = sockhash_get(id)) == NULL)
2114 		return EINVAL;
2115 
2116 	/* Listening-mode sockets cannot possibly have a peer address. */
2117 	if (sock->sock_opt & SO_ACCEPTCONN)
2118 		return ENOTCONN;
2119 
2120 	if (sock->sock_ops->sop_getpeername == NULL)
2121 		return EOPNOTSUPP;
2122 
2123 	return sock->sock_ops->sop_getpeername(sock, addr, addr_len);
2124 }
2125 
2126 /*
2127  * Mark the socket object as shut down for sending and/or receiving.  The flags
2128  * parameter may be a bitwise-OR'ed combination of SFL_SHUT_RD and SFL_SHUT_WR.
2129  * This function will wake up any suspended requests affected by this change,
2130  * but it will not invoke the sop_shutdown() callback function on the socket.
2131  * The function may in fact be called from sop_shutdown() before completion to
2132 	 * mark the socket as shut down, as reflected by sockevent_is_shutdown().
2133  */
2134 void
2135 sockevent_set_shutdown(struct sock * sock, unsigned int flags)
2136 {
2137 	unsigned int mask;
2138 
2139 	assert(sock->sock_ops != NULL);
2140 	assert(!(flags & ~(SFL_SHUT_RD | SFL_SHUT_WR)));
2141 
2142 	/* Look at the newly set flags only. */
2143 	flags &= ~(unsigned int)sock->sock_flags;
2144 
2145 	if (flags != 0) {
2146 		sock->sock_flags |= flags;
2147 
2148 		/*
2149 		 * Wake up any blocked calls that are affected by the shutdown.
2150 		 * Shutting down listening sockets causes ongoing accept calls
2151 		 * to be rechecked.
2152 		 */
2153 		mask = 0;
2154 		if (flags & SFL_SHUT_RD)
2155 			mask |= SEV_RECV;
2156 		if (flags & SFL_SHUT_WR)
2157 			mask |= SEV_SEND;
2158 		if (sock->sock_opt & SO_ACCEPTCONN)
2159 			mask |= SEV_ACCEPT;
2160 
2161 		assert(mask != 0);
2162 		sockevent_raise(sock, mask);
2163 	}
2164 }
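
/*
 * Illustrative driver-side use (a sketch; the surrounding driver code is
 * hypothetical): a socket driver that learns that its peer will send no more
 * data could call
 *
 *	sockevent_set_shutdown(sock, SFL_SHUT_RD);
 *
 * to wake up any suspended receive calls on the socket without going through
 * its own sop_shutdown() logic.
 */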
2165 
2166 /*
2167  * Shut down socket send and receive operations.
2168  */
2169 static int
2170 sockevent_shutdown(sockid_t id, int how)
2171 {
2172 	struct sock *sock;
2173 	unsigned int flags;
2174 	int r;
2175 
2176 	if ((sock = sockhash_get(id)) == NULL)
2177 		return EINVAL;
2178 
2179 	/* Convert the request to a set of flags. */
2180 	flags = 0;
2181 	if (how == SHUT_RD || how == SHUT_RDWR)
2182 		flags |= SFL_SHUT_RD;
2183 	if (how == SHUT_WR || how == SHUT_RDWR)
2184 		flags |= SFL_SHUT_WR;
2185 
2186 	if (sock->sock_ops->sop_shutdown != NULL)
2187 		r = sock->sock_ops->sop_shutdown(sock, flags);
2188 	else
2189 		r = OK;
2190 
2191 	/* On success, update our internal state as well. */
2192 	if (r == OK)
2193 		sockevent_set_shutdown(sock, flags);
2194 
2195 	return r;
2196 }
2197 
2198 /*
2199  * Close a socket.
2200  */
2201 static int
2202 sockevent_close(sockid_t id, const struct sockdriver_call * call)
2203 {
2204 	struct sock *sock;
2205 	int r, force;
2206 
2207 	if ((sock = sockhash_get(id)) == NULL)
2208 		return EINVAL;
2209 
2210 	assert(sock->sock_proc == NULL);
2211 	sock->sock_select.ss_endpt = NONE;
2212 
2213 	/*
2214 	 * There are several scenarios when it comes to closing sockets.  First
2215 	 * of all, we never actually force the socket driver to close a socket.
2216 	 * The driver may always suspend the close call and take as long as it
2217 	 * wants.  After a suspension, it signals its completion of the close
2218 	 * through the SEV_CLOSE socket event.
2219 	 *
2220 	 * With that said, we offer two levels of urgency regarding the close
2221 	 * request: regular and forced.  The former allows for a graceful
2222 	 * close; the latter urges the socket driver to close the socket as
2223 	 * soon as possible.  A socket that has been requested to be closed
2224 	 * gracefully can, as long as it is still open (i.e., no SEV_CLOSE was
2225 	 * fired yet), later be requested to be closed forcefully.  This is how
2226 	 * SO_LINGER with a nonzero timeout is implemented.  If SO_LINGER is
2227 	 * set with a zero timeout, the socket is force-closed immediately.
2228 	 * Finally, if SO_LINGER is not set, the socket will be closed normally
2229 	 * and never be forced--akin to SO_LINGER with an infinite timeout.
2230 	 *
2231 	 * The return value of the caller's close(2) may only ever be either
2232 	 * OK or EINPROGRESS, to ensure that the caller knows that the file
2233 	 * descriptor is freed up, as per Austin Group Defect #529.  In fact,
2234 	 * EINPROGRESS is to be returned only on signal interruption (i.e.,
2235 	 * cancel).  For that reason, this function only ever returns OK.
2236 	 */
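	/*
	 * For illustration (a sketch of the standard userland API, not code
	 * in this library): a user process requests the immediate force-
	 * close behavior by setting a zero linger timeout before closing:
	 *
	 *	struct linger l = { .l_onoff = 1, .l_linger = 0 };
	 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
	 *	close(fd);
	 */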
2237 	force = ((sock->sock_opt & SO_LINGER) && sock->sock_linger == 0);
2238 
2239 	if (sock->sock_ops->sop_close != NULL)
2240 		r = sock->sock_ops->sop_close(sock, force);
2241 	else
2242 		r = OK;
2243 
2244 	assert(r == OK || r == SUSPEND);
2245 
2246 	if (r == SUSPEND) {
2247 		sock->sock_flags |= SFL_CLOSING;
2248 
2249 		/*
2250 		 * If we were requested to force-close the socket immediately,
2251 		 * but the socket driver needs more time anyway, then tell the
2252 		 * caller that the socket was closed right away.
2253 		 */
2254 		if (force)
2255 			return OK;
2256 
2257 		/*
2258 		 * If we are to force-close the socket only after a specific
2259 		 * linger timeout, set the timer for that now, even if the call
2260 		 * is non-blocking.  This also means that we cannot associate
2261 		 * the linger timeout with the close call.  Instead, we convert
2262 		 * the sock_linger value from a (relative) duration to an
2263 		 * (absolute) timeout time, and use the SFL_CLOSING flag (along
2264 		 * with SFL_TIMER) to tell the difference.  Since the socket is
2265 		 * otherwise unreachable from userland at this point, the
2266 		 * conversion is never visible in any way.
2267 		 *
2268 		 * The socket may already be in the timers list, so we must
2269 		 * always check the SO_LINGER flag before checking sock_linger.
2270 		 *
2271 		 * If SO_LINGER is not set, we must never suspend the call.
2272 		 */
2273 		if (sock->sock_opt & SO_LINGER) {
2274 			sock->sock_linger =
2275 			    socktimer_add(sock, sock->sock_linger);
2276 		} else
2277 			call = NULL;
2278 
2279 		/*
2280 		 * A non-blocking close is completed asynchronously.  The
2281 		 * caller is not told about this with EWOULDBLOCK as usual, for
2282 		 * the reasons mentioned above.
2283 		 */
2284 		if (call != NULL)
2285 			sockevent_suspend(sock, SEV_CLOSE, call, NONE);
2286 		else
2287 			r = OK;
2288 	} else if (r == OK)
2289 		sockevent_free(sock);
2290 
2291 	return r;
2292 }
2293 
2294 /*
2295  * Cancel a suspended send request.
2296  */
2297 static void
2298 sockevent_cancel_send(struct sock * sock, struct sockevent_proc * spr, int err)
2299 {
2300 	int r;
2301 
2302 	/*
2303 	 * If any regular or control data were sent, return the number of data
2304 	 * bytes sent--possibly zero.  Otherwise return the given error code.
2305 	 */
2306 	if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
2307 		r = (int)spr->spr_dataoff;
2308 	else
2309 		r = err;
2310 
2311 	sockdriver_reply_generic(&spr->spr_call, r);
2312 
2313 	/*
2314 	 * In extremely rare circumstances, one send may be queued behind
2315 	 * another send even though the former can actually be sent on the
2316 	 * socket right away.  For this reason, we retry sending when canceling
2317 	 * a send.  We need to do this only when the first send in the queue
2318 	 * was canceled, but multiple blocked sends on a single socket should
2319 	 * be rare anyway.
2320 	 */
2321 	sockevent_raise(sock, SEV_SEND);
2322 }
2323 
2324 /*
2325  * Cancel a suspended receive request.
2326  */
2327 static void
2328 sockevent_cancel_recv(struct sock * sock, struct sockevent_proc * spr, int err)
2329 {
2330 	int r;
2331 
2332 	/*
2333 	 * If any regular or control data were received, return the number of
2334 	 * data bytes received--possibly zero.  Otherwise return the given
2335 	 * error code.
2336 	 */
2337 	if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
2338 		r = (int)spr->spr_dataoff;
2339 	else
2340 		r = err;
2341 
2342 	/*
2343 	 * Also return any flags set for the data received so far, e.g.
2344 	 * MSG_CTRUNC.  Do not return an address: receive calls on unconnected
2345 	 * sockets must never block after receiving some data--instead, they
2346 	 * are supposed to return MSG_TRUNC if not all data were copied out.
2347 	 */
2348 	sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff, NULL, 0,
2349 	    spr->spr_rflags);
2350 
2351 	/*
2352 	 * The same story as for sends (see above) applies to receives,
2353 	 * although this case should be even more rare in practice.
2354 	 */
2355 	sockevent_raise(sock, SEV_RECV);
2356 }
2357 
2358 /*
2359  * Cancel a previous request that may currently be suspended.  The cancel
2360  * operation itself does not have a reply.  Instead, if the given request was
2361  * found to be suspended, that request must be aborted and an appropriate reply
2362  * must be sent for the request.  If no matching request was found, no reply
2363  * must be sent at all.
2364  */
2365 static void
2366 sockevent_cancel(sockid_t id, const struct sockdriver_call * call)
2367 {
2368 	struct sockevent_proc *spr;
2369 	struct sock *sock;
2370 
2371 	/*
2372 	 * Due to asynchronous close(2) operations, not even the sock object
2373 	 * may be found.  In this (entirely legitimate) case, do not send any
2374 	 * reply.
2375 	 */
2376 	if ((sock = sockhash_get(id)) == NULL)
2377 		return;
2378 
2379 	/*
2380 	 * The request may already have completed by the time we receive the
2381 	 * cancel request, in which case we cannot find it.  In this (entirely
2382 	 * legitimate) case, do not send any reply.
2383 	 */
2384 	if ((spr = sockevent_unsuspend(sock, call)) == NULL)
2385 		return;
2386 
2387 	/*
2388 	 * We found the operation.  Cancel it according to its call type.
2389 	 * Then, once fully done with it, free the suspension data structure.
2390 	 *
2391 	 * Note that we have to use the call structure from the suspension data
2392 	 * structure rather than the given 'call' pointer: only the former
2393 	 * includes all the information necessary to resume the request!
2394 	 */
2395 	switch (spr->spr_event) {
2396 	case SEV_BIND:
2397 	case SEV_CONNECT:
2398 		assert(spr->spr_call.sc_endpt != NONE);
2399 
2400 		sockdriver_reply_generic(&spr->spr_call, EINTR);
2401 
2402 		break;
2403 
2404 	case SEV_ACCEPT:
2405 		sockdriver_reply_accept(&spr->spr_call, EINTR, NULL, 0);
2406 
2407 		break;
2408 
2409 	case SEV_SEND:
2410 		sockevent_cancel_send(sock, spr, EINTR);
2411 
2412 		break;
2413 
2414 	case SEV_RECV:
2415 		sockevent_cancel_recv(sock, spr, EINTR);
2416 
2417 		break;
2418 
2419 	case SEV_CLOSE:
2420 		/*
2421 		 * Return EINPROGRESS rather than EINTR, so that the user
2422 		 * process can tell from the close(2) result that the file
2423 		 * descriptor has in fact been closed.
2424 		 */
2425 		sockdriver_reply_generic(&spr->spr_call, EINPROGRESS);
2426 
2427 		/*
2428 		 * Do not free the sock object here: the socket driver will
2429 		 * complete the close in the background, and fire SEV_CLOSE
2430 		 * once it is done.  Only then is the sock object freed.
2431 		 */
2432 		break;
2433 
2434 	default:
2435 		panic("libsockevent: process suspended on unknown event 0x%x",
2436 		    spr->spr_event);
2437 	}
2438 
2439 	sockevent_proc_free(spr);
2440 }
2441 
2442 /*
2443  * Process a select request.
2444  */
2445 static int
2446 sockevent_select(sockid_t id, unsigned int ops,
2447 	const struct sockdriver_select * sel)
2448 {
2449 	struct sock *sock;
2450 	unsigned int r, notify;
2451 
2452 	if ((sock = sockhash_get(id)) == NULL)
2453 		return EINVAL;
2454 
2455 	notify = (ops & SDEV_NOTIFY);
2456 	ops &= (SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR);
2457 
2458 	/*
2459 	 * See if any of the requested select operations can be satisfied
2460 	 * immediately.
2461 	 */
2462 	r = sockevent_test_select(sock, ops);
2463 
2464 	/*
2465 	 * If select operations were pending, the new results must not indicate
2466 	 * that any of those were satisfied, as that would indicate an internal
2467 	 * logic error: the socket driver is supposed to update its state
2468 	 * proactively, and thus, discovering that things have changed here is
2469 	 * not something that should ever happen.
2470 	 */
2471 	assert(!(sock->sock_selops & r));
2472 
2473 	/*
2474 	 * If any select operations are not satisfied immediately, and we are
2475 	 * asked to notify the caller when they are satisfied later, save them
2476 	 * for later retesting.
2477 	 */
2478 	ops &= ~r;
2479 
2480 	if (notify && ops != 0) {
2481 		/*
2482 		 * For now, we support only one caller when it comes to select
2483 		 * queries: VFS.  If we want to support a networked file system
2484 		 * (or so) directly calling select as well, this library will
2485 		 * have to be extended accordingly (should not be too hard).
2486 		 */
2487 		if (sock->sock_select.ss_endpt != NONE) {
2488 			if (sock->sock_select.ss_endpt != sel->ss_endpt) {
2489 				printf("libsockevent: no support for multiple "
2490 				    "select callers yet\n");
2491 
2492 				return EIO;
2493 			}
2494 
2495 			/*
2496 			 * If a select query was already pending for this
2497 			 * caller, we must simply merge in the new operations.
2498 			 */
2499 			sock->sock_selops |= ops;
2500 		} else {
2501 			assert(sel->ss_endpt != NONE);
2502 
2503 			sock->sock_select = *sel;
2504 			sock->sock_selops = ops;
2505 		}
2506 	}
2507 
2508 	return r;
2509 }
2510 
2511 /*
2512  * An alarm has triggered.  Expire any timers.  Socket drivers that do not pass
2513  * clock notification messages to libsockevent must call expire_timers(3)
2514  * themselves instead.
2515  */
2516 static void
2517 sockevent_alarm(clock_t now)
2518 {
2519 
2520 	expire_timers(now);
2521 }
2522 
2523 static const struct sockdriver sockevent_tab = {
2524 	.sdr_socket		= sockevent_socket,
2525 	.sdr_socketpair		= sockevent_socketpair,
2526 	.sdr_bind		= sockevent_bind,
2527 	.sdr_connect		= sockevent_connect,
2528 	.sdr_listen		= sockevent_listen,
2529 	.sdr_accept		= sockevent_accept,
2530 	.sdr_send		= sockevent_send,
2531 	.sdr_recv		= sockevent_recv,
2532 	.sdr_ioctl		= sockevent_ioctl,
2533 	.sdr_setsockopt		= sockevent_setsockopt,
2534 	.sdr_getsockopt		= sockevent_getsockopt,
2535 	.sdr_getsockname	= sockevent_getsockname,
2536 	.sdr_getpeername	= sockevent_getpeername,
2537 	.sdr_shutdown		= sockevent_shutdown,
2538 	.sdr_close		= sockevent_close,
2539 	.sdr_cancel		= sockevent_cancel,
2540 	.sdr_select		= sockevent_select,
2541 	.sdr_alarm		= sockevent_alarm
2542 };
2543 
2544 /*
2545  * Initialize the socket event library.
2546  */
2547 void
2548 sockevent_init(sockevent_socket_cb_t socket_cb)
2549 {
2550 
2551 	sockhash_init();
2552 
2553 	socktimer_init();
2554 
2555 	sockevent_proc_init();
2556 
2557 	SIMPLEQ_INIT(&sockevent_pending);
2558 
2559 	assert(socket_cb != NULL);
2560 	sockevent_socket_cb = socket_cb;
2561 
2562 	/* Announce we are up. */
2563 	sockdriver_announce();
2564 
2565 	sockevent_working = FALSE;
2566 }
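
/*
 * Typical use of this library (an illustrative sketch; 'my_socket_cb' and
 * sef_local_startup() stand in for driver-specific code):
 *
 *	int
 *	main(void)
 *	{
 *		message m;
 *		int r, ipc_status;
 *
 *		sef_local_startup();
 *
 *		sockevent_init(my_socket_cb);
 *
 *		for (;;) {
 *			if ((r = sef_receive_status(ANY, &m,
 *			    &ipc_status)) != OK)
 *				panic("sef_receive_status: %d", r);
 *
 *			sockevent_process(&m, ipc_status);
 *		}
 *	}
 */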
2567 
2568 /*
2569  * Process a socket driver request message.
2570  */
2571 void
2572 sockevent_process(const message * m_ptr, int ipc_status)
2573 {
2574 
2575 	/* Block events until after we have processed the request. */
2576 	assert(!sockevent_working);
2577 	sockevent_working = TRUE;
2578 
2579 	/* Actually process the request. */
2580 	sockdriver_process(&sockevent_tab, m_ptr, ipc_status);
2581 
2582 	/*
2583 	 * If any events were fired while processing the request, they will
2584 	 * have been queued for later.  Go through them now.
2585 	 */
2586 	if (sockevent_has_events())
2587 		sockevent_pump();
2588 
2589 	sockevent_working = FALSE;
2590 }
2591