xref: /minix/minix/lib/libsockdriver/sockdriver.c (revision fb9c64b2)
1 /* The protocol family independent socket driver framework. */
2 /*
3  * The table below lists all supported socket driver requests, along with
4  * information on whether the request handler may suspend the call for later
5  * processing, and which message layout is to be used for the request and reply
6  * messages for each call.
7  *
8  * Type			May suspend	Request	layout	Reply layout
9  * ----			-----------	--------------	------------
10  * SDEV_SOCKET		no		socket		socket_reply
11  * SDEV_SOCKETPAIR	no		socket		socket_reply
12  * SDEV_BIND		yes		addr		reply
13  * SDEV_CONNECT		yes		addr		reply
14  * SDEV_LISTEN		no		simple		reply
15  * SDEV_ACCEPT		yes		addr		accept_reply
16  * SDEV_SEND		yes		sendrecv	reply
17  * SDEV_RECV		yes		sendrecv	recv_reply
18  * SDEV_IOCTL		yes		ioctl		reply
19  * SDEV_SETSOCKOPT	no		getset		reply
20  * SDEV_GETSOCKOPT	no		getset		reply
21  * SDEV_GETSOCKNAME	no		getset		reply
22  * SDEV_GETPEERNAME	no		getset		reply
23  * SDEV_SHUTDOWN	no		simple		reply
24  * SDEV_CLOSE		yes		simple		reply
25  * SDEV_CANCEL		n/a		simple		-
26  * SDEV_SELECT		yes (special)	select		select_reply
27  *
28  * The request message layouts are prefixed with "m_vfs_lsockdriver_".  The
29  * reply message layouts are prefixed with "m_lsockdriver_vfs_", and use
30  * message types of the format SDEV_{,SOCKET_,ACCEPT_,RECV_}REPLY, matching the
31  * listed reply layout.  One exception is SDEV_CANCEL, which itself has no
32  * reply at all.  The other exception is SDEV_SELECT, which has two reply
33  * codes: SDEV_SELECT1_REPLY (for immediate replies) and SDEV_SELECT2_REPLY
34  * (for late replies), both using the select_reply reply layout.
35  */
36 
37 #include <minix/drivers.h>
38 #include <minix/sockdriver.h>
39 #include <sys/ioctl.h>
40 
/*
 * NOTE(review): no function in the part of the file visible here reads or
 * writes this flag; presumably it holds the main loop's "keep running" state
 * -- confirm against the rest of the file.
 */
static int running;
42 
43 /*
44  * Announce that we are up and running, after a fresh start or a restart.
45  */
46 void
47 sockdriver_announce(void)
48 {
49 	static const char *sockdriver_prefix = "drv.sck.";
50 	char key[DS_MAX_KEYLEN], label[DS_MAX_KEYLEN];
51 	int r;
52 
53 	/* Publish a driver up event. */
54 	if ((r = ds_retrieve_label_name(label, sef_self())) != OK)
55 		panic("sockdriver: unable to get own label: %d", r);
56 
57 	snprintf(key, sizeof(key), "%s%s", sockdriver_prefix, label);
58 	if ((r = ds_publish_u32(key, DS_DRIVER_UP, DSF_OVERWRITE)) != OK)
59 		panic("sockdriver: unable to publish driver up event: %d", r);
60 }
61 
62 /*
63  * Copy data from the caller into the local address space.  Return OK or a
64  * negative error code.
65  */
66 int
67 sockdriver_copyin(const struct sockdriver_data * __restrict data, size_t off,
68 	void * __restrict ptr, size_t len)
69 {
70 
71 	assert(data != NULL);
72 	assert(off + len <= data->_sd_len);
73 	assert(data->_sd_endpt != SELF);
74 	assert(GRANT_VALID(data->_sd_grant));
75 
76 	return sys_safecopyfrom(data->_sd_endpt, data->_sd_grant, off,
77 	    (vir_bytes)ptr, len);
78 }
79 
80 /*
81  * Copy data from the local address space to the caller.  Return OK or a
82  * negative error code.
83  */
84 int
85 sockdriver_copyout(const struct sockdriver_data * __restrict data, size_t off,
86 	const void * __restrict ptr, size_t len)
87 {
88 
89 	assert(data != NULL);
90 	assert(off + len <= data->_sd_len);
91 	assert(data->_sd_endpt != SELF);
92 	assert(GRANT_VALID(data->_sd_grant));
93 
94 	return sys_safecopyto(data->_sd_endpt, data->_sd_grant, off,
95 	    (vir_bytes)ptr, len);
96 }
97 
98 /*
99  * Copy data between the caller and the local address space, using a vector of
100  * at most SOCKDRIVER_IOV_MAX buffers.  Return OK or an error code.
101  */
102 static int
103 sockdriver_vcopy(const struct sockdriver_data * __restrict data, size_t off,
104 	const iovec_t * __restrict iov, unsigned int iovcnt, int copyin)
105 {
106 	static struct vscp_vec vec[SOCKDRIVER_IOV_MAX];
107 	unsigned int i;
108 
109 	assert(iov != NULL);
110 	assert(iovcnt <= __arraycount(vec));
111 
112 	/* We allow zero-element vectors, because we are nice. */
113 	if (iovcnt == 0)
114 		return OK;
115 
116 	/*
117 	 * Do not use a vector copy operation for single-element copies, as
118 	 * this saves the kernel from having to copy in the vector itself.
119 	 */
120 	if (iovcnt == 1) {
121 		if (copyin)
122 			return sockdriver_copyin(data, off,
123 			    (void *)iov->iov_addr, iov->iov_size);
124 		else
125 			return sockdriver_copyout(data, off,
126 			    (const void *)iov->iov_addr, iov->iov_size);
127 	}
128 
129 	assert(data != NULL);
130 	assert(data->_sd_endpt != SELF);
131 	assert(GRANT_VALID(data->_sd_grant));
132 
133 	for (i = 0; i < iovcnt; i++, iov++) {
134 		if (copyin) {
135 			vec[i].v_from = data->_sd_endpt;
136 			vec[i].v_to = SELF;
137 		} else {
138 			vec[i].v_from = SELF;
139 			vec[i].v_to = data->_sd_endpt;
140 		}
141 		vec[i].v_gid = data->_sd_grant;
142 		vec[i].v_offset = off;
143 		vec[i].v_addr = iov->iov_addr;
144 		vec[i].v_bytes = iov->iov_size;
145 
146 		off += iov->iov_size;
147 	}
148 
149 	assert(off <= data->_sd_len);
150 
151 	return sys_vsafecopy(vec, iovcnt);
152 }
153 
154 /*
155  * Copy data from the caller into the local address space, using a vector of
156  * buffers.  Return OK or a negative error code.
157  */
158 int
159 sockdriver_vcopyin(const struct sockdriver_data * __restrict data, size_t off,
160 	const iovec_t * __restrict iov, unsigned int iovcnt)
161 {
162 
163 	return sockdriver_vcopy(data, off, iov, iovcnt, TRUE /*copyin*/);
164 }
165 
166 /*
167  * Copy data from the local address space to the caller, using a vector of
168  * buffers.  Return OK or a negative error code.
169  */
170 int
171 sockdriver_vcopyout(const struct sockdriver_data * __restrict data, size_t off,
172 	const iovec_t * __restrict iov, unsigned int iovcnt)
173 {
174 
175 	return sockdriver_vcopy(data, off, iov, iovcnt, FALSE /*copyin*/);
176 }
177 
178 /*
179  * Copy data from the caller into the local address space, using socket option
180  * semantics: fail the call with EINVAL if the given 'optlen' is not equal to
181  * the given 'len'.  Return OK or a negative error code.
182  */
183 int
184 sockdriver_copyin_opt(const struct sockdriver_data * __restrict data,
185 	void * __restrict ptr, size_t len, socklen_t optlen)
186 {
187 
188 	if (len != optlen)
189 		return EINVAL;
190 	else
191 		return sockdriver_copyin(data, 0, ptr, len);
192 }
193 
194 /*
195  * Copy data from the local address space to the caller, using socket option
196  * semantics: limit the size of the copied-out data to the size pointed to by
197  * 'optlen', and return the possibly truncated size in 'optlen' on success.
198  * Return OK or a negative error code.
199  */
200 int
201 sockdriver_copyout_opt(const struct sockdriver_data * __restrict data,
202 	const void * __restrict ptr, size_t len, socklen_t * __restrict optlen)
203 {
204 	int r;
205 
206 	if (len > *optlen)
207 		len = *optlen;
208 
209 	if ((r = sockdriver_copyout(data, 0, ptr, len)) == OK)
210 		*optlen = len;
211 
212 	return r;
213 }
214 
215 /*
216  * Compress a sockdriver_data structure to a smaller variant that stores only
217  * the fields that are not already stored redundantly in/as the given 'call'
218  * and 'len' parameters.  The typical use case here this call suspension.  In
219  * that case, the caller will already store 'call' and 'len' as is, and can
220  * save memory by storing a packed version of 'data' rather than that structure
221  * itself.  Return OK on success, with 'pack' containing a compressed version
222  * of 'data'.  Return EINVAL if the given parameters do not match; this would
223  * typically be a sign that the calling application messed up badly.
224  */
225 int
226 sockdriver_pack_data(struct sockdriver_packed_data * pack,
227 	const struct sockdriver_call * call,
228 	const struct sockdriver_data * data, size_t len)
229 {
230 
231 	if (data->_sd_endpt != call->sc_endpt)
232 		return EINVAL;
233 	if (data->_sd_len != len)
234 		return EINVAL;
235 
236 	pack->_spd_grant = data->_sd_grant;
237 	return OK;
238 }
239 
240 /*
241  * Decompress a previously packed sockdriver data structure into a full
242  * sockdriver_data structure, with the help of the given 'call' and 'len'
243  * parameters.  Return the unpacked version of 'pack' in 'data'.  This function
244  * always succeeds.
245  */
246 void
247 sockdriver_unpack_data(struct sockdriver_data * data,
248 	const struct sockdriver_call * call,
249 	const struct sockdriver_packed_data * pack, size_t len)
250 {
251 
252 	data->_sd_endpt = call->sc_endpt;
253 	data->_sd_grant = pack->_spd_grant;
254 	data->_sd_len = len;
255 }
256 
257 /*
258  * Send a reply to a request.
259  */
260 static void
261 send_reply(endpoint_t endpt, int type, message * m_ptr)
262 {
263 	int r;
264 
265 	m_ptr->m_type = type;
266 
267 	if ((r = asynsend(endpt, m_ptr)) != OK)
268 		printf("sockdriver: sending reply to %d failed (%d)\n",
269 		    endpt, r);
270 }
271 
272 /*
273  * Send a reply which takes only a result code and no additional reply fields.
274  */
275 static void
276 send_generic_reply(endpoint_t endpt, sockreq_t req, int reply)
277 {
278 	message m;
279 
280 	assert(reply != SUSPEND && reply != EDONTREPLY);
281 
282 	memset(&m, 0, sizeof(m));
283 	m.m_lsockdriver_vfs_reply.req_id = req;
284 	m.m_lsockdriver_vfs_reply.status = reply;
285 
286 	send_reply(endpt, SDEV_REPLY, &m);
287 }
288 
289 /*
290  * Send a reply to an earlier suspended request which takes only a result code
291  * and no additional reply fields.
292  */
293 void
294 sockdriver_reply_generic(const struct sockdriver_call * call, int reply)
295 {
296 
297 	send_generic_reply(call->sc_endpt, call->sc_req, reply);
298 }
299 
300 /*
301  * Send a reply to a socket or a socketpair request.  Since these calls may not
302  * be suspended, this function is used internally only.
303  */
304 static void
305 send_socket_reply(endpoint_t endpt, sockreq_t req, sockid_t reply,
306 	sockid_t reply2)
307 {
308 	message m;
309 
310 	assert(reply != SUSPEND && reply != EDONTREPLY);
311 
312 	memset(&m, 0, sizeof(m));
313 	m.m_lsockdriver_vfs_socket_reply.req_id = req;
314 	m.m_lsockdriver_vfs_socket_reply.sock_id = reply;
315 	m.m_lsockdriver_vfs_socket_reply.sock_id2 = reply2;
316 
317 	send_reply(endpt, SDEV_SOCKET_REPLY, &m);
318 }
319 
320 /*
321  * Send a reply to an earlier suspended accept request.  The given reply is
322  * either a socket identifier (>= 0) or an error code (< 0).  On success, an
323  * address must be given as 'addr', and its nonzero length must be given as
324  * 'addr_len'.
325  */
326 void
327 sockdriver_reply_accept(const struct sockdriver_call * __restrict call,
328 	sockid_t reply, struct sockaddr * __restrict addr, socklen_t addr_len)
329 {
330 	sockid_t id;
331 	message m;
332 
333 	assert(reply != SUSPEND && reply != EDONTREPLY);
334 
335 	/*
336 	 * If the accept was successful, copy out the address, if requested.
337 	 * If the copy fails, send both a valid socket ID and an error to VFS.
338 	 * VFS will then close the newly created socket immediately, and return
339 	 * the error to the caller.
340 	 *
341 	 * While not particularly nice, the general behavior of closing the
342 	 * socket after accepting it seems to be common among other OSes for
343 	 * address copy errors.  Most importantly, it frees the socket driver
344 	 * from having to deal with address copy errors itself.
345 	 *
346 	 * Letting VFS close the socket is also not all that great.  However,
347 	 * it is the lesser evil compared to the two main alternatives: 1)
348 	 * immediately calling sdr_close() from here, which would seriously
349 	 * complicate writing socket drivers due to sockets disappearing from
350 	 * under it, so to speak, and 2) queuing a forged incoming SDEV_CLOSE
351 	 * request, for which we do not have the necessary infrastructure.
352 	 * Additionally, VFS may close the newly accepted socket when out of
353 	 * other required resources anyway, so logically this fits in well.
354 	 * The only real price to pay is a slightly uglier message protocol.
355 	 *
356 	 * Copying out the address *length* is not our responsibility at all;
357 	 * if VFS chooses to do this itself (as opposed to letting libc do it),
358 	 * it too will have to close the socket on failure, using a separate
359 	 * close call.  This is always multithreading-safe because userland can
360 	 * not access the accepted socket yet anyway.
361 	 */
362 	if (reply >= 0) {
363 		id = reply;
364 		reply = OK;
365 	} else
366 		id = -1;
367 
368 	if (reply == OK && GRANT_VALID(call->_sc_grant)) {
369 		if (addr == NULL || addr_len == 0)
370 			panic("libsockdriver: success but no address given");
371 
372 		if (addr_len > call->_sc_len)
373 			addr_len = call->_sc_len; /* truncate addr and len */
374 
375 		if (addr_len > 0) {
376 			reply = sys_safecopyto(call->sc_endpt, call->_sc_grant,
377 			    0, (vir_bytes)addr, addr_len);
378 
379 			/* Intentionally leave 'id' set on failure here. */
380 		}
381 	} else
382 		addr_len = 0;	/* not needed, but cleaner */
383 
384 	memset(&m, 0, sizeof(m));
385 	m.m_lsockdriver_vfs_accept_reply.req_id = call->sc_req;
386 	m.m_lsockdriver_vfs_accept_reply.sock_id = id;
387 	m.m_lsockdriver_vfs_accept_reply.status = reply;
388 	m.m_lsockdriver_vfs_accept_reply.len = addr_len;
389 
390 	send_reply(call->sc_endpt, SDEV_ACCEPT_REPLY, &m);
391 }
392 
393 /*
394  * Send a reply to an earlier suspended receive call.  The given reply code is
395  * the number of regular data bytes received (>= 0) or an error code (< 0).
396  * On success, for connectionless sockets, 'addr' must point to the source
397  * address and 'addr_len' must contain the address length; for connection-
398  * oriented sockets, 'addr_len' must be zero, in which case 'addr' is ignored.
399  */
400 void
401 sockdriver_reply_recv(const struct sockdriver_call * __restrict call,
402 	int reply, socklen_t ctl_len, struct sockaddr * __restrict addr,
403 	socklen_t addr_len, int flags)
404 {
405 	message m;
406 	int r;
407 
408 	assert(reply != SUSPEND && reply != EDONTREPLY);
409 
410 	/*
411 	 * If applicable, copy out the address.  If this fails, the result is
412 	 * loss of the data received; in the case of AF_UNIX, this may include
413 	 * references to file descriptors already created in the receiving
414 	 * process.  At least Linux and NetBSD behave this way as well, which
415 	 * is not an excuse to be lazy, but we need to change just about
416 	 * everything for the worse (including having additional grants just
417 	 * for storing lengths) in order to fully solve this corner case.
418 	 *
419 	 * TODO: a reasonable compromise might be to add a callback routine for
420 	 * closing file descriptors in any already-written control data.  This
421 	 * would solve the worst aspect of the data loss, not the loss itself.
422 	 */
423 	if (reply >= 0 && addr_len > 0 && GRANT_VALID(call->_sc_grant)) {
424 		if (addr == NULL)
425 			panic("libsockdriver: success but no address given");
426 
427 		if (addr_len > call->_sc_len)
428 			addr_len = call->_sc_len; /* truncate addr and len */
429 
430 		if (addr_len > 0 && (r = sys_safecopyto(call->sc_endpt,
431 		    call->_sc_grant, 0, (vir_bytes)addr, addr_len)) != OK)
432 			reply = r;
433 	} else
434 		addr_len = 0;
435 
436 	memset(&m, 0, sizeof(m));
437 	m.m_lsockdriver_vfs_recv_reply.req_id = call->sc_req;
438 	m.m_lsockdriver_vfs_recv_reply.status = reply;
439 	m.m_lsockdriver_vfs_recv_reply.ctl_len = ctl_len;
440 	m.m_lsockdriver_vfs_recv_reply.addr_len = addr_len;
441 	m.m_lsockdriver_vfs_recv_reply.flags = flags;
442 
443 	send_reply(call->sc_endpt, SDEV_RECV_REPLY, &m);
444 }
445 
446 /*
447  * Send a reply to a select request.
448  */
449 static void
450 send_select_reply(const struct sockdriver_select * sel, int type, sockid_t id,
451 	int ops)
452 {
453 	message m;
454 
455 	assert(ops != SUSPEND && ops != EDONTREPLY);
456 
457 	memset(&m, 0, sizeof(m));
458 	m.m_lsockdriver_vfs_select_reply.sock_id = id;
459 	m.m_lsockdriver_vfs_select_reply.status = ops;
460 
461 	send_reply(sel->ss_endpt, type, &m);
462 }
463 
464 /*
465  * Send a reply to an earlier select call that requested notifications.
466  */
467 void
468 sockdriver_reply_select(const struct sockdriver_select * sel, sockid_t id,
469 	int ops)
470 {
471 
472 	send_select_reply(sel, SDEV_SELECT2_REPLY, id, ops);
473 }
474 
475 /*
476  * Create a new socket.  This call may not be suspended.
477  */
478 static void
479 do_socket(const struct sockdriver * __restrict sdp,
480 	const message * __restrict m_ptr)
481 {
482 	sockid_t r;
483 
484 	if (sdp->sdr_socket != NULL)
485 		r = sdp->sdr_socket(m_ptr->m_vfs_lsockdriver_socket.domain,
486 		    m_ptr->m_vfs_lsockdriver_socket.type,
487 		    m_ptr->m_vfs_lsockdriver_socket.protocol,
488 		    m_ptr->m_vfs_lsockdriver_socket.user_endpt);
489 	else
490 		r = EOPNOTSUPP;
491 
492 	send_socket_reply(m_ptr->m_source,
493 	    m_ptr->m_vfs_lsockdriver_socket.req_id, r, -1);
494 }
495 
496 /*
497  * Create a pair of connected sockets.  Relevant for UNIX domain sockets only.
498  * This call may not be suspended.
499  */
500 static void
501 do_socketpair(const struct sockdriver * __restrict sdp,
502 	const message * __restrict m_ptr)
503 {
504 	sockid_t sockid[2];
505 	int r;
506 
507 	if (sdp->sdr_socketpair != NULL)
508 		r = sdp->sdr_socketpair(m_ptr->m_vfs_lsockdriver_socket.domain,
509 		    m_ptr->m_vfs_lsockdriver_socket.type,
510 		    m_ptr->m_vfs_lsockdriver_socket.protocol,
511 		    m_ptr->m_vfs_lsockdriver_socket.user_endpt, sockid);
512 	else
513 		r = EOPNOTSUPP;
514 
515 	if (r != OK) {
516 		sockid[0] = r;
517 		sockid[1] = -1;
518 	}
519 
520 	send_socket_reply(m_ptr->m_source,
521 	    m_ptr->m_vfs_lsockdriver_socket.req_id, sockid[0], sockid[1]);
522 }
523 
524 /*
525  * Bind a socket to a local address, or connect a socket to a remote address.
526  * In both cases, this call may be suspended by the socket driver, in which
527  * case sockdriver_reply_generic() must be used to reply later.
528  *
529  * For bind(2), POSIX is not entirely consistent regarding call suspension: the
530  * bind(2) call may return EINPROGRESS for nonblocking sockets, but this also
531  * suggests that blocking bind(2) calls may be interrupted by signals (as on
532  * MINIX3 they can be), yet EINTR is not defined as a valid return code for it.
533  */
534 static void
535 do_bind_connect(const struct sockdriver * __restrict sdp,
536 	const message * __restrict m_ptr)
537 {
538 	int (*proc)(sockid_t, const struct sockaddr * __restrict, socklen_t,
539 	    endpoint_t, const struct sockdriver_call * __restrict);
540 	struct sockdriver_call call;
541 	char buf[SOCKADDR_MAX];
542 	sockid_t id;
543 	cp_grant_id_t grant;
544 	socklen_t len;
545 	endpoint_t user_endpt;
546 	int r, sflags;
547 
548 	call.sc_endpt = m_ptr->m_source;
549 	call.sc_req = m_ptr->m_vfs_lsockdriver_addr.req_id;
550 
551 	id = m_ptr->m_vfs_lsockdriver_addr.sock_id;
552 	grant = m_ptr->m_vfs_lsockdriver_addr.grant;
553 	len = m_ptr->m_vfs_lsockdriver_addr.len;
554 	user_endpt = m_ptr->m_vfs_lsockdriver_addr.user_endpt;
555 	sflags = m_ptr->m_vfs_lsockdriver_addr.sflags;
556 
557 	switch (m_ptr->m_type) {
558 	case SDEV_BIND:		proc = sdp->sdr_bind;		break;
559 	case SDEV_CONNECT:	proc = sdp->sdr_connect;	break;
560 	default:		panic("expected bind or connect");
561 	}
562 
563 	r = OK;
564 	if (!GRANT_VALID(grant) || len == 0 || len > sizeof(buf))
565 		r = EINVAL;
566 	else
567 		r = sys_safecopyfrom(m_ptr->m_source, grant, 0, (vir_bytes)buf,
568 		    len);
569 
570 	if (r == OK) {
571 		if (proc != NULL)
572 			r = proc(id, (struct sockaddr *)buf, len, user_endpt,
573 			    (sflags & SDEV_NONBLOCK) ? NULL : &call);
574 		else
575 			r = EOPNOTSUPP;
576 	}
577 
578 	assert(!(sflags & SDEV_NONBLOCK) || (r != SUSPEND && r != EDONTREPLY));
579 
580 	if (r != SUSPEND && r != EDONTREPLY)
581 		sockdriver_reply_generic(&call, r);
582 }
583 
584 /*
585  * Put a socket in listening mode.  This call may not be suspended.
586  */
587 static void
588 do_listen(const struct sockdriver * __restrict sdp,
589 	const message * __restrict m_ptr)
590 {
591 	int r;
592 
593 	if (sdp->sdr_listen != NULL)
594 		r = sdp->sdr_listen(m_ptr->m_vfs_lsockdriver_simple.sock_id,
595 		    m_ptr->m_vfs_lsockdriver_simple.param /*backlog*/);
596 	else
597 		r = EOPNOTSUPP;
598 
599 	send_generic_reply(m_ptr->m_source,
600 	    m_ptr->m_vfs_lsockdriver_simple.req_id, r);
601 }
602 
603 /*
604  * Accept a connection on a listening socket, creating a new socket.
605  * This call may be suspended by the socket driver, in which case
606  * sockdriver_reply_accept() must be used to reply later.
607  */
608 static void
609 do_accept(const struct sockdriver * __restrict sdp,
610 	const message * __restrict m_ptr)
611 {
612 	struct sockdriver_call call;
613 	char buf[SOCKADDR_MAX];
614 	struct sockaddr *addr;
615 	socklen_t len;
616 	endpoint_t user_endpt;
617 	int sflags;
618 	sockid_t r;
619 
620 	call.sc_endpt = m_ptr->m_source;
621 	call.sc_req = m_ptr->m_vfs_lsockdriver_addr.req_id;
622 	call._sc_grant = m_ptr->m_vfs_lsockdriver_addr.grant;
623 	call._sc_len = m_ptr->m_vfs_lsockdriver_addr.len;
624 
625 	addr = (struct sockaddr *)buf;
626 	len = 0;
627 	user_endpt = m_ptr->m_vfs_lsockdriver_addr.user_endpt;
628 	sflags = m_ptr->m_vfs_lsockdriver_addr.sflags;
629 
630 	if (sdp->sdr_accept != NULL)
631 		r = sdp->sdr_accept(m_ptr->m_vfs_lsockdriver_addr.sock_id,
632 		    addr, &len, user_endpt,
633 		    (sflags & SDEV_NONBLOCK) ? NULL : &call);
634 	else
635 		r = EOPNOTSUPP;
636 
637 	assert(!(sflags & SDEV_NONBLOCK) || (r != SUSPEND && r != EDONTREPLY));
638 
639 	if (r != SUSPEND && r != EDONTREPLY)
640 		sockdriver_reply_accept(&call, r, addr, len);
641 }
642 
643 /*
644  * Send regular and/or control data.  This call may be suspended by the socket
645  * driver, in which case sockdriver_reply_generic() must be used to reply
646  * later.
647  */
648 static void
649 do_send(const struct sockdriver * __restrict sdp,
650 	const message * __restrict m_ptr)
651 {
652 	struct sockdriver_call call;
653 	struct sockdriver_data data, ctl_data;
654 	char buf[SOCKADDR_MAX];
655 	struct sockaddr *addr;
656 	cp_grant_id_t addr_grant;
657 	socklen_t addr_len;
658 	endpoint_t user_endpt;
659 	sockid_t id;
660 	int r, flags;
661 
662 	call.sc_endpt = m_ptr->m_source;
663 	call.sc_req = m_ptr->m_vfs_lsockdriver_sendrecv.req_id;
664 
665 	data._sd_grant = m_ptr->m_vfs_lsockdriver_sendrecv.data_grant;
666 	data._sd_endpt = m_ptr->m_source;
667 	data._sd_len = m_ptr->m_vfs_lsockdriver_sendrecv.data_len;
668 
669 	/* The returned size must fit in an 'int'; truncate accordingly. */
670 	if (data._sd_len > INT_MAX)
671 		data._sd_len = INT_MAX;
672 
673 	ctl_data._sd_endpt = m_ptr->m_source;
674 	ctl_data._sd_grant = m_ptr->m_vfs_lsockdriver_sendrecv.ctl_grant;
675 	ctl_data._sd_len = m_ptr->m_vfs_lsockdriver_sendrecv.ctl_len;
676 
677 	id = m_ptr->m_vfs_lsockdriver_sendrecv.sock_id;
678 	addr_grant = m_ptr->m_vfs_lsockdriver_sendrecv.addr_grant;
679 	addr_len = m_ptr->m_vfs_lsockdriver_sendrecv.addr_len;
680 	user_endpt = m_ptr->m_vfs_lsockdriver_sendrecv.user_endpt;
681 	flags = m_ptr->m_vfs_lsockdriver_sendrecv.flags;
682 
683 	r = OK;
684 	if (GRANT_VALID(addr_grant)) {
685 		if (addr_len == 0 || addr_len > sizeof(buf))
686 			r = EINVAL;
687 		else
688 			r = sys_safecopyfrom(m_ptr->m_source, addr_grant, 0,
689 			    (vir_bytes)buf, addr_len);
690 		addr = (struct sockaddr *)buf;
691 	} else {
692 		addr = NULL;
693 		addr_len = 0;
694 	}
695 
696 	if (r == OK) {
697 		if (sdp->sdr_send != NULL)
698 			r = sdp->sdr_send(id, &data, data._sd_len, &ctl_data,
699 			    ctl_data._sd_len, addr, addr_len, user_endpt,
700 			    flags, (flags & MSG_DONTWAIT) ? NULL : &call);
701 		else
702 			r = EOPNOTSUPP;
703 	}
704 
705 	assert(!(flags & MSG_DONTWAIT) || (r != SUSPEND && r != EDONTREPLY));
706 
707 	if (r != SUSPEND && r != EDONTREPLY)
708 		sockdriver_reply_generic(&call, r);
709 }
710 
711 /*
712  * Receive regular and/or control data.  This call may be suspended by the
713  * socket driver, in which case sockdriver_reply_recv() must be used to reply
714  * later.
715  */
716 static void
717 do_recv(const struct sockdriver * __restrict sdp,
718 	const message * __restrict m_ptr)
719 {
720 	struct sockdriver_call call;
721 	struct sockdriver_data data, ctl_data;
722 	char buf[SOCKADDR_MAX];
723 	struct sockaddr *addr;
724 	sockid_t id;
725 	socklen_t ctl_len, addr_len;
726 	endpoint_t user_endpt;
727 	int r, flags;
728 
729 	call.sc_endpt = m_ptr->m_source;
730 	call.sc_req = m_ptr->m_vfs_lsockdriver_sendrecv.req_id;
731 	call._sc_grant = m_ptr->m_vfs_lsockdriver_sendrecv.addr_grant;
732 	call._sc_len = m_ptr->m_vfs_lsockdriver_sendrecv.addr_len;
733 
734 	data._sd_endpt = m_ptr->m_source;
735 	data._sd_grant = m_ptr->m_vfs_lsockdriver_sendrecv.data_grant;
736 	data._sd_len = m_ptr->m_vfs_lsockdriver_sendrecv.data_len;
737 
738 	/* The returned size must fit in an 'int'; truncate accordingly. */
739 	if (data._sd_len > INT_MAX)
740 		data._sd_len = INT_MAX;
741 
742 	ctl_data._sd_endpt = m_ptr->m_source;
743 	ctl_data._sd_grant = m_ptr->m_vfs_lsockdriver_sendrecv.ctl_grant;
744 	ctl_data._sd_len = m_ptr->m_vfs_lsockdriver_sendrecv.ctl_len;
745 
746 	id = m_ptr->m_vfs_lsockdriver_sendrecv.sock_id;
747 	ctl_len = ctl_data._sd_len;
748 	addr = (struct sockaddr *)buf;
749 	addr_len = 0; /* the default: no source address */
750 	user_endpt = m_ptr->m_vfs_lsockdriver_sendrecv.user_endpt;
751 	flags = m_ptr->m_vfs_lsockdriver_sendrecv.flags;
752 
753 	if (sdp->sdr_recv != NULL)
754 		r = sdp->sdr_recv(id, &data, data._sd_len, &ctl_data, &ctl_len,
755 		    addr, &addr_len, user_endpt, &flags,
756 		    (flags & MSG_DONTWAIT) ? NULL : &call);
757 	else
758 		r = EOPNOTSUPP;
759 
760 	assert(!(flags & MSG_DONTWAIT) || (r != SUSPEND && r != EDONTREPLY));
761 
762 	if (r != SUSPEND && r != EDONTREPLY)
763 		sockdriver_reply_recv(&call, r, ctl_len, addr, addr_len,
764 		    flags);
765 }
766 
767 /*
768  * Process an I/O control call.  This call may be suspended by the socket
769  * driver, in which case sockdriver_reply_generic() must be used to reply
770  * later.
771  */
772 static void
773 do_ioctl(const struct sockdriver * __restrict sdp,
774 	const message * __restrict m_ptr)
775 {
776 	struct sockdriver_call call;
777 	struct sockdriver_data data;
778 	sockid_t id;
779 	unsigned long request;
780 	endpoint_t user_endpt;
781 	int r, sflags;
782 
783 	call.sc_endpt = m_ptr->m_source;
784 	call.sc_req = m_ptr->m_vfs_lsockdriver_ioctl.req_id;
785 
786 	id = m_ptr->m_vfs_lsockdriver_ioctl.sock_id;
787 	request = m_ptr->m_vfs_lsockdriver_ioctl.request;
788 	user_endpt = m_ptr->m_vfs_lsockdriver_ioctl.user_endpt;
789 	sflags = m_ptr->m_vfs_lsockdriver_ioctl.sflags;
790 
791 	data._sd_endpt = m_ptr->m_source;
792 	data._sd_grant = m_ptr->m_vfs_lsockdriver_ioctl.grant;
793 	if (_MINIX_IOCTL_BIG(request))
794 		data._sd_len = _MINIX_IOCTL_SIZE_BIG(request);
795 	else
796 		data._sd_len = _MINIX_IOCTL_SIZE(request);
797 
798 	if (sdp->sdr_ioctl != NULL)
799 		r = sdp->sdr_ioctl(id, request, &data, user_endpt,
800 		    (sflags & SDEV_NONBLOCK) ? NULL : &call);
801 	else
802 		r = EOPNOTSUPP;
803 
804 	assert(!(sflags & SDEV_NONBLOCK) || (r != SUSPEND && r != EDONTREPLY));
805 
806 	if (r != SUSPEND && r != EDONTREPLY)
807 		sockdriver_reply_generic(&call, r);
808 }
809 
810 /*
811  * Set socket options.  This call may not be suspended.
812  */
813 static void
814 do_setsockopt(const struct sockdriver * __restrict sdp,
815 	const message * __restrict m_ptr)
816 {
817 	struct sockdriver_data data;
818 	int r;
819 
820 	data._sd_endpt = m_ptr->m_source;
821 	data._sd_grant = m_ptr->m_vfs_lsockdriver_getset.grant;
822 	data._sd_len = m_ptr->m_vfs_lsockdriver_getset.len;
823 
824 	if (sdp->sdr_setsockopt != NULL)
825 		r = sdp->sdr_setsockopt(
826 		    m_ptr->m_vfs_lsockdriver_getset.sock_id,
827 		    m_ptr->m_vfs_lsockdriver_getset.level,
828 		    m_ptr->m_vfs_lsockdriver_getset.name, &data, data._sd_len);
829 	else
830 		r = EOPNOTSUPP;
831 
832 	send_generic_reply(m_ptr->m_source,
833 	    m_ptr->m_vfs_lsockdriver_getset.req_id, r);
834 }
835 
836 /*
837  * Retrieve socket options.  This call may not be suspended.
838  */
839 static void
840 do_getsockopt(const struct sockdriver * __restrict sdp,
841 	const message * __restrict m_ptr)
842 {
843 	struct sockdriver_data data;
844 	socklen_t len;
845 	int r;
846 
847 	data._sd_endpt = m_ptr->m_source;
848 	data._sd_grant = m_ptr->m_vfs_lsockdriver_getset.grant;
849 	data._sd_len = m_ptr->m_vfs_lsockdriver_getset.len;
850 
851 	len = data._sd_len;
852 
853 	if (sdp->sdr_setsockopt != NULL)
854 		r = sdp->sdr_getsockopt(
855 		    m_ptr->m_vfs_lsockdriver_getset.sock_id,
856 		    m_ptr->m_vfs_lsockdriver_getset.level,
857 		    m_ptr->m_vfs_lsockdriver_getset.name, &data, &len);
858 	else
859 		r = EOPNOTSUPP;
860 
861 	/*
862 	 * For these requests, the main reply code is used to return the
863 	 * resulting data length on success.  The length will never large
864 	 * enough to overflow, and we save on API calls and messages this way.
865 	 */
866 	if (r == OK) {
867 		assert(len <= INT_MAX);
868 
869 		r = (int)len;
870 	} else if (r > 0)
871 		panic("libsockdriver: invalid reply");
872 
873 	send_generic_reply(m_ptr->m_source,
874 	    m_ptr->m_vfs_lsockdriver_getset.req_id, r);
875 }
876 
877 /*
878  * Get local or remote address.  This call may not be suspended.
879  */
880 static void
881 do_getname(const struct sockdriver * __restrict sdp,
882 	const message * __restrict m_ptr)
883 {
884 	int (*proc)(sockid_t, struct sockaddr * __restrict,
885 	    socklen_t * __restrict);
886 	char buf[SOCKADDR_MAX];
887 	socklen_t addr_len, len;
888 	int r;
889 
890 	switch (m_ptr->m_type) {
891 	case SDEV_GETSOCKNAME:	proc = sdp->sdr_getsockname;	break;
892 	case SDEV_GETPEERNAME:	proc = sdp->sdr_getpeername;	break;
893 	default:		panic("expected getsockname or getpeername");
894 	}
895 
896 	/* The 'name' and 'level' message fields are unused for these calls. */
897 
898 	addr_len = m_ptr->m_vfs_lsockdriver_getset.len;
899 	len = 0;
900 
901 	if (proc != NULL)
902 		r = proc(m_ptr->m_vfs_lsockdriver_getset.sock_id,
903 		    (struct sockaddr *)buf, &len);
904 	else
905 		r = EOPNOTSUPP;
906 
907 	if (r == OK) {
908 		if (len == 0)
909 			panic("libsockdriver: success but no address given");
910 
911 		if (addr_len > len)
912 			addr_len = len;
913 
914 		/* As above, use the reply code for the resulting length. */
915 		if (addr_len > 0 && (r = sys_safecopyto(m_ptr->m_source,
916 		    m_ptr->m_vfs_lsockdriver_getset.grant, 0, (vir_bytes)buf,
917 		    addr_len)) == OK) {
918 			assert(addr_len <= INT_MAX);
919 
920 			/*
921 			 * The Open Group wording has changed recently, now
922 			 * suggesting that when truncating the "stored address"
923 			 * the resulting length should be truncated as well.
924 			 */
925 			r = addr_len;
926 		}
927 	} else if (r > 0)
928 		panic("libsockdriver: invalid reply");
929 
930 	send_generic_reply(m_ptr->m_source,
931 	    m_ptr->m_vfs_lsockdriver_getset.req_id, r);
932 }
933 
934 /*
935  * Shut down socket send and receive operations.  This call may not be
936  * suspended.
937  */
938 static void
939 do_shutdown(const struct sockdriver * __restrict sdp,
940 	const message * __restrict m_ptr)
941 {
942 	int r;
943 
944 	if (sdp->sdr_shutdown != NULL)
945 		r = sdp->sdr_shutdown(
946 		    m_ptr->m_vfs_lsockdriver_simple.sock_id,
947 		    m_ptr->m_vfs_lsockdriver_simple.param /*how*/);
948 	else
949 		r = EOPNOTSUPP;
950 
951 	send_generic_reply(m_ptr->m_source,
952 	    m_ptr->m_vfs_lsockdriver_simple.req_id, r);
953 }
954 
955 /*
956  * Close a socket.  This call may be suspended by the socket driver, in which
957  * case sockdriver_reply_generic() must be used to reply later.  Note that VFS
958  * currently does not support blocking close operations, and will mark all
959  * close operations as nonblocking.  This will be changed in the future.
960  */
961 static void
962 do_close(const struct sockdriver * __restrict sdp,
963 	const message * __restrict m_ptr)
964 {
965 	struct sockdriver_call call;
966 	int r, sflags;
967 
968 	call.sc_endpt = m_ptr->m_source;
969 	call.sc_req = m_ptr->m_vfs_lsockdriver_simple.req_id;
970 
971 	sflags = m_ptr->m_vfs_lsockdriver_simple.param;
972 
973 	if (sdp->sdr_close != NULL)
974 		r = sdp->sdr_close(m_ptr->m_vfs_lsockdriver_simple.sock_id,
975 		    (sflags & SDEV_NONBLOCK) ? NULL : &call);
976 	else
977 		r = OK; /* exception: this must never fail */
978 
979 	assert(!(sflags & SDEV_NONBLOCK) || (r != SUSPEND && r != EDONTREPLY));
980 
981 	if (r != SUSPEND && r != EDONTREPLY)
982 		sockdriver_reply_generic(&call, r);
983 }
984 
985 /*
986  * Cancel a previous operation which may currently be suspended.  The cancel
987  * operation itself does not have a reply.  Instead, if the provided operation
988  * was found to be currently suspended, that operation must be aborted and a
989  * reply (typically EINTR) must be sent for it.  If no matching operation was
990  * found, no reply must be sent at all.
991  */
992 static void
993 do_cancel(const struct sockdriver * __restrict sdp,
994 	const message * __restrict m_ptr)
995 {
996 	struct sockdriver_call call;
997 
998 	call.sc_endpt = m_ptr->m_source;
999 	call.sc_req = m_ptr->m_vfs_lsockdriver_simple.req_id;
1000 
1001 	/* The 'param' message field is unused by this request. */
1002 
1003 	if (sdp->sdr_cancel != NULL)
1004 		sdp->sdr_cancel(m_ptr->m_vfs_lsockdriver_simple.sock_id,
1005 		    &call);
1006 }
1007 
1008 /*
1009  * Process a select request.  Select requests have their own rules with respect
1010  * to suspension and later notification.  The basic idea is: an immediate reply
1011  * is always sent with the subset of requested operations that are ready.  If
1012  * SDEV_NOTIFY is given, the remaining operations are to be combined with any
1013  * previous operations requested (with SDEV_NOTIFY) by the calling endpoint.
1014  * If any of the pending previous operations become ready, a late reply is sent
1015  * and only those ready operations are forgotten, leaving any other non-ready
1016  * operations for other late replies.
1017  */
1018 static void
1019 do_select(const struct sockdriver * __restrict sdp,
1020 	const message * __restrict m_ptr)
1021 {
1022 	struct sockdriver_select sel;
1023 	sockid_t id;
1024 	int r, ops;
1025 
1026 	sel.ss_endpt = m_ptr->m_source;
1027 	id = m_ptr->m_vfs_lsockdriver_select.sock_id;
1028 	ops = m_ptr->m_vfs_lsockdriver_select.ops;
1029 
1030 	if (sdp->sdr_select != NULL)
1031 		r = sdp->sdr_select(id, ops,
1032 		    (ops & SDEV_NOTIFY) ? &sel : NULL);
1033 	else
1034 		r = EOPNOTSUPP;
1035 
1036 	send_select_reply(&sel, SDEV_SELECT1_REPLY, id, r);
1037 }
1038 
1039 /*
1040  * Return TRUE if the given endpoint may initiate socket requests.
1041  */
1042 static int
1043 may_request(endpoint_t endpt)
1044 {
1045 
1046 	/*
1047 	 * For now, we allow only VFS to initiate socket calls.  In the future,
1048 	 * we may allow networked file systems to call into the network stack
1049 	 * directly.  The sockdriver API has already been designed to allow for
1050 	 * that, but this check will then need to change.  Ideally it would be
1051 	 * using some sort of ACL system.  For now, this check prevents that
1052 	 * network drivers themselves create and use sockets.
1053 	 */
1054 	return (endpt == VFS_PROC_NR);
1055 }
1056 
1057 /*
1058  * Process an incoming message, and (typically) send a reply.
1059  */
1060 void
1061 sockdriver_process(const struct sockdriver * __restrict sdp,
1062 	const message * __restrict m_ptr, int ipc_status)
1063 {
1064 
1065 	/* Handle notifications separately. */
1066 	if (is_ipc_notify(ipc_status)) {
1067 		switch (m_ptr->m_source) {
1068 		case CLOCK:
1069 			if (sdp->sdr_alarm != NULL)
1070 				sdp->sdr_alarm(m_ptr->m_notify.timestamp);
1071 			break;
1072 		default:
1073 			if (sdp->sdr_other != NULL)
1074 				sdp->sdr_other(m_ptr, ipc_status);
1075 		}
1076 
1077 		return; /* do not send a reply */
1078 	}
1079 
1080 	/* Is this a socket request from an acceptable party? */
1081 	if (!IS_SDEV_RQ(m_ptr->m_type) || !may_request(m_ptr->m_source)) {
1082 		if (sdp->sdr_other != NULL)
1083 			sdp->sdr_other(m_ptr, ipc_status);
1084 
1085 		return;	/* do not send a reply */
1086 	}
1087 
1088 	/*
1089 	 * Process the request.  If the request is not recognized, we cannot
1090 	 * send a reply either, because we do not know the reply message
1091 	 * format.  Passing the request message to the sdr_other hook serves no
1092 	 * practical purpose either: if the request is legitimate, this library
1093 	 * should know about it.
1094 	 */
1095 	switch (m_ptr->m_type) {
1096 	case SDEV_SOCKET:	do_socket(sdp, m_ptr);		break;
1097 	case SDEV_SOCKETPAIR:	do_socketpair(sdp, m_ptr);	break;
1098 	case SDEV_BIND:		do_bind_connect(sdp, m_ptr);	break;
1099 	case SDEV_CONNECT:	do_bind_connect(sdp, m_ptr);	break;
1100 	case SDEV_LISTEN:	do_listen(sdp, m_ptr);		break;
1101 	case SDEV_ACCEPT:	do_accept(sdp, m_ptr);		break;
1102 	case SDEV_SEND:		do_send(sdp, m_ptr);		break;
1103 	case SDEV_RECV:		do_recv(sdp, m_ptr);		break;
1104 	case SDEV_IOCTL:	do_ioctl(sdp, m_ptr);		break;
1105 	case SDEV_SETSOCKOPT:	do_setsockopt(sdp, m_ptr);	break;
1106 	case SDEV_GETSOCKOPT:	do_getsockopt(sdp, m_ptr);	break;
1107 	case SDEV_GETSOCKNAME:	do_getname(sdp, m_ptr);		break;
1108 	case SDEV_GETPEERNAME:	do_getname(sdp, m_ptr);		break;
1109 	case SDEV_SHUTDOWN:	do_shutdown(sdp, m_ptr);	break;
1110 	case SDEV_CLOSE:	do_close(sdp, m_ptr);		break;
1111 	case SDEV_CANCEL:	do_cancel(sdp, m_ptr);		break;
1112 	case SDEV_SELECT:	do_select(sdp, m_ptr);		break;
1113 	}
1114 }
1115 
1116 /*
1117  * Break out of the main loop after finishing the current request.
1118  */
1119 void
1120 sockdriver_terminate(void)
1121 {
1122 
1123 	running = FALSE;
1124 
1125 	sef_cancel();
1126 }
1127 
1128 /*
1129  * Main program of any socket driver.
1130  */
1131 void
1132 sockdriver_task(const struct sockdriver * sdp)
1133 {
1134 	message m;
1135 	int r, ipc_status;
1136 
1137 	/* The main message loop. */
1138 	running = TRUE;
1139 
1140 	while (running) {
1141 		if ((r = sef_receive_status(ANY, &m, &ipc_status)) != OK) {
1142 			if (r == EINTR)
1143 				continue;	/* sef_cancel() was called */
1144 
1145 			panic("sockdriver: sef_receive_status failed: %d", r);
1146 		}
1147 
1148 		sockdriver_process(sdp, &m, ipc_status);
1149 	}
1150 }
1151