xref: /illumos-gate/usr/src/cmd/bhyve/mevent.c (revision 3382f241)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 /*
32  * Copyright 2018 Joyent, Inc.
33  */
34 
35 /*
36  * Micro event library for FreeBSD, designed for a single i/o thread
37  * using kqueue, and having events be persistent by default.
38  */
39 
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42 
#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
54 
55 #include <sys/types.h>
56 #ifndef WITHOUT_CAPSICUM
57 #include <sys/capsicum.h>
58 #endif
59 #ifdef __FreeBSD__
60 #include <sys/event.h>
61 #else
62 #include <port.h>
63 #include <sys/poll.h>
64 #include <sys/siginfo.h>
65 #include <sys/queue.h>
66 #endif
67 #include <sys/time.h>
68 
69 #include <pthread.h>
70 #include <pthread_np.h>
71 
72 #include "mevent.h"
73 
/*
 * Size of the kevent change/event arrays used by the dispatch loop and of
 * the scratch buffer used to drain the wakeup pipe.
 */
#define	MEVENT_MAX	64

/*
 * Values stored in mevent.me_state: the change that is applied to the
 * event the next time the change list is processed.
 */
#define	MEV_ADD		1
#define	MEV_ENABLE	2
#define	MEV_DISABLE	3
#define	MEV_DEL_PENDING	4

/* Defined elsewhere. */
extern char *vmname;

/* Thread running mevent_dispatch(); recorded when the dispatcher starts. */
static pthread_t mevent_tid;
/* Next identifier given to a timer event by mevent_add(). */
static int mevent_timid = 43;
/*
 * Wakeup pipe: [0] is watched by the dispatch loop, [1] is written by
 * mevent_notify().
 */
static int mevent_pipefd[2];
/* Taken by mevent_qlock() around all manipulation of the event lists. */
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
87 
/*
 * A registered event.  An entry lives on exactly one of the two lists
 * declared below, linked through me_list.
 */
struct mevent {
	/* Callback, invoked as me_func(me_fd, me_type, me_param). */
	void	(*me_func)(int, enum ev_type, void *);
	/* For timer events me_fd holds the interval in milliseconds. */
#define me_msecs me_fd
	int	me_fd;
#ifdef __FreeBSD__
	/* kevent ident used for timer events. */
	int	me_timid;
#else
	/* Timer handle filled in by timer_create() for timer events. */
	timer_t me_timid;
#endif
	enum ev_type me_type;
	/* Opaque argument handed back to me_func. */
	void    *me_param;
	/* Non-zero while the entry sits on the change list. */
	int	me_cq;
	/* One of the MEV_* values: the change still to be applied. */
	int	me_state;
	/* Non-zero: close me_fd when the pending change is processed. */
	int	me_closefd;
#ifndef __FreeBSD__
	/* Port and user pointer delivered with timer expirations. */
	port_notify_t	me_notify;
	/* Notification description passed to timer_create(). */
	struct sigevent	me_sigev;
	/*
	 * B_TRUE for timers, B_FALSE for descriptor events; when B_FALSE the
	 * event is updated again after each callback.
	 */
	boolean_t	me_auto_requeue;
#endif
	LIST_ENTRY(mevent) me_list;
};

/*
 * global_head holds events whose last change has been processed;
 * change_head holds events with a change still waiting for the i/o thread.
 */
static LIST_HEAD(listhead, mevent) global_head, change_head;
111 
112 static void
113 mevent_qlock(void)
114 {
115 	pthread_mutex_lock(&mevent_lmutex);
116 }
117 
118 static void
119 mevent_qunlock(void)
120 {
121 	pthread_mutex_unlock(&mevent_lmutex);
122 }
123 
124 static void
125 mevent_pipe_read(int fd, enum ev_type type, void *param)
126 {
127 	char buf[MEVENT_MAX];
128 	int status;
129 
130 	/*
131 	 * Drain the pipe read side. The fd is non-blocking so this is
132 	 * safe to do.
133 	 */
134 	do {
135 		status = read(fd, buf, sizeof(buf));
136 	} while (status == MEVENT_MAX);
137 }
138 
139 static void
140 mevent_notify(void)
141 {
142 	char c;
143 
144 	/*
145 	 * If calling from outside the i/o thread, write a byte on the
146 	 * pipe to force the i/o thread to exit the blocking kevent call.
147 	 */
148 	if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
149 		write(mevent_pipefd[1], &c, 1);
150 	}
151 }
152 #ifdef __FreeBSD__
153 static int
154 mevent_kq_filter(struct mevent *mevp)
155 {
156 	int retval;
157 
158 	retval = 0;
159 
160 	if (mevp->me_type == EVF_READ)
161 		retval = EVFILT_READ;
162 
163 	if (mevp->me_type == EVF_WRITE)
164 		retval = EVFILT_WRITE;
165 
166 	if (mevp->me_type == EVF_TIMER)
167 		retval = EVFILT_TIMER;
168 
169 	if (mevp->me_type == EVF_SIGNAL)
170 		retval = EVFILT_SIGNAL;
171 
172 	return (retval);
173 }
174 
175 static int
176 mevent_kq_flags(struct mevent *mevp)
177 {
178 	int ret;
179 
180 	switch (mevp->me_state) {
181 	case MEV_ADD:
182 		ret = EV_ADD;		/* implicitly enabled */
183 		break;
184 	case MEV_ENABLE:
185 		ret = EV_ENABLE;
186 		break;
187 	case MEV_DISABLE:
188 		ret = EV_DISABLE;
189 		break;
190 	case MEV_DEL_PENDING:
191 		ret = EV_DELETE;
192 		break;
193 	default:
194 		assert(0);
195 		break;
196 	}
197 
198 	return (ret);
199 }
200 
/*
 * Filter-specific flags for 'mevp'.  None are used at present, so the
 * result is always 0.
 */
static int
mevent_kq_fflags(struct mevent *mevp)
{
	/* XXX nothing yet, perhaps EV_EOF for reads ? */
	int fflags = 0;

	return (fflags);
}
207 
208 static int
209 mevent_build(int mfd, struct kevent *kev)
210 {
211 	struct mevent *mevp, *tmpp;
212 	int i;
213 
214 	i = 0;
215 
216 	mevent_qlock();
217 
218 	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
219 		if (mevp->me_closefd) {
220 			/*
221 			 * A close of the file descriptor will remove the
222 			 * event
223 			 */
224 			close(mevp->me_fd);
225 		} else {
226 			if (mevp->me_type == EVF_TIMER) {
227 				kev[i].ident = mevp->me_timid;
228 				kev[i].data = mevp->me_msecs;
229 			} else {
230 				kev[i].ident = mevp->me_fd;
231 				kev[i].data = 0;
232 			}
233 			kev[i].filter = mevent_kq_filter(mevp);
234 			kev[i].flags = mevent_kq_flags(mevp);
235 			kev[i].fflags = mevent_kq_fflags(mevp);
236 			kev[i].udata = mevp;
237 			i++;
238 		}
239 
240 		mevp->me_cq = 0;
241 		LIST_REMOVE(mevp, me_list);
242 
243 		if (mevp->me_state == MEV_DEL_PENDING) {
244 			free(mevp);
245 		} else {
246 			LIST_INSERT_HEAD(&global_head, mevp, me_list);
247 		}
248 
249 		assert(i < MEVENT_MAX);
250 	}
251 
252 	mevent_qunlock();
253 
254 	return (i);
255 }
256 
257 static void
258 mevent_handle(struct kevent *kev, int numev)
259 {
260 	struct mevent *mevp;
261 	int i;
262 
263 	for (i = 0; i < numev; i++) {
264 		mevp = kev[i].udata;
265 
266 		/* XXX check for EV_ERROR ? */
267 
268 		(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
269 	}
270 }
271 
272 #else /* __FreeBSD__ */
273 
274 static void
275 mevent_update_one(struct mevent *mevp)
276 {
277 	int portfd = mevp->me_notify.portnfy_port;
278 
279 	switch (mevp->me_type) {
280 	case EVF_READ:
281 	case EVF_WRITE:
282 		mevp->me_auto_requeue = B_FALSE;
283 
284 		switch (mevp->me_state) {
285 		case MEV_ADD:
286 		case MEV_ENABLE:
287 		{
288 			int events;
289 
290 			events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT;
291 
292 			if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd,
293 			    events, mevp) != 0) {
294 				(void) fprintf(stderr,
295 				    "port_associate fd %d %p failed: %s\n",
296 				    mevp->me_fd, mevp, strerror(errno));
297 			}
298 			return;
299 		}
300 		case MEV_DISABLE:
301 		case MEV_DEL_PENDING:
302 			/*
303 			 * A disable that comes in while an event is being
304 			 * handled will result in an ENOENT.
305 			 */
306 			if (port_dissociate(portfd, PORT_SOURCE_FD,
307 			    mevp->me_fd) != 0 && errno != ENOENT) {
308 				(void) fprintf(stderr, "port_dissociate "
309 				    "portfd %d fd %d mevp %p failed: %s\n",
310 				    portfd, mevp->me_fd, mevp, strerror(errno));
311 			}
312 			return;
313 		default:
314 			goto abort;
315 		}
316 
317 	case EVF_TIMER:
318 		mevp->me_auto_requeue = B_TRUE;
319 
320 		switch (mevp->me_state) {
321 		case MEV_ADD:
322 		case MEV_ENABLE:
323 		{
324 			struct itimerspec it = { 0 };
325 
326 			mevp->me_sigev.sigev_notify = SIGEV_PORT;
327 			mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify;
328 
329 			if (timer_create(CLOCK_REALTIME, &mevp->me_sigev,
330 			    &mevp->me_timid) != 0) {
331 				(void) fprintf(stderr,
332 				    "timer_create failed: %s", strerror(errno));
333 				return;
334 			}
335 
336 			/* The first timeout */
337 			it.it_value.tv_sec = mevp->me_msecs / MILLISEC;
338 			it.it_value.tv_nsec =
339 				MSEC2NSEC(mevp->me_msecs % MILLISEC);
340 			/* Repeat at the same interval */
341 			it.it_interval = it.it_value;
342 
343 			if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) {
344 				(void) fprintf(stderr, "timer_settime failed: "
345 				    "%s", strerror(errno));
346 			}
347 			return;
348 		}
349 		case MEV_DISABLE:
350 		case MEV_DEL_PENDING:
351 			if (timer_delete(mevp->me_timid) != 0) {
352 				(void) fprintf(stderr, "timer_delete failed: "
353 				    "%s", strerror(errno));
354 			}
355 			return;
356 		default:
357 			goto abort;
358 		}
359 	default:
360 		/* EVF_SIGNAL not yet implemented. */
361 		goto abort;
362 	}
363 
364 abort:
365 	(void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__,
366 	    mevp->me_type, mevp->me_state);
367 	abort();
368 }
369 
370 static void
371 mevent_update_pending(int portfd)
372 {
373 	struct mevent *mevp, *tmpp;
374 
375 	mevent_qlock();
376 
377 	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
378 		mevp->me_notify.portnfy_port = portfd;
379 		mevp->me_notify.portnfy_user = mevp;
380 		if (mevp->me_closefd) {
381 			/*
382 			 * A close of the file descriptor will remove the
383 			 * event
384 			 */
385 			(void) close(mevp->me_fd);
386 			mevp->me_fd = -1;
387 		} else {
388 			mevent_update_one(mevp);
389 		}
390 
391 		mevp->me_cq = 0;
392 		LIST_REMOVE(mevp, me_list);
393 
394 		if (mevp->me_state == MEV_DEL_PENDING) {
395 			free(mevp);
396 		} else {
397 			LIST_INSERT_HEAD(&global_head, mevp, me_list);
398 		}
399 	}
400 
401 	mevent_qunlock();
402 }
403 
404 static void
405 mevent_handle_pe(port_event_t *pe)
406 {
407 	struct mevent *mevp = pe->portev_user;
408 
409 	mevent_qunlock();
410 
411 	(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
412 
413 	mevent_qlock();
414 	if (!mevp->me_cq && !mevp->me_auto_requeue) {
415 		mevent_update_one(mevp);
416 	}
417 	mevent_qunlock();
418 }
419 #endif
420 
421 struct mevent *
422 mevent_add(int tfd, enum ev_type type,
423 	   void (*func)(int, enum ev_type, void *), void *param)
424 {
425 	struct mevent *lp, *mevp;
426 
427 	if (tfd < 0 || func == NULL) {
428 		return (NULL);
429 	}
430 
431 	mevp = NULL;
432 
433 	mevent_qlock();
434 
435 	/*
436 	 * Verify that the fd/type tuple is not present in any list
437 	 */
438 	LIST_FOREACH(lp, &global_head, me_list) {
439 		if (type != EVF_TIMER && lp->me_fd == tfd &&
440 		    lp->me_type == type) {
441 			goto exit;
442 		}
443 	}
444 
445 	LIST_FOREACH(lp, &change_head, me_list) {
446 		if (type != EVF_TIMER && lp->me_fd == tfd &&
447 		    lp->me_type == type) {
448 			goto exit;
449 		}
450 	}
451 
452 	/*
453 	 * Allocate an entry, populate it, and add it to the change list.
454 	 */
455 	mevp = calloc(1, sizeof(struct mevent));
456 	if (mevp == NULL) {
457 		goto exit;
458 	}
459 
460 	if (type == EVF_TIMER) {
461 		mevp->me_msecs = tfd;
462 		mevp->me_timid = mevent_timid++;
463 	} else
464 		mevp->me_fd = tfd;
465 	mevp->me_type = type;
466 	mevp->me_func = func;
467 	mevp->me_param = param;
468 
469 	LIST_INSERT_HEAD(&change_head, mevp, me_list);
470 	mevp->me_cq = 1;
471 	mevp->me_state = MEV_ADD;
472 	mevent_notify();
473 
474 exit:
475 	mevent_qunlock();
476 
477 	return (mevp);
478 }
479 
480 static int
481 mevent_update(struct mevent *evp, int newstate)
482 {
483 	/*
484 	 * It's not possible to enable/disable a deleted event
485 	 */
486 	if (evp->me_state == MEV_DEL_PENDING)
487 		return (EINVAL);
488 
489 	/*
490 	 * No update needed if state isn't changing
491 	 */
492 	if (evp->me_state == newstate)
493 		return (0);
494 
495 	mevent_qlock();
496 
497 	evp->me_state = newstate;
498 
499 	/*
500 	 * Place the entry onto the changed list if not already there.
501 	 */
502 	if (evp->me_cq == 0) {
503 		evp->me_cq = 1;
504 		LIST_REMOVE(evp, me_list);
505 		LIST_INSERT_HEAD(&change_head, evp, me_list);
506 		mevent_notify();
507 	}
508 
509 	mevent_qunlock();
510 
511 	return (0);
512 }
513 
514 int
515 mevent_enable(struct mevent *evp)
516 {
517 
518 	return (mevent_update(evp, MEV_ENABLE));
519 }
520 
521 int
522 mevent_disable(struct mevent *evp)
523 {
524 
525 	return (mevent_update(evp, MEV_DISABLE));
526 }
527 
528 static int
529 mevent_delete_event(struct mevent *evp, int closefd)
530 {
531 	mevent_qlock();
532 
533 	/*
534          * Place the entry onto the changed list if not already there, and
535 	 * mark as to be deleted.
536          */
537         if (evp->me_cq == 0) {
538 		evp->me_cq = 1;
539 		LIST_REMOVE(evp, me_list);
540 		LIST_INSERT_HEAD(&change_head, evp, me_list);
541 		mevent_notify();
542         }
543 	evp->me_state = MEV_DEL_PENDING;
544 
545 	if (closefd)
546 		evp->me_closefd = 1;
547 
548 	mevent_qunlock();
549 
550 	return (0);
551 }
552 
/*
 * Delete 'evp' without requesting that its descriptor be closed.
 */
int
mevent_delete(struct mevent *evp)
{
	int rc;

	rc = mevent_delete_event(evp, 0);
	return (rc);
}
559 
/*
 * Delete 'evp' and request that its descriptor be closed as well.
 */
int
mevent_delete_close(struct mevent *evp)
{
	int rc;

	rc = mevent_delete_event(evp, 1);
	return (rc);
}
566 
567 static void
568 mevent_set_name(void)
569 {
570 
571 	pthread_set_name_np(mevent_tid, "mevent");
572 }
573 
574 void
575 mevent_dispatch(void)
576 {
577 #ifdef __FreeBSD__
578 	struct kevent changelist[MEVENT_MAX];
579 	struct kevent eventlist[MEVENT_MAX];
580 	struct mevent *pipev;
581 	int mfd;
582 	int numev;
583 #else
584 	struct mevent *pipev;
585 	int portfd;
586 #endif
587 	int ret;
588 #ifndef WITHOUT_CAPSICUM
589 	cap_rights_t rights;
590 #endif
591 
592 	mevent_tid = pthread_self();
593 	mevent_set_name();
594 
595 #ifdef __FreeBSD__
596 	mfd = kqueue();
597 	assert(mfd > 0);
598 #else
599 	portfd = port_create();
600 	assert(portfd >= 0);
601 #endif
602 
603 #ifndef WITHOUT_CAPSICUM
604 	cap_rights_init(&rights, CAP_KQUEUE);
605 	if (caph_rights_limit(mfd, &rights) == -1)
606 		errx(EX_OSERR, "Unable to apply rights for sandbox");
607 #endif
608 
609 	/*
610 	 * Open the pipe that will be used for other threads to force
611 	 * the blocking kqueue call to exit by writing to it. Set the
612 	 * descriptor to non-blocking.
613 	 */
614 	ret = pipe(mevent_pipefd);
615 	if (ret < 0) {
616 		perror("pipe");
617 		exit(0);
618 	}
619 
620 #ifndef WITHOUT_CAPSICUM
621 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
622 	if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
623 		errx(EX_OSERR, "Unable to apply rights for sandbox");
624 	if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
625 		errx(EX_OSERR, "Unable to apply rights for sandbox");
626 #endif
627 
628 	/*
629 	 * Add internal event handler for the pipe write fd
630 	 */
631 	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
632 	assert(pipev != NULL);
633 
634 	for (;;) {
635 #ifdef __FreeBSD__
636 		/*
637 		 * Build changelist if required.
638 		 * XXX the changelist can be put into the blocking call
639 		 * to eliminate the extra syscall. Currently better for
640 		 * debug.
641 		 */
642 		numev = mevent_build(mfd, changelist);
643 		if (numev) {
644 			ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
645 			if (ret == -1) {
646 				perror("Error return from kevent change");
647 			}
648 		}
649 
650 		/*
651 		 * Block awaiting events
652 		 */
653 		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
654 		if (ret == -1 && errno != EINTR) {
655 			perror("Error return from kevent monitor");
656 		}
657 
658 		/*
659 		 * Handle reported events
660 		 */
661 		mevent_handle(eventlist, ret);
662 
663 #else /* __FreeBSD__ */
664 		port_event_t pev;
665 
666 		/* Handle any pending updates */
667 		mevent_update_pending(portfd);
668 
669 		/* Block awaiting events */
670 		ret = port_get(portfd, &pev, NULL);
671 		if (ret != 0 && errno != EINTR) {
672 			perror("Error return from port_get");
673 			continue;
674 		}
675 
676 		/* Handle reported event */
677 		mevent_handle_pe(&pev);
678 #endif /* __FreeBSD__ */
679 	}
680 }
681