xref: /illumos-gate/usr/src/cmd/bhyve/mevent.c (revision 3a183383)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 /*
32  * Copyright 2018 Joyent, Inc.
33  */
34 
35 /*
36  * Micro event library for FreeBSD, designed for a single i/o thread
37  * using kqueue, and having events be persistent by default.
38  */
39 
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42 
#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
55 
56 #include <sys/types.h>
57 #ifndef WITHOUT_CAPSICUM
58 #include <sys/capsicum.h>
59 #endif
60 #ifdef __FreeBSD__
61 #include <sys/event.h>
62 #else
63 #include <port.h>
64 #include <sys/poll.h>
65 #include <sys/siginfo.h>
66 #include <sys/queue.h>
67 #include <sys/debug.h>
68 #endif
69 #include <sys/time.h>
70 
71 #include <pthread.h>
72 #include <pthread_np.h>
73 
74 #include "mevent.h"
75 
/*
 * Size of the kevent change/event arrays used by the dispatch loop and of
 * the buffer used to drain the wakeup pipe.
 */
#define	MEVENT_MAX	64

#ifndef __FreeBSD__
/*
 * Local stand-ins for the kevent flag bits used to record an event's
 * desired state when <sys/event.h> is not available.  EV_ADD intentionally
 * shares its value with EV_ENABLE.
 */
#define	EV_ENABLE	0x01
#define	EV_ADD		EV_ENABLE
#define	EV_DISABLE	0x02
#define	EV_DELETE	0x04
#endif

/* Protects the event lists and the per-event state fields. */
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
/* Thread running mevent_dispatch(); set when the dispatch loop starts. */
static pthread_t mevent_tid;
/* Wakeup pipe: [0] is read by the i/o thread, [1] is written by notifiers. */
static int mevent_pipefd[2];
/* Next identifier assigned to a newly added timer event. */
static int mevent_timid = 43;
89 
90 struct mevent {
91 	void	(*me_func)(int, enum ev_type, void *);
92 #define me_msecs me_fd
93 	int	me_fd;
94 #ifdef __FreeBSD__
95 	int	me_timid;
96 #else
97 	timer_t me_timid;
98 #endif
99 	enum ev_type me_type;
100 	void    *me_param;
101 	int	me_cq;
102 	int	me_state; /* Desired kevent flags. */
103 	int	me_closefd;
104 #ifndef __FreeBSD__
105 	port_notify_t	me_notify;
106 	struct sigevent	me_sigev;
107 	boolean_t	me_auto_requeue;
108 #endif
109 	LIST_ENTRY(mevent) me_list;
110 };
111 
112 static LIST_HEAD(listhead, mevent) global_head, change_head;
113 
114 static void
115 mevent_qlock(void)
116 {
117 	pthread_mutex_lock(&mevent_lmutex);
118 }
119 
120 static void
121 mevent_qunlock(void)
122 {
123 	pthread_mutex_unlock(&mevent_lmutex);
124 }
125 
126 static void
127 mevent_pipe_read(int fd, enum ev_type type, void *param)
128 {
129 	char buf[MEVENT_MAX];
130 	int status;
131 
132 	/*
133 	 * Drain the pipe read side. The fd is non-blocking so this is
134 	 * safe to do.
135 	 */
136 	do {
137 		status = read(fd, buf, sizeof(buf));
138 	} while (status == MEVENT_MAX);
139 }
140 
141 static void
142 mevent_notify(void)
143 {
144 	char c = '\0';
145 
146 	/*
147 	 * If calling from outside the i/o thread, write a byte on the
148 	 * pipe to force the i/o thread to exit the blocking kevent call.
149 	 */
150 	if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
151 		write(mevent_pipefd[1], &c, 1);
152 	}
153 }
154 #ifdef __FreeBSD__
155 static int
156 mevent_kq_filter(struct mevent *mevp)
157 {
158 	int retval;
159 
160 	retval = 0;
161 
162 	if (mevp->me_type == EVF_READ)
163 		retval = EVFILT_READ;
164 
165 	if (mevp->me_type == EVF_WRITE)
166 		retval = EVFILT_WRITE;
167 
168 	if (mevp->me_type == EVF_TIMER)
169 		retval = EVFILT_TIMER;
170 
171 	if (mevp->me_type == EVF_SIGNAL)
172 		retval = EVFILT_SIGNAL;
173 
174 	return (retval);
175 }
176 
177 static int
178 mevent_kq_flags(struct mevent *mevp)
179 {
180 	return (mevp->me_state);
181 }
182 
/*
 * Filter-specific flags for an event.  None are used at present, so this
 * always yields 0 regardless of the event passed in.
 */
static int
mevent_kq_fflags(struct mevent *mevp)
{
	/* XXX nothing yet, perhaps EV_EOF for reads ? */
	const int fflags = 0;

	return (fflags);
}
189 
190 static int
191 mevent_build(int mfd, struct kevent *kev)
192 {
193 	struct mevent *mevp, *tmpp;
194 	int i;
195 
196 	i = 0;
197 
198 	mevent_qlock();
199 
200 	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
201 		if (mevp->me_closefd) {
202 			/*
203 			 * A close of the file descriptor will remove the
204 			 * event
205 			 */
206 			close(mevp->me_fd);
207 		} else {
208 			if (mevp->me_type == EVF_TIMER) {
209 				kev[i].ident = mevp->me_timid;
210 				kev[i].data = mevp->me_msecs;
211 			} else {
212 				kev[i].ident = mevp->me_fd;
213 				kev[i].data = 0;
214 			}
215 			kev[i].filter = mevent_kq_filter(mevp);
216 			kev[i].flags = mevent_kq_flags(mevp);
217 			kev[i].fflags = mevent_kq_fflags(mevp);
218 			kev[i].udata = mevp;
219 			i++;
220 		}
221 
222 		mevp->me_cq = 0;
223 		LIST_REMOVE(mevp, me_list);
224 
225 		if (mevp->me_state & EV_DELETE) {
226 			free(mevp);
227 		} else {
228 			/*
229 			 * We need to add the event only once, so we can
230 			 * reset the EV_ADD bit after it has been propagated
231 			 * to the kevent() arguments the first time.
232 			 */
233 			mevp->me_state &= ~EV_ADD;
234 			LIST_INSERT_HEAD(&global_head, mevp, me_list);
235 		}
236 
237 		assert(i < MEVENT_MAX);
238 	}
239 
240 	mevent_qunlock();
241 
242 	return (i);
243 }
244 
245 static void
246 mevent_handle(struct kevent *kev, int numev)
247 {
248 	struct mevent *mevp;
249 	int i;
250 
251 	for (i = 0; i < numev; i++) {
252 		mevp = kev[i].udata;
253 
254 		/* XXX check for EV_ERROR ? */
255 
256 		(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
257 	}
258 }
259 
260 #else /* __FreeBSD__ */
261 
262 static boolean_t
263 mevent_clarify_state(struct mevent *mevp)
264 {
265 	const int state = mevp->me_state;
266 
267 	if ((state & EV_DELETE) != 0) {
268 		/* All other intents are overriden by delete. */
269 		mevp->me_state = EV_DELETE;
270 		return (B_TRUE);
271 	}
272 
273 	/*
274 	 * Without a distinction between EV_ADD and EV_ENABLE in our emulation,
275 	 * handling the add-disabled case means eliding the portfs operation
276 	 * when both flags are present.
277 	 *
278 	 * This is not a concern for subsequent enable/disable operations, as
279 	 * mevent_update() toggles the flags properly so they are not left in
280 	 * conflict.
281 	 */
282 	if (state == (EV_ENABLE|EV_DISABLE)) {
283 		mevp->me_state = EV_DISABLE;
284 		return (B_FALSE);
285 	}
286 
287 	return (B_TRUE);
288 }
289 
290 static void
291 mevent_update_one(struct mevent *mevp)
292 {
293 	int portfd = mevp->me_notify.portnfy_port;
294 
295 	switch (mevp->me_type) {
296 	case EVF_READ:
297 	case EVF_WRITE:
298 		mevp->me_auto_requeue = B_FALSE;
299 
300 		switch (mevp->me_state) {
301 		case EV_ENABLE:
302 		{
303 			int events;
304 
305 			events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT;
306 
307 			if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd,
308 			    events, mevp) != 0) {
309 				(void) fprintf(stderr,
310 				    "port_associate fd %d %p failed: %s\n",
311 				    mevp->me_fd, mevp, strerror(errno));
312 			}
313 			return;
314 		}
315 		case EV_DISABLE:
316 		case EV_DELETE:
317 			/*
318 			 * A disable that comes in while an event is being
319 			 * handled will result in an ENOENT.
320 			 */
321 			if (port_dissociate(portfd, PORT_SOURCE_FD,
322 			    mevp->me_fd) != 0 && errno != ENOENT) {
323 				(void) fprintf(stderr, "port_dissociate "
324 				    "portfd %d fd %d mevp %p failed: %s\n",
325 				    portfd, mevp->me_fd, mevp, strerror(errno));
326 			}
327 			return;
328 		default:
329 			goto abort;
330 		}
331 
332 	case EVF_TIMER:
333 		mevp->me_auto_requeue = B_TRUE;
334 
335 		switch (mevp->me_state) {
336 		case EV_ENABLE:
337 		{
338 			struct itimerspec it = { 0 };
339 
340 			mevp->me_sigev.sigev_notify = SIGEV_PORT;
341 			mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify;
342 
343 			if (timer_create(CLOCK_REALTIME, &mevp->me_sigev,
344 			    &mevp->me_timid) != 0) {
345 				(void) fprintf(stderr,
346 				    "timer_create failed: %s", strerror(errno));
347 				return;
348 			}
349 
350 			/* The first timeout */
351 			it.it_value.tv_sec = mevp->me_msecs / MILLISEC;
352 			it.it_value.tv_nsec =
353 				MSEC2NSEC(mevp->me_msecs % MILLISEC);
354 			/* Repeat at the same interval */
355 			it.it_interval = it.it_value;
356 
357 			if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) {
358 				(void) fprintf(stderr, "timer_settime failed: "
359 				    "%s", strerror(errno));
360 			}
361 			return;
362 		}
363 		case EV_DISABLE:
364 		case EV_DELETE:
365 			if (timer_delete(mevp->me_timid) != 0) {
366 				(void) fprintf(stderr, "timer_delete failed: "
367 				    "%s", strerror(errno));
368 			}
369 			return;
370 		default:
371 			goto abort;
372 		}
373 	default:
374 		/* EVF_SIGNAL not yet implemented. */
375 		goto abort;
376 	}
377 
378 abort:
379 	(void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__,
380 	    mevp->me_type, mevp->me_state);
381 	abort();
382 }
383 
384 static void
385 mevent_update_pending(int portfd)
386 {
387 	struct mevent *mevp, *tmpp;
388 
389 	mevent_qlock();
390 
391 	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
392 		mevp->me_notify.portnfy_port = portfd;
393 		mevp->me_notify.portnfy_user = mevp;
394 		if (mevp->me_closefd) {
395 			/*
396 			 * A close of the file descriptor will remove the
397 			 * event
398 			 */
399 			(void) close(mevp->me_fd);
400 			mevp->me_fd = -1;
401 		} else {
402 			if (mevent_clarify_state(mevp)) {
403 				mevent_update_one(mevp);
404 			}
405 		}
406 
407 		mevp->me_cq = 0;
408 		LIST_REMOVE(mevp, me_list);
409 
410 		if (mevp->me_state & EV_DELETE) {
411 			free(mevp);
412 		} else {
413 			LIST_INSERT_HEAD(&global_head, mevp, me_list);
414 		}
415 	}
416 
417 	mevent_qunlock();
418 }
419 
420 static void
421 mevent_handle_pe(port_event_t *pe)
422 {
423 	struct mevent *mevp = pe->portev_user;
424 
425 	mevent_qunlock();
426 
427 	(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
428 
429 	mevent_qlock();
430 	if (!mevp->me_cq && !mevp->me_auto_requeue) {
431 		mevent_update_one(mevp);
432 	}
433 	mevent_qunlock();
434 }
435 #endif
436 
437 static struct mevent *
438 mevent_add_state(int tfd, enum ev_type type,
439 	   void (*func)(int, enum ev_type, void *), void *param,
440 	   int state)
441 {
442 	struct mevent *lp, *mevp;
443 
444 	if (tfd < 0 || func == NULL) {
445 		return (NULL);
446 	}
447 
448 	mevp = NULL;
449 
450 	mevent_qlock();
451 
452 	/*
453 	 * Verify that the fd/type tuple is not present in any list
454 	 */
455 	LIST_FOREACH(lp, &global_head, me_list) {
456 		if (type != EVF_TIMER && lp->me_fd == tfd &&
457 		    lp->me_type == type) {
458 			goto exit;
459 		}
460 	}
461 
462 	LIST_FOREACH(lp, &change_head, me_list) {
463 		if (type != EVF_TIMER && lp->me_fd == tfd &&
464 		    lp->me_type == type) {
465 			goto exit;
466 		}
467 	}
468 
469 	/*
470 	 * Allocate an entry, populate it, and add it to the change list.
471 	 */
472 	mevp = calloc(1, sizeof(struct mevent));
473 	if (mevp == NULL) {
474 		goto exit;
475 	}
476 
477 	if (type == EVF_TIMER) {
478 		mevp->me_msecs = tfd;
479 		mevp->me_timid = mevent_timid++;
480 	} else
481 		mevp->me_fd = tfd;
482 	mevp->me_type = type;
483 	mevp->me_func = func;
484 	mevp->me_param = param;
485 
486 	LIST_INSERT_HEAD(&change_head, mevp, me_list);
487 	mevp->me_cq = 1;
488 	mevp->me_state = state;
489 	mevent_notify();
490 
491 exit:
492 	mevent_qunlock();
493 
494 	return (mevp);
495 }
496 
497 struct mevent *
498 mevent_add(int tfd, enum ev_type type,
499 	   void (*func)(int, enum ev_type, void *), void *param)
500 {
501 
502 	return (mevent_add_state(tfd, type, func, param, EV_ADD));
503 }
504 
505 struct mevent *
506 mevent_add_disabled(int tfd, enum ev_type type,
507 		    void (*func)(int, enum ev_type, void *), void *param)
508 {
509 
510 	return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE));
511 }
512 
513 static int
514 mevent_update(struct mevent *evp, bool enable)
515 {
516 	int newstate;
517 
518 	mevent_qlock();
519 
520 	/*
521 	 * It's not possible to enable/disable a deleted event
522 	 */
523 	assert((evp->me_state & EV_DELETE) == 0);
524 
525 	newstate = evp->me_state;
526 	if (enable) {
527 		newstate |= EV_ENABLE;
528 		newstate &= ~EV_DISABLE;
529 	} else {
530 		newstate |= EV_DISABLE;
531 		newstate &= ~EV_ENABLE;
532 	}
533 
534 	/*
535 	 * No update needed if state isn't changing
536 	 */
537 	if (evp->me_state != newstate) {
538 		evp->me_state = newstate;
539 
540 		/*
541 		 * Place the entry onto the changed list if not
542 		 * already there.
543 		 */
544 		if (evp->me_cq == 0) {
545 			evp->me_cq = 1;
546 			LIST_REMOVE(evp, me_list);
547 			LIST_INSERT_HEAD(&change_head, evp, me_list);
548 			mevent_notify();
549 		}
550 	}
551 
552 	mevent_qunlock();
553 
554 	return (0);
555 }
556 
/*
 * Enable a previously registered event.  Always returns 0.
 */
int
mevent_enable(struct mevent *evp)
{
	const bool enable = true;

	return (mevent_update(evp, enable));
}
563 
/*
 * Disable a previously registered event.  Always returns 0.
 */
int
mevent_disable(struct mevent *evp)
{
	const bool enable = false;

	return (mevent_update(evp, enable));
}
570 
571 static int
572 mevent_delete_event(struct mevent *evp, int closefd)
573 {
574 	mevent_qlock();
575 
576 	/*
577          * Place the entry onto the changed list if not already there, and
578 	 * mark as to be deleted.
579          */
580         if (evp->me_cq == 0) {
581 		evp->me_cq = 1;
582 		LIST_REMOVE(evp, me_list);
583 		LIST_INSERT_HEAD(&change_head, evp, me_list);
584 		mevent_notify();
585         }
586 	evp->me_state = EV_DELETE;
587 
588 	if (closefd)
589 		evp->me_closefd = 1;
590 
591 	mevent_qunlock();
592 
593 	return (0);
594 }
595 
/*
 * Delete an event, leaving its file descriptor open.  Always returns 0.
 */
int
mevent_delete(struct mevent *evp)
{
	const int closefd = 0;

	return (mevent_delete_event(evp, closefd));
}
602 
/*
 * Delete an event and have its file descriptor closed as part of the
 * deletion.  Always returns 0.
 */
int
mevent_delete_close(struct mevent *evp)
{
	const int closefd = 1;

	return (mevent_delete_event(evp, closefd));
}
609 
610 static void
611 mevent_set_name(void)
612 {
613 
614 	pthread_set_name_np(mevent_tid, "mevent");
615 }
616 
617 void
618 mevent_dispatch(void)
619 {
620 #ifdef __FreeBSD__
621 	struct kevent changelist[MEVENT_MAX];
622 	struct kevent eventlist[MEVENT_MAX];
623 	struct mevent *pipev;
624 	int mfd;
625 	int numev;
626 #else
627 	struct mevent *pipev;
628 	int portfd;
629 #endif
630 	int ret;
631 #ifndef WITHOUT_CAPSICUM
632 	cap_rights_t rights;
633 #endif
634 
635 	mevent_tid = pthread_self();
636 	mevent_set_name();
637 
638 #ifdef __FreeBSD__
639 	mfd = kqueue();
640 	assert(mfd > 0);
641 #else
642 	portfd = port_create();
643 	assert(portfd >= 0);
644 #endif
645 
646 #ifndef WITHOUT_CAPSICUM
647 	cap_rights_init(&rights, CAP_KQUEUE);
648 	if (caph_rights_limit(mfd, &rights) == -1)
649 		errx(EX_OSERR, "Unable to apply rights for sandbox");
650 #endif
651 
652 	/*
653 	 * Open the pipe that will be used for other threads to force
654 	 * the blocking kqueue call to exit by writing to it. Set the
655 	 * descriptor to non-blocking.
656 	 */
657 	ret = pipe(mevent_pipefd);
658 	if (ret < 0) {
659 		perror("pipe");
660 		exit(0);
661 	}
662 
663 #ifndef WITHOUT_CAPSICUM
664 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
665 	if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
666 		errx(EX_OSERR, "Unable to apply rights for sandbox");
667 	if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
668 		errx(EX_OSERR, "Unable to apply rights for sandbox");
669 #endif
670 
671 	/*
672 	 * Add internal event handler for the pipe write fd
673 	 */
674 	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
675 	assert(pipev != NULL);
676 
677 	for (;;) {
678 #ifdef __FreeBSD__
679 		/*
680 		 * Build changelist if required.
681 		 * XXX the changelist can be put into the blocking call
682 		 * to eliminate the extra syscall. Currently better for
683 		 * debug.
684 		 */
685 		numev = mevent_build(mfd, changelist);
686 		if (numev) {
687 			ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
688 			if (ret == -1) {
689 				perror("Error return from kevent change");
690 			}
691 		}
692 
693 		/*
694 		 * Block awaiting events
695 		 */
696 		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
697 		if (ret == -1 && errno != EINTR) {
698 			perror("Error return from kevent monitor");
699 		}
700 
701 		/*
702 		 * Handle reported events
703 		 */
704 		mevent_handle(eventlist, ret);
705 
706 #else /* __FreeBSD__ */
707 		port_event_t pev;
708 
709 		/* Handle any pending updates */
710 		mevent_update_pending(portfd);
711 
712 		/* Block awaiting events */
713 		ret = port_get(portfd, &pev, NULL);
714 		if (ret != 0) {
715 			if (errno != EINTR)
716 				perror("Error return from port_get");
717 			continue;
718 		}
719 
720 		/* Handle reported event */
721 		mevent_handle_pe(&pev);
722 #endif /* __FreeBSD__ */
723 	}
724 }
725