1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 /* 32 * Copyright 2018 Joyent, Inc. 33 */ 34 35 /* 36 * Micro event library for FreeBSD, designed for a single i/o thread 37 * using kqueue, and having events be persistent by default. 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include <assert.h> 44 #ifndef WITHOUT_CAPSICUM 45 #include <capsicum_helpers.h> 46 #endif 47 #include <err.h> 48 #include <errno.h> 49 #include <stdlib.h> 50 #include <stdio.h> 51 #include <string.h> 52 #include <sysexits.h> 53 #include <unistd.h> 54 55 #include <sys/types.h> 56 #ifndef WITHOUT_CAPSICUM 57 #include <sys/capsicum.h> 58 #endif 59 #ifdef __FreeBSD__ 60 #include <sys/event.h> 61 #else 62 #include <port.h> 63 #include <sys/poll.h> 64 #include <sys/siginfo.h> 65 #include <sys/queue.h> 66 #endif 67 #include <sys/time.h> 68 69 #include <pthread.h> 70 #include <pthread_np.h> 71 72 #include "mevent.h" 73 74 #define MEVENT_MAX 64 75 76 #define MEV_ADD 1 77 #define MEV_ENABLE 2 78 #define MEV_DISABLE 3 79 #define MEV_DEL_PENDING 4 80 81 extern char *vmname; 82 83 static pthread_t mevent_tid; 84 static int mevent_timid = 43; 85 static int mevent_pipefd[2]; 86 static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; 87 88 struct mevent { 89 void (*me_func)(int, enum ev_type, void *); 90 #define me_msecs me_fd 91 int me_fd; 92 #ifdef __FreeBSD__ 93 int me_timid; 94 #else 95 timer_t me_timid; 96 #endif 97 enum ev_type me_type; 98 void *me_param; 99 int me_cq; 100 int me_state; 101 int me_closefd; 102 #ifndef __FreeBSD__ 103 port_notify_t me_notify; 104 struct sigevent me_sigev; 105 boolean_t me_auto_requeue; 106 #endif 107 LIST_ENTRY(mevent) me_list; 108 }; 109 110 static LIST_HEAD(listhead, mevent) global_head, change_head; 111 112 static void 113 mevent_qlock(void) 114 { 115 pthread_mutex_lock(&mevent_lmutex); 116 } 117 118 static void 119 mevent_qunlock(void) 120 { 121 pthread_mutex_unlock(&mevent_lmutex); 122 } 123 124 static void 125 mevent_pipe_read(int fd, enum ev_type type, void *param) 126 { 127 char buf[MEVENT_MAX]; 128 int status; 129 130 /* 131 * Drain the pipe read side. The fd is non-blocking so this is 132 * safe to do. 133 */ 134 do { 135 status = read(fd, buf, sizeof(buf)); 136 } while (status == MEVENT_MAX); 137 } 138 139 static void 140 mevent_notify(void) 141 { 142 char c; 143 144 /* 145 * If calling from outside the i/o thread, write a byte on the 146 * pipe to force the i/o thread to exit the blocking kevent call. 147 */ 148 if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { 149 write(mevent_pipefd[1], &c, 1); 150 } 151 } 152 #ifdef __FreeBSD__ 153 static int 154 mevent_kq_filter(struct mevent *mevp) 155 { 156 int retval; 157 158 retval = 0; 159 160 if (mevp->me_type == EVF_READ) 161 retval = EVFILT_READ; 162 163 if (mevp->me_type == EVF_WRITE) 164 retval = EVFILT_WRITE; 165 166 if (mevp->me_type == EVF_TIMER) 167 retval = EVFILT_TIMER; 168 169 if (mevp->me_type == EVF_SIGNAL) 170 retval = EVFILT_SIGNAL; 171 172 return (retval); 173 } 174 175 static int 176 mevent_kq_flags(struct mevent *mevp) 177 { 178 int ret; 179 180 switch (mevp->me_state) { 181 case MEV_ADD: 182 ret = EV_ADD; /* implicitly enabled */ 183 break; 184 case MEV_ENABLE: 185 ret = EV_ENABLE; 186 break; 187 case MEV_DISABLE: 188 ret = EV_DISABLE; 189 break; 190 case MEV_DEL_PENDING: 191 ret = EV_DELETE; 192 break; 193 default: 194 assert(0); 195 break; 196 } 197 198 return (ret); 199 } 200 201 static int 202 mevent_kq_fflags(struct mevent *mevp) 203 { 204 /* XXX nothing yet, perhaps EV_EOF for reads ? */ 205 return (0); 206 } 207 208 static int 209 mevent_build(int mfd, struct kevent *kev) 210 { 211 struct mevent *mevp, *tmpp; 212 int i; 213 214 i = 0; 215 216 mevent_qlock(); 217 218 LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { 219 if (mevp->me_closefd) { 220 /* 221 * A close of the file descriptor will remove the 222 * event 223 */ 224 close(mevp->me_fd); 225 } else { 226 if (mevp->me_type == EVF_TIMER) { 227 kev[i].ident = mevp->me_timid; 228 kev[i].data = mevp->me_msecs; 229 } else { 230 kev[i].ident = mevp->me_fd; 231 kev[i].data = 0; 232 } 233 kev[i].filter = mevent_kq_filter(mevp); 234 kev[i].flags = mevent_kq_flags(mevp); 235 kev[i].fflags = mevent_kq_fflags(mevp); 236 kev[i].udata = mevp; 237 i++; 238 } 239 240 mevp->me_cq = 0; 241 LIST_REMOVE(mevp, me_list); 242 243 if (mevp->me_state == MEV_DEL_PENDING) { 244 free(mevp); 245 } else { 246 LIST_INSERT_HEAD(&global_head, mevp, me_list); 247 } 248 249 assert(i < MEVENT_MAX); 250 } 251 252 mevent_qunlock(); 253 254 return (i); 255 } 256 257 static void 258 mevent_handle(struct kevent *kev, int numev) 259 { 260 struct mevent *mevp; 261 int i; 262 263 for (i = 0; i < numev; i++) { 264 mevp = kev[i].udata; 265 266 /* XXX check for EV_ERROR ? */ 267 268 (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); 269 } 270 } 271 272 #else /* __FreeBSD__ */ 273 274 static void 275 mevent_update_one(struct mevent *mevp) 276 { 277 int portfd = mevp->me_notify.portnfy_port; 278 279 switch (mevp->me_type) { 280 case EVF_READ: 281 case EVF_WRITE: 282 mevp->me_auto_requeue = B_FALSE; 283 284 switch (mevp->me_state) { 285 case MEV_ADD: 286 case MEV_ENABLE: 287 { 288 int events; 289 290 events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT; 291 292 if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd, 293 events, mevp) != 0) { 294 (void) fprintf(stderr, 295 "port_associate fd %d %p failed: %s\n", 296 mevp->me_fd, mevp, strerror(errno)); 297 } 298 return; 299 } 300 case MEV_DISABLE: 301 case MEV_DEL_PENDING: 302 /* 303 * A disable that comes in while an event is being 304 * handled will result in an ENOENT. 305 */ 306 if (port_dissociate(portfd, PORT_SOURCE_FD, 307 mevp->me_fd) != 0 && errno != ENOENT) { 308 (void) fprintf(stderr, "port_dissociate " 309 "portfd %d fd %d mevp %p failed: %s\n", 310 portfd, mevp->me_fd, mevp, strerror(errno)); 311 } 312 return; 313 default: 314 goto abort; 315 } 316 317 case EVF_TIMER: 318 mevp->me_auto_requeue = B_TRUE; 319 320 switch (mevp->me_state) { 321 case MEV_ADD: 322 case MEV_ENABLE: 323 { 324 struct itimerspec it = { 0 }; 325 326 mevp->me_sigev.sigev_notify = SIGEV_PORT; 327 mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify; 328 329 if (timer_create(CLOCK_REALTIME, &mevp->me_sigev, 330 &mevp->me_timid) != 0) { 331 (void) fprintf(stderr, 332 "timer_create failed: %s", strerror(errno)); 333 return; 334 } 335 336 /* The first timeout */ 337 it.it_value.tv_sec = mevp->me_msecs / MILLISEC; 338 it.it_value.tv_nsec = 339 MSEC2NSEC(mevp->me_msecs % MILLISEC); 340 /* Repeat at the same interval */ 341 it.it_interval = it.it_value; 342 343 if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) { 344 (void) fprintf(stderr, "timer_settime failed: " 345 "%s", strerror(errno)); 346 } 347 return; 348 } 349 case MEV_DISABLE: 350 case MEV_DEL_PENDING: 351 if (timer_delete(mevp->me_timid) != 0) { 352 (void) fprintf(stderr, "timer_delete failed: " 353 "%s", strerror(errno)); 354 } 355 return; 356 default: 357 goto abort; 358 } 359 default: 360 /* EVF_SIGNAL not yet implemented. */ 361 goto abort; 362 } 363 364 abort: 365 (void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__, 366 mevp->me_type, mevp->me_state); 367 abort(); 368 } 369 370 static void 371 mevent_update_pending(int portfd) 372 { 373 struct mevent *mevp, *tmpp; 374 375 mevent_qlock(); 376 377 LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { 378 mevp->me_notify.portnfy_port = portfd; 379 mevp->me_notify.portnfy_user = mevp; 380 if (mevp->me_closefd) { 381 /* 382 * A close of the file descriptor will remove the 383 * event 384 */ 385 (void) close(mevp->me_fd); 386 mevp->me_fd = -1; 387 } else { 388 mevent_update_one(mevp); 389 } 390 391 mevp->me_cq = 0; 392 LIST_REMOVE(mevp, me_list); 393 394 if (mevp->me_state == MEV_DEL_PENDING) { 395 free(mevp); 396 } else { 397 LIST_INSERT_HEAD(&global_head, mevp, me_list); 398 } 399 } 400 401 mevent_qunlock(); 402 } 403 404 static void 405 mevent_handle_pe(port_event_t *pe) 406 { 407 struct mevent *mevp = pe->portev_user; 408 409 mevent_qunlock(); 410 411 (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); 412 413 mevent_qlock(); 414 if (!mevp->me_cq && !mevp->me_auto_requeue) { 415 mevent_update_one(mevp); 416 } 417 mevent_qunlock(); 418 } 419 #endif 420 421 struct mevent * 422 mevent_add(int tfd, enum ev_type type, 423 void (*func)(int, enum ev_type, void *), void *param) 424 { 425 struct mevent *lp, *mevp; 426 427 if (tfd < 0 || func == NULL) { 428 return (NULL); 429 } 430 431 mevp = NULL; 432 433 mevent_qlock(); 434 435 /* 436 * Verify that the fd/type tuple is not present in any list 437 */ 438 LIST_FOREACH(lp, &global_head, me_list) { 439 if (type != EVF_TIMER && lp->me_fd == tfd && 440 lp->me_type == type) { 441 goto exit; 442 } 443 } 444 445 LIST_FOREACH(lp, &change_head, me_list) { 446 if (type != EVF_TIMER && lp->me_fd == tfd && 447 lp->me_type == type) { 448 goto exit; 449 } 450 } 451 452 /* 453 * Allocate an entry, populate it, and add it to the change list. 454 */ 455 mevp = calloc(1, sizeof(struct mevent)); 456 if (mevp == NULL) { 457 goto exit; 458 } 459 460 if (type == EVF_TIMER) { 461 mevp->me_msecs = tfd; 462 mevp->me_timid = mevent_timid++; 463 } else 464 mevp->me_fd = tfd; 465 mevp->me_type = type; 466 mevp->me_func = func; 467 mevp->me_param = param; 468 469 LIST_INSERT_HEAD(&change_head, mevp, me_list); 470 mevp->me_cq = 1; 471 mevp->me_state = MEV_ADD; 472 mevent_notify(); 473 474 exit: 475 mevent_qunlock(); 476 477 return (mevp); 478 } 479 480 static int 481 mevent_update(struct mevent *evp, int newstate) 482 { 483 /* 484 * It's not possible to enable/disable a deleted event 485 */ 486 if (evp->me_state == MEV_DEL_PENDING) 487 return (EINVAL); 488 489 /* 490 * No update needed if state isn't changing 491 */ 492 if (evp->me_state == newstate) 493 return (0); 494 495 mevent_qlock(); 496 497 evp->me_state = newstate; 498 499 /* 500 * Place the entry onto the changed list if not already there. 501 */ 502 if (evp->me_cq == 0) { 503 evp->me_cq = 1; 504 LIST_REMOVE(evp, me_list); 505 LIST_INSERT_HEAD(&change_head, evp, me_list); 506 mevent_notify(); 507 } 508 509 mevent_qunlock(); 510 511 return (0); 512 } 513 514 int 515 mevent_enable(struct mevent *evp) 516 { 517 518 return (mevent_update(evp, MEV_ENABLE)); 519 } 520 521 int 522 mevent_disable(struct mevent *evp) 523 { 524 525 return (mevent_update(evp, MEV_DISABLE)); 526 } 527 528 static int 529 mevent_delete_event(struct mevent *evp, int closefd) 530 { 531 mevent_qlock(); 532 533 /* 534 * Place the entry onto the changed list if not already there, and 535 * mark as to be deleted. 536 */ 537 if (evp->me_cq == 0) { 538 evp->me_cq = 1; 539 LIST_REMOVE(evp, me_list); 540 LIST_INSERT_HEAD(&change_head, evp, me_list); 541 mevent_notify(); 542 } 543 evp->me_state = MEV_DEL_PENDING; 544 545 if (closefd) 546 evp->me_closefd = 1; 547 548 mevent_qunlock(); 549 550 return (0); 551 } 552 553 int 554 mevent_delete(struct mevent *evp) 555 { 556 557 return (mevent_delete_event(evp, 0)); 558 } 559 560 int 561 mevent_delete_close(struct mevent *evp) 562 { 563 564 return (mevent_delete_event(evp, 1)); 565 } 566 567 static void 568 mevent_set_name(void) 569 { 570 571 pthread_set_name_np(mevent_tid, "mevent"); 572 } 573 574 void 575 mevent_dispatch(void) 576 { 577 #ifdef __FreeBSD__ 578 struct kevent changelist[MEVENT_MAX]; 579 struct kevent eventlist[MEVENT_MAX]; 580 struct mevent *pipev; 581 int mfd; 582 int numev; 583 #else 584 struct mevent *pipev; 585 int portfd; 586 #endif 587 int ret; 588 #ifndef WITHOUT_CAPSICUM 589 cap_rights_t rights; 590 #endif 591 592 mevent_tid = pthread_self(); 593 mevent_set_name(); 594 595 #ifdef __FreeBSD__ 596 mfd = kqueue(); 597 assert(mfd > 0); 598 #else 599 portfd = port_create(); 600 assert(portfd >= 0); 601 #endif 602 603 #ifndef WITHOUT_CAPSICUM 604 cap_rights_init(&rights, CAP_KQUEUE); 605 if (caph_rights_limit(mfd, &rights) == -1) 606 errx(EX_OSERR, "Unable to apply rights for sandbox"); 607 #endif 608 609 /* 610 * Open the pipe that will be used for other threads to force 611 * the blocking kqueue call to exit by writing to it. Set the 612 * descriptor to non-blocking. 613 */ 614 ret = pipe(mevent_pipefd); 615 if (ret < 0) { 616 perror("pipe"); 617 exit(0); 618 } 619 620 #ifndef WITHOUT_CAPSICUM 621 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); 622 if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) 623 errx(EX_OSERR, "Unable to apply rights for sandbox"); 624 if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) 625 errx(EX_OSERR, "Unable to apply rights for sandbox"); 626 #endif 627 628 /* 629 * Add internal event handler for the pipe write fd 630 */ 631 pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); 632 assert(pipev != NULL); 633 634 for (;;) { 635 #ifdef __FreeBSD__ 636 /* 637 * Build changelist if required. 638 * XXX the changelist can be put into the blocking call 639 * to eliminate the extra syscall. Currently better for 640 * debug. 641 */ 642 numev = mevent_build(mfd, changelist); 643 if (numev) { 644 ret = kevent(mfd, changelist, numev, NULL, 0, NULL); 645 if (ret == -1) { 646 perror("Error return from kevent change"); 647 } 648 } 649 650 /* 651 * Block awaiting events 652 */ 653 ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); 654 if (ret == -1 && errno != EINTR) { 655 perror("Error return from kevent monitor"); 656 } 657 658 /* 659 * Handle reported events 660 */ 661 mevent_handle(eventlist, ret); 662 663 #else /* __FreeBSD__ */ 664 port_event_t pev; 665 666 /* Handle any pending updates */ 667 mevent_update_pending(portfd); 668 669 /* Block awaiting events */ 670 ret = port_get(portfd, &pev, NULL); 671 if (ret != 0 && errno != EINTR) { 672 perror("Error return from port_get"); 673 continue; 674 } 675 676 /* Handle reported event */ 677 mevent_handle_pe(&pev); 678 #endif /* __FreeBSD__ */ 679 } 680 } 681