1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 /* 32 * Copyright 2018 Joyent, Inc. 33 */ 34 35 /* 36 * Micro event library for FreeBSD, designed for a single i/o thread 37 * using kqueue, and having events be persistent by default. 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include <assert.h> 44 #ifndef WITHOUT_CAPSICUM 45 #include <capsicum_helpers.h> 46 #endif 47 #include <err.h> 48 #include <errno.h> 49 #include <stdbool.h> 50 #include <stdlib.h> 51 #include <stdio.h> 52 #include <string.h> 53 #include <sysexits.h> 54 #include <unistd.h> 55 56 #include <sys/types.h> 57 #ifndef WITHOUT_CAPSICUM 58 #include <sys/capsicum.h> 59 #endif 60 #ifdef __FreeBSD__ 61 #include <sys/event.h> 62 #else 63 #include <port.h> 64 #include <sys/poll.h> 65 #include <sys/siginfo.h> 66 #include <sys/queue.h> 67 #include <sys/debug.h> 68 #endif 69 #include <sys/time.h> 70 71 #include <pthread.h> 72 #include <pthread_np.h> 73 74 #include "mevent.h" 75 76 #define MEVENT_MAX 64 77 78 #ifndef __FreeBSD__ 79 #define EV_ENABLE 0x01 80 #define EV_ADD EV_ENABLE 81 #define EV_DISABLE 0x02 82 #define EV_DELETE 0x04 83 #endif 84 85 static pthread_t mevent_tid; 86 static int mevent_timid = 43; 87 static int mevent_pipefd[2]; 88 static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; 89 90 struct mevent { 91 void (*me_func)(int, enum ev_type, void *); 92 #define me_msecs me_fd 93 int me_fd; 94 #ifdef __FreeBSD__ 95 int me_timid; 96 #else 97 timer_t me_timid; 98 #endif 99 enum ev_type me_type; 100 void *me_param; 101 int me_cq; 102 int me_state; /* Desired kevent flags. */ 103 int me_closefd; 104 #ifndef __FreeBSD__ 105 port_notify_t me_notify; 106 struct sigevent me_sigev; 107 boolean_t me_auto_requeue; 108 #endif 109 LIST_ENTRY(mevent) me_list; 110 }; 111 112 static LIST_HEAD(listhead, mevent) global_head, change_head; 113 114 static void 115 mevent_qlock(void) 116 { 117 pthread_mutex_lock(&mevent_lmutex); 118 } 119 120 static void 121 mevent_qunlock(void) 122 { 123 pthread_mutex_unlock(&mevent_lmutex); 124 } 125 126 static void 127 mevent_pipe_read(int fd, enum ev_type type, void *param) 128 { 129 char buf[MEVENT_MAX]; 130 int status; 131 132 /* 133 * Drain the pipe read side. The fd is non-blocking so this is 134 * safe to do. 135 */ 136 do { 137 status = read(fd, buf, sizeof(buf)); 138 } while (status == MEVENT_MAX); 139 } 140 141 static void 142 mevent_notify(void) 143 { 144 char c = '\0'; 145 146 /* 147 * If calling from outside the i/o thread, write a byte on the 148 * pipe to force the i/o thread to exit the blocking kevent call. 149 */ 150 if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { 151 write(mevent_pipefd[1], &c, 1); 152 } 153 } 154 #ifdef __FreeBSD__ 155 static int 156 mevent_kq_filter(struct mevent *mevp) 157 { 158 int retval; 159 160 retval = 0; 161 162 if (mevp->me_type == EVF_READ) 163 retval = EVFILT_READ; 164 165 if (mevp->me_type == EVF_WRITE) 166 retval = EVFILT_WRITE; 167 168 if (mevp->me_type == EVF_TIMER) 169 retval = EVFILT_TIMER; 170 171 if (mevp->me_type == EVF_SIGNAL) 172 retval = EVFILT_SIGNAL; 173 174 return (retval); 175 } 176 177 static int 178 mevent_kq_flags(struct mevent *mevp) 179 { 180 return (mevp->me_state); 181 } 182 183 static int 184 mevent_kq_fflags(struct mevent *mevp) 185 { 186 /* XXX nothing yet, perhaps EV_EOF for reads ? */ 187 return (0); 188 } 189 190 static int 191 mevent_build(int mfd, struct kevent *kev) 192 { 193 struct mevent *mevp, *tmpp; 194 int i; 195 196 i = 0; 197 198 mevent_qlock(); 199 200 LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { 201 if (mevp->me_closefd) { 202 /* 203 * A close of the file descriptor will remove the 204 * event 205 */ 206 close(mevp->me_fd); 207 } else { 208 if (mevp->me_type == EVF_TIMER) { 209 kev[i].ident = mevp->me_timid; 210 kev[i].data = mevp->me_msecs; 211 } else { 212 kev[i].ident = mevp->me_fd; 213 kev[i].data = 0; 214 } 215 kev[i].filter = mevent_kq_filter(mevp); 216 kev[i].flags = mevent_kq_flags(mevp); 217 kev[i].fflags = mevent_kq_fflags(mevp); 218 kev[i].udata = mevp; 219 i++; 220 } 221 222 mevp->me_cq = 0; 223 LIST_REMOVE(mevp, me_list); 224 225 if (mevp->me_state & EV_DELETE) { 226 free(mevp); 227 } else { 228 /* 229 * We need to add the event only once, so we can 230 * reset the EV_ADD bit after it has been propagated 231 * to the kevent() arguments the first time. 232 */ 233 mevp->me_state &= ~EV_ADD; 234 LIST_INSERT_HEAD(&global_head, mevp, me_list); 235 } 236 237 assert(i < MEVENT_MAX); 238 } 239 240 mevent_qunlock(); 241 242 return (i); 243 } 244 245 static void 246 mevent_handle(struct kevent *kev, int numev) 247 { 248 struct mevent *mevp; 249 int i; 250 251 for (i = 0; i < numev; i++) { 252 mevp = kev[i].udata; 253 254 /* XXX check for EV_ERROR ? */ 255 256 (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); 257 } 258 } 259 260 #else /* __FreeBSD__ */ 261 262 static boolean_t 263 mevent_clarify_state(struct mevent *mevp) 264 { 265 const int state = mevp->me_state; 266 267 if ((state & EV_DELETE) != 0) { 268 /* All other intents are overriden by delete. */ 269 mevp->me_state = EV_DELETE; 270 return (B_TRUE); 271 } 272 273 /* 274 * Without a distinction between EV_ADD and EV_ENABLE in our emulation, 275 * handling the add-disabled case means eliding the portfs operation 276 * when both flags are present. 277 * 278 * This is not a concern for subsequent enable/disable operations, as 279 * mevent_update() toggles the flags properly so they are not left in 280 * conflict. 281 */ 282 if (state == (EV_ENABLE|EV_DISABLE)) { 283 mevp->me_state = EV_DISABLE; 284 return (B_FALSE); 285 } 286 287 return (B_TRUE); 288 } 289 290 static void 291 mevent_update_one(struct mevent *mevp) 292 { 293 int portfd = mevp->me_notify.portnfy_port; 294 295 switch (mevp->me_type) { 296 case EVF_READ: 297 case EVF_WRITE: 298 mevp->me_auto_requeue = B_FALSE; 299 300 switch (mevp->me_state) { 301 case EV_ENABLE: 302 { 303 int events; 304 305 events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT; 306 307 if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd, 308 events, mevp) != 0) { 309 (void) fprintf(stderr, 310 "port_associate fd %d %p failed: %s\n", 311 mevp->me_fd, mevp, strerror(errno)); 312 } 313 return; 314 } 315 case EV_DISABLE: 316 case EV_DELETE: 317 /* 318 * A disable that comes in while an event is being 319 * handled will result in an ENOENT. 320 */ 321 if (port_dissociate(portfd, PORT_SOURCE_FD, 322 mevp->me_fd) != 0 && errno != ENOENT) { 323 (void) fprintf(stderr, "port_dissociate " 324 "portfd %d fd %d mevp %p failed: %s\n", 325 portfd, mevp->me_fd, mevp, strerror(errno)); 326 } 327 return; 328 default: 329 goto abort; 330 } 331 332 case EVF_TIMER: 333 mevp->me_auto_requeue = B_TRUE; 334 335 switch (mevp->me_state) { 336 case EV_ENABLE: 337 { 338 struct itimerspec it = { 0 }; 339 340 mevp->me_sigev.sigev_notify = SIGEV_PORT; 341 mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify; 342 343 if (timer_create(CLOCK_REALTIME, &mevp->me_sigev, 344 &mevp->me_timid) != 0) { 345 (void) fprintf(stderr, 346 "timer_create failed: %s", strerror(errno)); 347 return; 348 } 349 350 /* The first timeout */ 351 it.it_value.tv_sec = mevp->me_msecs / MILLISEC; 352 it.it_value.tv_nsec = 353 MSEC2NSEC(mevp->me_msecs % MILLISEC); 354 /* Repeat at the same interval */ 355 it.it_interval = it.it_value; 356 357 if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) { 358 (void) fprintf(stderr, "timer_settime failed: " 359 "%s", strerror(errno)); 360 } 361 return; 362 } 363 case EV_DISABLE: 364 case EV_DELETE: 365 if (timer_delete(mevp->me_timid) != 0) { 366 (void) fprintf(stderr, "timer_delete failed: " 367 "%s", strerror(errno)); 368 } 369 return; 370 default: 371 goto abort; 372 } 373 default: 374 /* EVF_SIGNAL not yet implemented. */ 375 goto abort; 376 } 377 378 abort: 379 (void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__, 380 mevp->me_type, mevp->me_state); 381 abort(); 382 } 383 384 static void 385 mevent_update_pending(int portfd) 386 { 387 struct mevent *mevp, *tmpp; 388 389 mevent_qlock(); 390 391 LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { 392 mevp->me_notify.portnfy_port = portfd; 393 mevp->me_notify.portnfy_user = mevp; 394 if (mevp->me_closefd) { 395 /* 396 * A close of the file descriptor will remove the 397 * event 398 */ 399 (void) close(mevp->me_fd); 400 mevp->me_fd = -1; 401 } else { 402 if (mevent_clarify_state(mevp)) { 403 mevent_update_one(mevp); 404 } 405 } 406 407 mevp->me_cq = 0; 408 LIST_REMOVE(mevp, me_list); 409 410 if (mevp->me_state & EV_DELETE) { 411 free(mevp); 412 } else { 413 LIST_INSERT_HEAD(&global_head, mevp, me_list); 414 } 415 } 416 417 mevent_qunlock(); 418 } 419 420 static void 421 mevent_handle_pe(port_event_t *pe) 422 { 423 struct mevent *mevp = pe->portev_user; 424 425 mevent_qunlock(); 426 427 (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); 428 429 mevent_qlock(); 430 if (!mevp->me_cq && !mevp->me_auto_requeue) { 431 mevent_update_one(mevp); 432 } 433 mevent_qunlock(); 434 } 435 #endif 436 437 static struct mevent * 438 mevent_add_state(int tfd, enum ev_type type, 439 void (*func)(int, enum ev_type, void *), void *param, 440 int state) 441 { 442 struct mevent *lp, *mevp; 443 444 if (tfd < 0 || func == NULL) { 445 return (NULL); 446 } 447 448 mevp = NULL; 449 450 mevent_qlock(); 451 452 /* 453 * Verify that the fd/type tuple is not present in any list 454 */ 455 LIST_FOREACH(lp, &global_head, me_list) { 456 if (type != EVF_TIMER && lp->me_fd == tfd && 457 lp->me_type == type) { 458 goto exit; 459 } 460 } 461 462 LIST_FOREACH(lp, &change_head, me_list) { 463 if (type != EVF_TIMER && lp->me_fd == tfd && 464 lp->me_type == type) { 465 goto exit; 466 } 467 } 468 469 /* 470 * Allocate an entry, populate it, and add it to the change list. 471 */ 472 mevp = calloc(1, sizeof(struct mevent)); 473 if (mevp == NULL) { 474 goto exit; 475 } 476 477 if (type == EVF_TIMER) { 478 mevp->me_msecs = tfd; 479 mevp->me_timid = mevent_timid++; 480 } else 481 mevp->me_fd = tfd; 482 mevp->me_type = type; 483 mevp->me_func = func; 484 mevp->me_param = param; 485 486 LIST_INSERT_HEAD(&change_head, mevp, me_list); 487 mevp->me_cq = 1; 488 mevp->me_state = state; 489 mevent_notify(); 490 491 exit: 492 mevent_qunlock(); 493 494 return (mevp); 495 } 496 497 struct mevent * 498 mevent_add(int tfd, enum ev_type type, 499 void (*func)(int, enum ev_type, void *), void *param) 500 { 501 502 return (mevent_add_state(tfd, type, func, param, EV_ADD)); 503 } 504 505 struct mevent * 506 mevent_add_disabled(int tfd, enum ev_type type, 507 void (*func)(int, enum ev_type, void *), void *param) 508 { 509 510 return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE)); 511 } 512 513 static int 514 mevent_update(struct mevent *evp, bool enable) 515 { 516 int newstate; 517 518 mevent_qlock(); 519 520 /* 521 * It's not possible to enable/disable a deleted event 522 */ 523 assert((evp->me_state & EV_DELETE) == 0); 524 525 newstate = evp->me_state; 526 if (enable) { 527 newstate |= EV_ENABLE; 528 newstate &= ~EV_DISABLE; 529 } else { 530 newstate |= EV_DISABLE; 531 newstate &= ~EV_ENABLE; 532 } 533 534 /* 535 * No update needed if state isn't changing 536 */ 537 if (evp->me_state != newstate) { 538 evp->me_state = newstate; 539 540 /* 541 * Place the entry onto the changed list if not 542 * already there. 543 */ 544 if (evp->me_cq == 0) { 545 evp->me_cq = 1; 546 LIST_REMOVE(evp, me_list); 547 LIST_INSERT_HEAD(&change_head, evp, me_list); 548 mevent_notify(); 549 } 550 } 551 552 mevent_qunlock(); 553 554 return (0); 555 } 556 557 int 558 mevent_enable(struct mevent *evp) 559 { 560 561 return (mevent_update(evp, true)); 562 } 563 564 int 565 mevent_disable(struct mevent *evp) 566 { 567 568 return (mevent_update(evp, false)); 569 } 570 571 static int 572 mevent_delete_event(struct mevent *evp, int closefd) 573 { 574 mevent_qlock(); 575 576 /* 577 * Place the entry onto the changed list if not already there, and 578 * mark as to be deleted. 579 */ 580 if (evp->me_cq == 0) { 581 evp->me_cq = 1; 582 LIST_REMOVE(evp, me_list); 583 LIST_INSERT_HEAD(&change_head, evp, me_list); 584 mevent_notify(); 585 } 586 evp->me_state = EV_DELETE; 587 588 if (closefd) 589 evp->me_closefd = 1; 590 591 mevent_qunlock(); 592 593 return (0); 594 } 595 596 int 597 mevent_delete(struct mevent *evp) 598 { 599 600 return (mevent_delete_event(evp, 0)); 601 } 602 603 int 604 mevent_delete_close(struct mevent *evp) 605 { 606 607 return (mevent_delete_event(evp, 1)); 608 } 609 610 static void 611 mevent_set_name(void) 612 { 613 614 pthread_set_name_np(mevent_tid, "mevent"); 615 } 616 617 void 618 mevent_dispatch(void) 619 { 620 #ifdef __FreeBSD__ 621 struct kevent changelist[MEVENT_MAX]; 622 struct kevent eventlist[MEVENT_MAX]; 623 struct mevent *pipev; 624 int mfd; 625 int numev; 626 #else 627 struct mevent *pipev; 628 int portfd; 629 #endif 630 int ret; 631 #ifndef WITHOUT_CAPSICUM 632 cap_rights_t rights; 633 #endif 634 635 mevent_tid = pthread_self(); 636 mevent_set_name(); 637 638 #ifdef __FreeBSD__ 639 mfd = kqueue(); 640 assert(mfd > 0); 641 #else 642 portfd = port_create(); 643 assert(portfd >= 0); 644 #endif 645 646 #ifndef WITHOUT_CAPSICUM 647 cap_rights_init(&rights, CAP_KQUEUE); 648 if (caph_rights_limit(mfd, &rights) == -1) 649 errx(EX_OSERR, "Unable to apply rights for sandbox"); 650 #endif 651 652 /* 653 * Open the pipe that will be used for other threads to force 654 * the blocking kqueue call to exit by writing to it. Set the 655 * descriptor to non-blocking. 656 */ 657 ret = pipe(mevent_pipefd); 658 if (ret < 0) { 659 perror("pipe"); 660 exit(0); 661 } 662 663 #ifndef WITHOUT_CAPSICUM 664 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); 665 if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) 666 errx(EX_OSERR, "Unable to apply rights for sandbox"); 667 if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) 668 errx(EX_OSERR, "Unable to apply rights for sandbox"); 669 #endif 670 671 /* 672 * Add internal event handler for the pipe write fd 673 */ 674 pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); 675 assert(pipev != NULL); 676 677 for (;;) { 678 #ifdef __FreeBSD__ 679 /* 680 * Build changelist if required. 681 * XXX the changelist can be put into the blocking call 682 * to eliminate the extra syscall. Currently better for 683 * debug. 684 */ 685 numev = mevent_build(mfd, changelist); 686 if (numev) { 687 ret = kevent(mfd, changelist, numev, NULL, 0, NULL); 688 if (ret == -1) { 689 perror("Error return from kevent change"); 690 } 691 } 692 693 /* 694 * Block awaiting events 695 */ 696 ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); 697 if (ret == -1 && errno != EINTR) { 698 perror("Error return from kevent monitor"); 699 } 700 701 /* 702 * Handle reported events 703 */ 704 mevent_handle(eventlist, ret); 705 706 #else /* __FreeBSD__ */ 707 port_event_t pev; 708 709 /* Handle any pending updates */ 710 mevent_update_pending(portfd); 711 712 /* Block awaiting events */ 713 ret = port_get(portfd, &pev, NULL); 714 if (ret != 0) { 715 if (errno != EINTR) 716 perror("Error return from port_get"); 717 continue; 718 } 719 720 /* Handle reported event */ 721 mevent_handle_pe(&pev); 722 #endif /* __FreeBSD__ */ 723 } 724 } 725