/*	$OpenBSD: kern_event.c,v 1.163 2021/04/22 15:30:12 visa Exp $	*/

/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_event.c,v 1.22 2001/02/23 20:32:42 jlemon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/pledge.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/syscallargs.h>
#include <sys/time.h>
#include <sys/timeout.h>
#include <sys/wait.h>

#ifdef DIAGNOSTIC
#define KLIST_ASSERT_LOCKED(kl) do {					\
	if ((kl)->kl_ops != NULL)					\
		(kl)->kl_ops->klo_assertlk((kl)->kl_arg);		\
	else								\
		KERNEL_ASSERT_LOCKED();					\
} while (0)
#else
#define KLIST_ASSERT_LOCKED(kl)	((void)(kl))
#endif

struct	kqueue *kqueue_alloc(struct filedesc *);
void	kqueue_terminate(struct proc *p, struct kqueue *);
void	kqueue_init(void);
void	KQREF(struct kqueue *);
void	KQRELE(struct kqueue *);

int	kqueue_sleep(struct kqueue *, struct timespec *);

int	kqueue_read(struct file *, struct uio *, int);
int	kqueue_write(struct file *, struct uio *, int);
int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
	    struct proc *p);
int	kqueue_poll(struct file *fp, int events, struct proc *p);
int	kqueue_kqfilter(struct file *fp, struct knote *kn);
int	kqueue_stat(struct file *fp, struct stat *st, struct proc *p);
int	kqueue_close(struct file *fp, struct proc *p);
void	kqueue_wakeup(struct kqueue *kq);

#ifdef KQUEUE_DEBUG
void	kqueue_do_check(struct kqueue *kq, const char *func, int line);
#define kqueue_check(kq)	kqueue_do_check((kq), __func__, __LINE__)
#else
#define kqueue_check(kq)	do {} while (0)
#endif

void	kqpoll_dequeue(struct proc *p);

static int	filter_attach(struct knote *kn);
static void	filter_detach(struct knote *kn);
static int	filter_event(struct knote *kn, long hint);
static int	filter_modify(struct kevent *kev, struct knote *kn);
static int	filter_process(struct knote *kn, struct kevent *kev);
static void	kqueue_expand_hash(struct kqueue *kq);
static void	kqueue_expand_list(struct kqueue *kq, int fd);
static void	kqueue_task(void *);
static int	klist_lock(struct klist *);
static void	klist_unlock(struct klist *, int);

const struct fileops kqueueops = {
	.fo_read	= kqueue_read,
	.fo_write	= kqueue_write,
	.fo_ioctl	= kqueue_ioctl,
	.fo_poll	= kqueue_poll,
	.fo_kqfilter	= kqueue_kqfilter,
	.fo_stat	= kqueue_stat,
	.fo_close	= kqueue_close
};

void	knote_attach(struct knote *kn);
void	knote_detach(struct knote *kn);
void	knote_drop(struct knote *kn, struct proc *p);
void	knote_enqueue(struct knote *kn);
void	knote_dequeue(struct knote *kn);
int	knote_acquire(struct knote *kn, struct klist *, int);
void	knote_release(struct knote *kn);
void	knote_activate(struct knote *kn);
void	knote_remove(struct proc *p, struct knlist *list, int purge);

void	filt_kqdetach(struct knote *kn);
int	filt_kqueue(struct knote *kn, long hint);
int	filt_procattach(struct knote *kn);
void	filt_procdetach(struct knote *kn);
int	filt_proc(struct knote *kn, long hint);
int	filt_fileattach(struct knote *kn);
void	filt_timerexpire(void *knx);
int	filt_timerattach(struct knote *kn);
void	filt_timerdetach(struct knote *kn);
int	filt_timermodify(struct kevent *kev, struct knote *kn);
int	filt_timerprocess(struct knote *kn, struct kevent *kev);
void	filt_seltruedetach(struct knote *kn);

const struct filterops kqread_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_kqdetach,
	.f_event	= filt_kqueue,
};

const struct filterops proc_filtops = {
	.f_flags	= 0,
	.f_attach	= filt_procattach,
	.f_detach	= filt_procdetach,
	.f_event	= filt_proc,
};

const struct filterops file_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= filt_fileattach,
	.f_detach	= NULL,
	.f_event	= NULL,
};

const struct filterops timer_filtops = {
	.f_flags	= 0,
	.f_attach	= filt_timerattach,
	.f_detach	= filt_timerdetach,
	.f_event	= NULL,
	.f_modify	= filt_timermodify,
	.f_process	= filt_timerprocess,
};

struct	pool knote_pool;
struct	pool kqueue_pool;
int kq_ntimeouts = 0;
int kq_timeoutmax = (4 * 1024);

#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

/*
 * Table for all system-defined filters.
 */
const struct filterops *const sysfilt_ops[] = {
	&file_filtops,			/* EVFILT_READ */
	&file_filtops,			/* EVFILT_WRITE */
	NULL, /*&aio_filtops,*/		/* EVFILT_AIO */
	&file_filtops,			/* EVFILT_VNODE */
	&proc_filtops,			/* EVFILT_PROC */
	&sig_filtops,			/* EVFILT_SIGNAL */
	&timer_filtops,			/* EVFILT_TIMER */
	&file_filtops,			/* EVFILT_DEVICE */
	&file_filtops,			/* EVFILT_EXCEPT */
};

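/*
 * Filter numbers coming from userland are negative (EVFILT_READ is -1,
 * EVFILT_WRITE is -2, and so on).  kqueue_register() turns them into
 * zero-based indices for the table above with the bitwise complement,
 * so that, for example,
 *
 *	sysfilt_ops[~EVFILT_READ]
 *
 * selects entry 0, &file_filtops.
 */
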
void
KQREF(struct kqueue *kq)
{
	atomic_inc_int(&kq->kq_refs);
}

void
KQRELE(struct kqueue *kq)
{
	struct filedesc *fdp;

	if (atomic_dec_int_nv(&kq->kq_refs) > 0)
		return;

	fdp = kq->kq_fdp;
	if (rw_status(&fdp->fd_lock) == RW_WRITE) {
		LIST_REMOVE(kq, kq_next);
	} else {
		fdplock(fdp);
		LIST_REMOVE(kq, kq_next);
		fdpunlock(fdp);
	}

	KASSERT(TAILQ_EMPTY(&kq->kq_head));

	free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize *
	    sizeof(struct knlist));
	hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
	pool_put(&kqueue_pool, kq);
}

void
kqueue_init(void)
{
	pool_init(&kqueue_pool, sizeof(struct kqueue), 0, IPL_MPFLOOR,
	    PR_WAITOK, "kqueuepl", NULL);
	pool_init(&knote_pool, sizeof(struct knote), 0, IPL_MPFLOOR,
	    PR_WAITOK, "knotepl", NULL);
}

int
filt_fileattach(struct knote *kn)
{
	struct file *fp = kn->kn_fp;

	return fp->f_ops->fo_kqfilter(fp, kn);
}

int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);

	kn->kn_fop = &kqread_filtops;
	klist_insert_locked(&kq->kq_sel.si_note, kn);
	return (0);
}

void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	klist_remove_locked(&kq->kq_sel.si_note, kn);
}

int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

int
filt_procattach(struct knote *kn)
{
	struct process *pr;
	int s;

	if ((curproc->p_p->ps_flags & PS_PLEDGE) &&
	    (curproc->p_p->ps_pledge & PLEDGE_PROC) == 0)
		return pledge_fail(curproc, EPERM, PLEDGE_PROC);

	if (kn->kn_id > PID_MAX)
		return ESRCH;

	pr = prfind(kn->kn_id);
	if (pr == NULL)
		return (ESRCH);

	/* exiting processes can't be specified */
	if (pr->ps_flags & PS_EXITING)
		return (ESRCH);

	kn->kn_ptr.p_process = pr;
	kn->kn_flags |= EV_CLEAR;		/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;		/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	s = splhigh();
	klist_insert_locked(&pr->ps_klist, kn);
	splx(s);

	return (0);
}

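/*
 * A sketch of how the process filter is normally set up from userland:
 *
 *	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD, NOTE_EXIT | NOTE_FORK, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * EV_FLAG1 is part of EV_SYSFLAGS and is stripped from user changelists
 * in sys_kevent(); it is only set by filt_proc() when it re-registers a
 * knote for a child process because of NOTE_TRACK.
 */
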
/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process does not exist any more.
 */
void
filt_procdetach(struct knote *kn)
{
	struct process *pr = kn->kn_ptr.p_process;
	int s;

	if (kn->kn_status & KN_DETACHED)
		return;

	s = splhigh();
	klist_remove_locked(&pr->ps_klist, kn);
	splx(s);
}

int
filt_proc(struct knote *kn, long hint)
{
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * process is gone, so flag the event as finished and remove it
	 * from the process's klist
	 */
	if (event == NOTE_EXIT) {
		struct process *pr = kn->kn_ptr.p_process;
		int s;

		s = splhigh();
		kn->kn_status |= KN_DETACHED;
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_data = W_EXITCODE(pr->ps_xexit, pr->ps_xsig);
		klist_remove_locked(&pr->ps_klist, kn);
		splx(s);
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		memset(&kev, 0, sizeof(kev));
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_udata;		/* preserve udata */
		error = kqueue_register(kn->kn_kq, &kev, NULL);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}

static void
filt_timer_timeout_add(struct knote *kn)
{
	struct timeval tv;
	struct timeout *to = kn->kn_hook;
	int tticks;

	tv.tv_sec = kn->kn_sdata / 1000;
	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
	tticks = tvtohz(&tv);
	/* Remove extra tick from tvtohz() if timeout has fired before. */
	if (timeout_triggered(to))
		tticks--;
	timeout_add(to, (tticks > 0) ? tticks : 1);
}

void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;

	kn->kn_data++;
	knote_activate(kn);

	if ((kn->kn_flags & EV_ONESHOT) == 0)
		filt_timer_timeout_add(kn);
}


/*
 * data contains amount of time to sleep, in milliseconds
 */
int
filt_timerattach(struct knote *kn)
{
	struct timeout *to;

	if (kq_ntimeouts > kq_timeoutmax)
		return (ENOMEM);
	kq_ntimeouts++;

	kn->kn_flags |= EV_CLEAR;	/* automatically set */
	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
	timeout_set(to, filt_timerexpire, kn);
	kn->kn_hook = to;
	filt_timer_timeout_add(kn);

	return (0);
}

void
filt_timerdetach(struct knote *kn)
{
	struct timeout *to;

	to = (struct timeout *)kn->kn_hook;
	timeout_del_barrier(to);
	free(to, M_KEVENT, sizeof(*to));
	kq_ntimeouts--;
}

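/*
 * Userland view of the timer filter (sketch): kev.data holds the period
 * in milliseconds, so
 *
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
 *
 * arms a timer that activates roughly every 500 ms (EV_CLEAR is set
 * automatically), and adding EV_ONESHOT makes it fire only once.
 */
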
int
filt_timermodify(struct kevent *kev, struct knote *kn)
{
	struct timeout *to = kn->kn_hook;
	int s;

	/* Reset the timer. Any pending events are discarded. */

	timeout_del_barrier(to);

	s = splhigh();
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	kn->kn_status &= ~KN_ACTIVE;
	splx(s);

	kn->kn_data = 0;
	knote_modify(kev, kn);
	/* Reinit timeout to invoke tick adjustment again. */
	timeout_set(to, filt_timerexpire, kn);
	filt_timer_timeout_add(kn);

	return (0);
}

int
filt_timerprocess(struct knote *kn, struct kevent *kev)
{
	int active, s;

	s = splsoftclock();
	active = (kn->kn_data != 0);
	if (active)
		knote_submit(kn, kev);
	splx(s);

	return (active);
}


/*
 * filt_seltrue:
 *
 *	This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be.  This is about as
	 * good as select/poll does as well.
	 */
	kn->kn_data = 0;
	return (1);
}

int
filt_seltruemodify(struct kevent *kev, struct knote *kn)
{
	knote_modify(kev, kn);
	return (1);
}

int
filt_seltrueprocess(struct knote *kn, struct kevent *kev)
{
	knote_submit(kn, kev);
	return (1);
}

/*
 * This provides full kqfilter entry for device switch tables, which
 * has same effect as filter using filt_seltrue() as filter method.
 */
void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops seltrue_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_seltruedetach,
	.f_event	= filt_seltrue,
	.f_modify	= filt_seltruemodify,
	.f_process	= filt_seltrueprocess,
};

int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (EINVAL);
	}

	/* Nothing more to do */
	return (0);
}

static int
filt_dead(struct knote *kn, long hint)
{
	kn->kn_flags |= (EV_EOF | EV_ONESHOT);
	if (kn->kn_flags & __EV_POLL)
		kn->kn_flags |= __EV_HUP;
	kn->kn_data = 0;
	return (1);
}

static void
filt_deaddetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops dead_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_deaddetach,
	.f_event	= filt_dead,
	.f_modify	= filt_seltruemodify,
	.f_process	= filt_seltrueprocess,
};

static int
filt_badfd(struct knote *kn, long hint)
{
	kn->kn_flags |= (EV_ERROR | EV_ONESHOT);
	kn->kn_data = EBADF;
	return (1);
}

/* For use with kqpoll. */
const struct filterops badfd_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_deaddetach,
	.f_event	= filt_badfd,
	.f_modify	= filt_seltruemodify,
	.f_process	= filt_seltrueprocess,
};

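/*
 * The filter_*() wrappers below invoke a knote's filterops callbacks.
 * Unless the filter is marked FILTEROP_MPSAFE, the callbacks run with
 * the kernel lock held.  Filters that provide no f_modify or f_process
 * routine have those operations emulated with f_event at splhigh().
 */
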
static int
filter_attach(struct knote *kn)
{
	int error;

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		error = kn->kn_fop->f_attach(kn);
	} else {
		KERNEL_LOCK();
		error = kn->kn_fop->f_attach(kn);
		KERNEL_UNLOCK();
	}
	return (error);
}

static void
filter_detach(struct knote *kn)
{
	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		kn->kn_fop->f_detach(kn);
	} else {
		KERNEL_LOCK();
		kn->kn_fop->f_detach(kn);
		KERNEL_UNLOCK();
	}
}

static int
filter_event(struct knote *kn, long hint)
{
	if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0)
		KERNEL_ASSERT_LOCKED();

	return (kn->kn_fop->f_event(kn, hint));
}

static int
filter_modify(struct kevent *kev, struct knote *kn)
{
	int active, s;

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		active = kn->kn_fop->f_modify(kev, kn);
	} else {
		KERNEL_LOCK();
		if (kn->kn_fop->f_modify != NULL) {
			active = kn->kn_fop->f_modify(kev, kn);
		} else {
			/* Emulate f_modify using f_event. */
			s = splhigh();
			knote_modify(kev, kn);
			active = kn->kn_fop->f_event(kn, 0);
			splx(s);
		}
		KERNEL_UNLOCK();
	}
	return (active);
}

static int
filter_process(struct knote *kn, struct kevent *kev)
{
	int active, s;

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		active = kn->kn_fop->f_process(kn, kev);
	} else {
		KERNEL_LOCK();
		if (kn->kn_fop->f_process != NULL) {
			active = kn->kn_fop->f_process(kn, kev);
		} else {
			/* Emulate f_process using f_event. */
			s = splhigh();
			/*
			 * If called from kqueue_scan(), skip f_event
			 * when EV_ONESHOT is set, to preserve old behaviour.
			 */
			if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
				active = 1;
			else
				active = kn->kn_fop->f_event(kn, 0);
			if (active)
				knote_submit(kn, kev);
			splx(s);
		}
		KERNEL_UNLOCK();
	}
	return (active);
}

void
kqpoll_init(void)
{
	struct proc *p = curproc;
	struct filedesc *fdp;

	if (p->p_kq != NULL) {
		/*
		 * Discard any knotes that have been enqueued after
		 * previous scan.
		 * This prevents accumulation of enqueued badfd knotes
		 * in case scan does not make progress for some reason.
		 */
		kqpoll_dequeue(p);
		return;
	}

	p->p_kq = kqueue_alloc(p->p_fd);
	p->p_kq_serial = arc4random();
	fdp = p->p_fd;
	fdplock(fdp);
	LIST_INSERT_HEAD(&fdp->fd_kqlist, p->p_kq, kq_next);
	fdpunlock(fdp);
}

void
kqpoll_exit(void)
{
	struct proc *p = curproc;

	if (p->p_kq == NULL)
		return;

	kqueue_purge(p, p->p_kq);
	/* Clear any detached knotes that remain in the queue. */
	kqpoll_dequeue(p);
	kqueue_terminate(p, p->p_kq);
	KASSERT(p->p_kq->kq_refs == 1);
	KQRELE(p->p_kq);
	p->p_kq = NULL;
}

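/*
 * kqpoll_dequeue() drains the per-thread kqueue (p->p_kq) that the
 * poll(2)/select(2) code can build its scans on.  A rough sketch of the
 * intended life cycle, seen from the callers:
 *
 *	kqpoll_init();		before registering and scanning knotes
 *	...use curproc->p_kq with kqueue_register()/kqueue_scan()...
 *	kqpoll_exit();		when the thread exits
 *
 * p_kq_serial provides a per-thread stamp that callers can use to tell
 * knotes of the current scan apart from stale ones.
 */
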
void
kqpoll_dequeue(struct proc *p)
{
	struct knote *kn;
	struct kqueue *kq = p->p_kq;
	int s;

	s = splhigh();
	while ((kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
		/* This kqueue should not be scanned by other threads. */
		KASSERT(kn->kn_filter != EVFILT_MARKER);

		if (!knote_acquire(kn, NULL, 0))
			continue;

		kqueue_check(kq);
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kq->kq_count--;

		splx(s);
		kn->kn_fop->f_detach(kn);
		knote_drop(kn, p);
		s = splhigh();
		kqueue_check(kq);
	}
	splx(s);
}

struct kqueue *
kqueue_alloc(struct filedesc *fdp)
{
	struct kqueue *kq;

	kq = pool_get(&kqueue_pool, PR_WAITOK | PR_ZERO);
	kq->kq_refs = 1;
	kq->kq_fdp = fdp;
	TAILQ_INIT(&kq->kq_head);
	task_set(&kq->kq_task, kqueue_task, kq);

	return (kq);
}

int
sys_kqueue(struct proc *p, void *v, register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	kq = kqueue_alloc(fdp);

	fdplock(fdp);
	error = falloc(p, &fp, &fd);
	if (error)
		goto out;
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	fp->f_data = kq;
	*retval = fd;
	LIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_next);
	kq = NULL;
	fdinsert(fdp, fd, 0, fp);
	FRELE(fp, p);
out:
	fdpunlock(fdp);
	if (kq != NULL)
		pool_put(&kqueue_pool, kq);
	return (error);
}

int
sys_kevent(struct proc *p, void *v, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct filedesc* fdp = p->p_fd;
	struct sys_kevent_args /* {
		syscallarg(int)	fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(int)	nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(int)	nevents;
		syscallarg(const struct timespec *) timeout;
	} */ *uap = v;
	struct kevent *kevp;
	struct kqueue *kq;
	struct file *fp;
	struct timespec ts;
	struct timespec *tsp = NULL;
	int i, n, nerrors, error;
	int ready, total;
	struct kevent kev[KQ_NEVENTS];

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	if (fp->f_type != DTYPE_KQUEUE) {
		error = EBADF;
		goto done;
	}

	if (SCARG(uap, timeout) != NULL) {
		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
		if (error)
			goto done;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts)) {
			error = EINVAL;
			goto done;
		}
		tsp = &ts;
	}

	kq = fp->f_data;
	nerrors = 0;

	while ((n = SCARG(uap, nchanges)) > 0) {
		if (n > nitems(kev))
			n = nitems(kev);
		error = copyin(SCARG(uap, changelist), kev,
		    n * sizeof(struct kevent));
		if (error)
			goto done;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, n);
#endif
		for (i = 0; i < n; i++) {
			kevp = &kev[i];
			kevp->flags &= ~EV_SYSFLAGS;
			error = kqueue_register(kq, kevp, p);
			if (error || (kevp->flags & EV_RECEIPT)) {
				if (SCARG(uap, nevents) != 0) {
					kevp->flags = EV_ERROR;
					kevp->data = error;
					copyout(kevp, SCARG(uap, eventlist),
					    sizeof(*kevp));
					SCARG(uap, eventlist)++;
					SCARG(uap, nevents)--;
					nerrors++;
				} else {
					goto done;
				}
			}
		}
		SCARG(uap, nchanges) -= n;
		SCARG(uap, changelist) += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	kqueue_scan_setup(&scan, kq);
	FRELE(fp, p);
	/*
	 * Collect as many events as we can.  The timeout on successive
	 * loops is disabled (kqueue_scan() becomes non-blocking).
	 */
	total = 0;
	error = 0;
	while ((n = SCARG(uap, nevents) - total) > 0) {
		if (n > nitems(kev))
			n = nitems(kev);
		ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
		if (ready == 0)
			break;
		error = copyout(kev, SCARG(uap, eventlist) + total,
		    sizeof(struct kevent) * ready);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		total += ready;
		if (error || ready < n)
			break;
	}
	kqueue_scan_finish(&scan);
	*retval = total;
	return (error);

done:
	FRELE(fp, p);
	return (error);
}

#ifdef KQUEUE_DEBUG
void
kqueue_do_check(struct kqueue *kq, const char *func, int line)
{
	struct knote *kn;
	int count = 0, nmarker = 0;

	KERNEL_ASSERT_LOCKED();
	splassert(IPL_HIGH);

	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if (kn->kn_filter == EVFILT_MARKER) {
			if ((kn->kn_status & KN_QUEUED) != 0)
				panic("%s:%d: kq=%p kn=%p marker QUEUED",
				    func, line, kq, kn);
			nmarker++;
		} else {
			if ((kn->kn_status & KN_ACTIVE) == 0)
				panic("%s:%d: kq=%p kn=%p knote !ACTIVE",
				    func, line, kq, kn);
			if ((kn->kn_status & KN_QUEUED) == 0)
				panic("%s:%d: kq=%p kn=%p knote !QUEUED",
				    func, line, kq, kn);
			if (kn->kn_kq != kq)
				panic("%s:%d: kq=%p kn=%p kn_kq=%p != kq",
				    func, line, kq, kn, kn->kn_kq);
			count++;
			if (count > kq->kq_count)
				goto bad;
		}
	}
	if (count != kq->kq_count) {
bad:
		panic("%s:%d: kq=%p kq_count=%d count=%d nmarker=%d",
		    func, line, kq, kq->kq_count, count, nmarker);
	}
}
#endif

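/*
 * kqueue_register() implements the per-change half of kevent(2):
 * EV_ADD attaches a new knote (or modifies an existing one), EV_DELETE
 * detaches and drops it, and EV_ENABLE/EV_DISABLE clear or set
 * KN_DISABLED.  A new knote is allocated up front so that the code does
 * not have to sleep for memory after the lookup has been done.
 */
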
int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p)
{
	struct filedesc *fdp = kq->kq_fdp;
	const struct filterops *fops = NULL;
	struct file *fp = NULL;
	struct knote *kn = NULL, *newkn = NULL;
	struct knlist *list = NULL;
	int s, error = 0;

	if (kev->filter < 0) {
		if (kev->filter + EVFILT_SYSCOUNT < 0)
			return (EINVAL);
		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
	}

	if (fops == NULL) {
		/*
		 * XXX
		 * filter attach routine is responsible for ensuring that
		 * the identifier can be attached to it.
		 */
		return (EINVAL);
	}

	if (fops->f_flags & FILTEROP_ISFD) {
		/* validate descriptor */
		if (kev->ident > INT_MAX)
			return (EBADF);
	}

	if (kev->flags & EV_ADD)
		newkn = pool_get(&knote_pool, PR_WAITOK | PR_ZERO);

again:
	if (fops->f_flags & FILTEROP_ISFD) {
		if ((fp = fd_getfile(fdp, kev->ident)) == NULL) {
			error = EBADF;
			goto done;
		}
		if (kev->flags & EV_ADD)
			kqueue_expand_list(kq, kev->ident);
		if (kev->ident < kq->kq_knlistsize)
			list = &kq->kq_knlist[kev->ident];
	} else {
		if (kev->flags & EV_ADD)
			kqueue_expand_hash(kq);
		if (kq->kq_knhashmask != 0) {
			list = &kq->kq_knhash[
			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
		}
	}
	if (list != NULL) {
		SLIST_FOREACH(kn, list, kn_link) {
			if (kev->filter == kn->kn_filter &&
			    kev->ident == kn->kn_id) {
				s = splhigh();
				if (!knote_acquire(kn, NULL, 0)) {
					splx(s);
					if (fp != NULL) {
						FRELE(fp, p);
						fp = NULL;
					}
					goto again;
				}
				splx(s);
				break;
			}
		}
	}
	KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0);

	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		error = ENOENT;
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match.
	 * If adding a new knote, sleeping is not allowed until the knote
	 * has been inserted.
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			kn = newkn;
			newkn = NULL;
			kn->kn_status = KN_PROCESSING;
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			knote_attach(kn);
			error = filter_attach(kn);
			if (error != 0) {
				knote_drop(kn, p);
				goto done;
			}

			/*
			 * If this is a file descriptor filter, check if
			 * fd was closed while the knote was being added.
			 * knote_fdclose() has missed kn if the function
			 * ran before kn appeared in kq_knlist.
			 */
			if ((fops->f_flags & FILTEROP_ISFD) &&
			    fd_checkclosed(fdp, kev->ident, kn->kn_fp)) {
				/*
				 * Drop the knote silently without error
				 * because another thread might already have
				 * seen it. This corresponds to the insert
				 * happening in full before the close.
				 */
				filter_detach(kn);
				knote_drop(kn, p);
				goto done;
			}

			/* Check if there is a pending event. */
			if (filter_process(kn, NULL))
				knote_activate(kn);
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			if (filter_modify(kev, kn))
				knote_activate(kn);
			if (kev->flags & EV_ERROR) {
				error = kev->data;
				goto release;
			}
		}
	} else if (kev->flags & EV_DELETE) {
		filter_detach(kn);
		knote_drop(kn, p);
		goto done;
	}

	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		s = splhigh();
		kn->kn_status |= KN_DISABLED;
		splx(s);
	}

	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		s = splhigh();
		kn->kn_status &= ~KN_DISABLED;
		splx(s);
		/* Check if there is a pending event. */
		if (filter_process(kn, NULL))
			knote_activate(kn);
	}

release:
	s = splhigh();
	knote_release(kn);
	splx(s);
done:
	if (fp != NULL)
		FRELE(fp, p);
	if (newkn != NULL)
		pool_put(&knote_pool, newkn);
	return (error);
}

int
kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
{
	struct timespec elapsed, start, stop;
	uint64_t nsecs;
	int error;

	splassert(IPL_HIGH);

	if (tsp != NULL) {
		getnanouptime(&start);
		nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
	} else
		nsecs = INFSLP;
	error = tsleep_nsec(kq, PSOCK | PCATCH, "kqread", nsecs);
	if (tsp != NULL) {
		getnanouptime(&stop);
		timespecsub(&stop, &start, &elapsed);
		timespecsub(tsp, &elapsed, tsp);
		if (tsp->tv_sec < 0)
			timespecclear(tsp);
	}

	return (error);
}

/*
 * Scan the kqueue, blocking if necessary until the target time is reached.
 * If tsp is NULL we block indefinitely.  If tsp->ts_secs/nsecs are both
 * 0 we do not block at all.
 */
int
kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
    struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp)
{
	struct kqueue *kq = scan->kqs_kq;
	struct knote *kn;
	int s, error = 0, nkev = 0;

	if (maxevents == 0)
		goto done;
retry:
	KASSERT(nkev == 0);

	error = 0;

	if (kq->kq_state & KQ_DYING) {
		error = EBADF;
		goto done;
	}

	s = splhigh();
	if (kq->kq_count == 0) {
		/*
		 * Successive loops are only necessary if there are more
		 * ready events to gather, so they don't need to block.
		 */
		if ((tsp != NULL && !timespecisset(tsp)) ||
		    scan->kqs_nevent != 0) {
			splx(s);
			error = 0;
			goto done;
		}
		kq->kq_state |= KQ_SLEEP;
		error = kqueue_sleep(kq, tsp);
		splx(s);
		if (error == 0 || error == EWOULDBLOCK)
			goto retry;
		/* don't restart after signals... */
		if (error == ERESTART)
			error = EINTR;
		goto done;
	}

	/*
	 * Put the end marker in the queue to limit the scan to the events
	 * that are currently active.  This prevents events from being
	 * recollected if they reactivate during scan.
	 *
	 * If a partial scan has been performed already but no events have
	 * been collected, reposition the end marker to make any new events
	 * reachable.
	 */
	if (!scan->kqs_queued) {
		TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
		scan->kqs_queued = 1;
	} else if (scan->kqs_nevent == 0) {
		TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
		TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
	}

	TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe);
	while (nkev < maxevents) {
		kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe);
		if (kn->kn_filter == EVFILT_MARKER) {
			if (kn == &scan->kqs_end)
				break;

			/* Move start marker past another thread's marker. */
			TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
			TAILQ_INSERT_AFTER(&kq->kq_head, kn, &scan->kqs_start,
			    kn_tqe);
			continue;
		}

		if (!knote_acquire(kn, NULL, 0))
			continue;

		kqueue_check(kq);
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kq->kq_count--;
		kqueue_check(kq);

		if (kn->kn_status & KN_DISABLED) {
			knote_release(kn);
			continue;
		}

		splx(s);

		memset(kevp, 0, sizeof(*kevp));
		if (filter_process(kn, kevp) == 0) {
			s = splhigh();
			if ((kn->kn_status & KN_QUEUED) == 0)
				kn->kn_status &= ~KN_ACTIVE;
			knote_release(kn);
			kqueue_check(kq);
			continue;
		}

		/*
		 * Post-event action on the note
		 */
		if (kevp->flags & EV_ONESHOT) {
			filter_detach(kn);
			knote_drop(kn, p);
			s = splhigh();
		} else if (kevp->flags & (EV_CLEAR | EV_DISPATCH)) {
			s = splhigh();
			if (kevp->flags & EV_DISPATCH)
				kn->kn_status |= KN_DISABLED;
			if ((kn->kn_status & KN_QUEUED) == 0)
				kn->kn_status &= ~KN_ACTIVE;
			KASSERT(kn->kn_status & KN_ATTACHED);
			knote_release(kn);
		} else {
			s = splhigh();
			if ((kn->kn_status & KN_QUEUED) == 0) {
				kqueue_check(kq);
				kq->kq_count++;
				kn->kn_status |= KN_QUEUED;
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
			}
			KASSERT(kn->kn_status & KN_ATTACHED);
			knote_release(kn);
		}
		kqueue_check(kq);

		kevp++;
		nkev++;
		scan->kqs_nevent++;
	}
	TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
	splx(s);
	if (scan->kqs_nevent == 0)
		goto retry;
done:
	*errorp = error;
	return (nkev);
}

void
kqueue_scan_setup(struct kqueue_scan_state *scan, struct kqueue *kq)
{
	memset(scan, 0, sizeof(*scan));

	KQREF(kq);
	scan->kqs_kq = kq;
	scan->kqs_start.kn_filter = EVFILT_MARKER;
	scan->kqs_start.kn_status = KN_PROCESSING;
	scan->kqs_end.kn_filter = EVFILT_MARKER;
	scan->kqs_end.kn_status = KN_PROCESSING;
}

void
kqueue_scan_finish(struct kqueue_scan_state *scan)
{
	struct kqueue *kq = scan->kqs_kq;
	int s;

	KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
	KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
	KASSERT(scan->kqs_end.kn_filter == EVFILT_MARKER);
	KASSERT(scan->kqs_end.kn_status == KN_PROCESSING);

	if (scan->kqs_queued) {
		scan->kqs_queued = 0;
		s = splhigh();
		TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
		splx(s);
	}
	KQRELE(kq);
}

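/*
 * Typical in-kernel use of the scan interface above, as in sys_kevent()
 * (sketch):
 *
 *	struct kqueue_scan_state scan;
 *	int error, ready;
 *
 *	kqueue_scan_setup(&scan, kq);
 *	ready = kqueue_scan(&scan, maxevents, kev, tsp, p, &error);
 *	kqueue_scan_finish(&scan);
 *
 * The state keeps the start and end markers between calls, so a caller
 * can collect events in several batches; once events have been
 * collected, later calls do not block.
 */
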
/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 */
int
kqueue_read(struct file *fp, struct uio *uio, int fflags)
{
	return (ENXIO);
}

int
kqueue_write(struct file *fp, struct uio *uio, int fflags)
{
	return (ENXIO);
}

int
kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
{
	return (ENOTTY);
}

int
kqueue_poll(struct file *fp, int events, struct proc *p)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;
	int revents = 0;
	int s = splhigh();

	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(p, &kq->kq_sel);
			kq->kq_state |= KQ_SEL;
		}
	}
	splx(s);
	return (revents);
}

int
kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
{
	struct kqueue *kq = fp->f_data;

	memset(st, 0, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;
	return (0);
}

void
kqueue_purge(struct proc *p, struct kqueue *kq)
{
	int i;

	KERNEL_ASSERT_LOCKED();

	for (i = 0; i < kq->kq_knlistsize; i++)
		knote_remove(p, &kq->kq_knlist[i], 1);
	if (kq->kq_knhashmask != 0) {
		for (i = 0; i < kq->kq_knhashmask + 1; i++)
			knote_remove(p, &kq->kq_knhash[i], 1);
	}
}

void
kqueue_terminate(struct proc *p, struct kqueue *kq)
{
	struct knote *kn;

	/*
	 * Any remaining entries should be scan markers.
	 * They are removed when the ongoing scans finish.
	 */
	KASSERT(kq->kq_count == 0);
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe)
		KASSERT(kn->kn_filter == EVFILT_MARKER);

	kq->kq_state |= KQ_DYING;
	kqueue_wakeup(kq);

	KASSERT(klist_empty(&kq->kq_sel.si_note));
	task_del(systq, &kq->kq_task);

}

int
kqueue_close(struct file *fp, struct proc *p)
{
	struct kqueue *kq = fp->f_data;

	KERNEL_LOCK();
	kqueue_purge(p, kq);
	kqueue_terminate(p, kq);
	fp->f_data = NULL;

	KQRELE(kq);

	KERNEL_UNLOCK();

	return (0);
}

static void
kqueue_task(void *arg)
{
	struct kqueue *kq = arg;

	if (kq->kq_state & KQ_SEL) {
		kq->kq_state &= ~KQ_SEL;
		selwakeup(&kq->kq_sel);
	} else {
		KNOTE(&kq->kq_sel.si_note, 0);
	}
	KQRELE(kq);
}

void
kqueue_wakeup(struct kqueue *kq)
{

	if (kq->kq_state & KQ_SLEEP) {
		kq->kq_state &= ~KQ_SLEEP;
		wakeup(kq);
	}
	if ((kq->kq_state & KQ_SEL) || !klist_empty(&kq->kq_sel.si_note)) {
		/* Defer activation to avoid recursion. */
		KQREF(kq);
		if (!task_add(systq, &kq->kq_task))
			KQRELE(kq);
	}
}

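/*
 * kqueue_expand_hash() and kqueue_expand_list() below may sleep while
 * allocating memory, so both re-check the kqueue after the allocation
 * and throw their copy away if another thread has grown the structure
 * in the meantime.
 */
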
static void
kqueue_expand_hash(struct kqueue *kq)
{
	struct knlist *hash;
	u_long hashmask;

	if (kq->kq_knhashmask == 0) {
		hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask);
		if (kq->kq_knhashmask == 0) {
			kq->kq_knhash = hash;
			kq->kq_knhashmask = hashmask;
		} else {
			/* Another thread has allocated the hash. */
			hashfree(hash, KN_HASHSIZE, M_KEVENT);
		}
	}
}

static void
kqueue_expand_list(struct kqueue *kq, int fd)
{
	struct knlist *list;
	int size;

	if (kq->kq_knlistsize <= fd) {
		size = kq->kq_knlistsize;
		while (size <= fd)
			size += KQEXTENT;
		list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK);
		if (kq->kq_knlistsize <= fd) {
			memcpy(list, kq->kq_knlist,
			    kq->kq_knlistsize * sizeof(*list));
			memset(&list[kq->kq_knlistsize], 0,
			    (size - kq->kq_knlistsize) * sizeof(*list));
			free(kq->kq_knlist, M_KEVENT,
			    kq->kq_knlistsize * sizeof(*list));
			kq->kq_knlist = list;
			kq->kq_knlistsize = size;
		} else {
			/* Another thread has expanded the list. */
			free(list, M_KEVENT, size * sizeof(*list));
		}
	}
}

/*
 * Acquire a knote, return non-zero on success, 0 on failure.
 *
 * If we cannot acquire the knote we sleep and return 0.  The knote
 * may be stale on return in this case and the caller must restart
 * whatever loop they are in.
 *
 * If we are about to sleep and klist is non-NULL, the list is unlocked
 * before sleep and remains unlocked on return.
 */
int
knote_acquire(struct knote *kn, struct klist *klist, int ls)
{
	splassert(IPL_HIGH);
	KASSERT(kn->kn_filter != EVFILT_MARKER);

	if (kn->kn_status & KN_PROCESSING) {
		kn->kn_status |= KN_WAITING;
		if (klist != NULL)
			klist_unlock(klist, ls);
		tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
		/* knote may be stale now */
		return (0);
	}
	kn->kn_status |= KN_PROCESSING;
	return (1);
}

/*
 * Release an acquired knote, clearing KN_PROCESSING.
 */
void
knote_release(struct knote *kn)
{
	splassert(IPL_HIGH);
	KASSERT(kn->kn_filter != EVFILT_MARKER);
	KASSERT(kn->kn_status & KN_PROCESSING);

	if (kn->kn_status & KN_WAITING) {
		kn->kn_status &= ~KN_WAITING;
		wakeup(kn);
	}
	kn->kn_status &= ~KN_PROCESSING;
	/* kn should not be accessed anymore */
}

/*
 * activate one knote.
 */
void
knote_activate(struct knote *kn)
{
	int s;

	s = splhigh();
	kn->kn_status |= KN_ACTIVE;
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)
		knote_enqueue(kn);
	splx(s);
}

/*
 * walk down a list of knotes, activating them if their event has triggered.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn, *kn0;

	KLIST_ASSERT_LOCKED(list);

	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0)
		if (filter_event(kn, hint))
			knote_activate(kn);
}

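/*
 * Subsystems report events by calling knote(), usually through the
 * KNOTE() macro, with their klist locked.  A sketch with a hypothetical
 * driver softc:
 *
 *	mtx_enter(&sc->sc_mtx);
 *	sc->sc_ready = 1;
 *	KNOTE(&sc->sc_klist, 0);
 *	mtx_leave(&sc->sc_mtx);
 *
 * The hint argument is passed through to each filter's f_event routine.
 */
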
/*
 * remove all knotes from a specified knlist
 */
void
knote_remove(struct proc *p, struct knlist *list, int purge)
{
	struct knote *kn;
	int s;

	while ((kn = SLIST_FIRST(list)) != NULL) {
		s = splhigh();
		if (!knote_acquire(kn, NULL, 0)) {
			splx(s);
			continue;
		}
		splx(s);
		filter_detach(kn);

		/*
		 * Notify poll(2) and select(2) when a monitored
		 * file descriptor is closed.
		 *
		 * This reuses the original knote for delivering the
		 * notification so as to avoid allocating memory.
		 * The knote will be reachable only through the queue
		 * of active knotes and is freed either by kqueue_scan()
		 * or kqpoll_dequeue().
		 */
		if (!purge && (kn->kn_flags & __EV_POLL) != 0) {
			KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD);
			knote_detach(kn);
			FRELE(kn->kn_fp, p);
			kn->kn_fp = NULL;

			kn->kn_fop = &badfd_filtops;
			filter_event(kn, 0);
			knote_activate(kn);
			s = splhigh();
			knote_release(kn);
			splx(s);
			continue;
		}

		knote_drop(kn, p);
	}
}

/*
 * remove all knotes referencing a specified fd
 */
void
knote_fdclose(struct proc *p, int fd)
{
	struct filedesc *fdp = p->p_p->ps_fd;
	struct kqueue *kq;
	struct knlist *list;

	/*
	 * fdplock can be ignored if the file descriptor table is being freed
	 * because no other thread can access the fdp.
	 */
	if (fdp->fd_refcnt != 0)
		fdpassertlocked(fdp);

	if (LIST_EMPTY(&fdp->fd_kqlist))
		return;

	KERNEL_LOCK();
	LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) {
		if (fd >= kq->kq_knlistsize)
			continue;

		list = &kq->kq_knlist[fd];
		knote_remove(p, list, 0);
	}
	KERNEL_UNLOCK();
}

/*
 * handle a process exiting, including the triggering of NOTE_EXIT notes
 * XXX this could be more efficient, doing a single pass down the klist
 */
void
knote_processexit(struct proc *p)
{
	struct process *pr = p->p_p;

	KASSERT(p == curproc);

	KNOTE(&pr->ps_klist, NOTE_EXIT);

	/* remove other knotes hanging off the process */
	klist_invalidate(&pr->ps_klist);
}

void
knote_attach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	struct knlist *list;
	int s;

	KASSERT(kn->kn_status & KN_PROCESSING);
	KASSERT((kn->kn_status & KN_ATTACHED) == 0);

	s = splhigh();
	kn->kn_status |= KN_ATTACHED;
	splx(s);

	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
		KASSERT(kq->kq_knlistsize > kn->kn_id);
		list = &kq->kq_knlist[kn->kn_id];
	} else {
		KASSERT(kq->kq_knhashmask != 0);
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}
	SLIST_INSERT_HEAD(list, kn, kn_link);
}

void
knote_detach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	struct knlist *list;
	int s;

	KASSERT(kn->kn_status & KN_PROCESSING);

	if ((kn->kn_status & KN_ATTACHED) == 0)
		return;

	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
		list = &kq->kq_knlist[kn->kn_id];
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	SLIST_REMOVE(list, kn, knote, kn_link);

	s = splhigh();
	kn->kn_status &= ~KN_ATTACHED;
	splx(s);
}

/*
 * should be called at spl == 0, since we don't want to hold spl
 * while calling FRELE and pool_put.
 */
void
knote_drop(struct knote *kn, struct proc *p)
{
	int s;

	KASSERT(kn->kn_filter != EVFILT_MARKER);

	knote_detach(kn);

	s = splhigh();
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	if (kn->kn_status & KN_WAITING) {
		kn->kn_status &= ~KN_WAITING;
		wakeup(kn);
	}
	splx(s);
	if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL)
		FRELE(kn->kn_fp, p);
	pool_put(&knote_pool, kn);
}


void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	splassert(IPL_HIGH);
	KASSERT(kn->kn_filter != EVFILT_MARKER);
	KASSERT((kn->kn_status & KN_QUEUED) == 0);

	kqueue_check(kq);
	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	kqueue_check(kq);
	kqueue_wakeup(kq);
}

void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	splassert(IPL_HIGH);
	KASSERT(kn->kn_filter != EVFILT_MARKER);
	KASSERT(kn->kn_status & KN_QUEUED);

	kqueue_check(kq);
	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
	kqueue_check(kq);
}

/*
 * Modify the knote's parameters.
 *
 * The knote's object lock must be held.
 */
void
knote_modify(const struct kevent *kev, struct knote *kn)
{
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	kn->kn_udata = kev->udata;
}

/*
 * Submit the knote's event for delivery.
 *
 * The knote's object lock must be held.
 */
void
knote_submit(struct knote *kn, struct kevent *kev)
{
	if (kev != NULL) {
		*kev = kn->kn_kevent;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_fflags = 0;
			kn->kn_data = 0;
		}
	}
}

void
klist_init(struct klist *klist, const struct klistops *ops, void *arg)
{
	SLIST_INIT(&klist->kl_list);
	klist->kl_ops = ops;
	klist->kl_arg = arg;
}

void
klist_free(struct klist *klist)
{
	KASSERT(SLIST_EMPTY(&klist->kl_list));
}

void
klist_insert(struct klist *klist, struct knote *kn)
{
	int ls;

	ls = klist_lock(klist);
	SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
	klist_unlock(klist, ls);
}

void
klist_insert_locked(struct klist *klist, struct knote *kn)
{
	KLIST_ASSERT_LOCKED(klist);

	SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
}

void
klist_remove(struct klist *klist, struct knote *kn)
{
	int ls;

	ls = klist_lock(klist);
	SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
	klist_unlock(klist, ls);
}

void
klist_remove_locked(struct klist *klist, struct knote *kn)
{
	KLIST_ASSERT_LOCKED(klist);

	SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
}

int
klist_empty(struct klist *klist)
{
	return (SLIST_EMPTY(&klist->kl_list));
}

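/*
 * A subsystem typically backs its klist with its own lock and lets the
 * filterops insert and remove knotes through it.  A sketch with a
 * hypothetical softc:
 *
 *	klist_init_mutex(&sc->sc_klist, &sc->sc_mtx);
 *	...
 *	klist_insert(&sc->sc_klist, kn);	(from f_attach)
 *	klist_remove(&sc->sc_klist, kn);	(from f_detach)
 *	...
 *	klist_invalidate(&sc->sc_klist);	(on teardown)
 *	klist_free(&sc->sc_klist);
 */
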
/*
 * Detach all knotes from klist. The knotes are rewired to indicate EOF.
 *
 * The caller of this function must not hold any locks that can block
 * filterops callbacks that run with KN_PROCESSING.
 * Otherwise this function might deadlock.
 */
void
klist_invalidate(struct klist *list)
{
	struct knote *kn;
	struct proc *p = curproc;
	int ls, s;

	NET_ASSERT_UNLOCKED();

	s = splhigh();
	ls = klist_lock(list);
	while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
		if (!knote_acquire(kn, list, ls)) {
			/* knote_acquire() has unlocked list. */
			ls = klist_lock(list);
			continue;
		}
		klist_unlock(list, ls);
		splx(s);
		filter_detach(kn);
		if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
			kn->kn_fop = &dead_filtops;
			filter_event(kn, 0);
			knote_activate(kn);
			s = splhigh();
			knote_release(kn);
		} else {
			knote_drop(kn, p);
			s = splhigh();
		}
		ls = klist_lock(list);
	}
	klist_unlock(list, ls);
	splx(s);
}

static int
klist_lock(struct klist *list)
{
	int ls = 0;

	if (list->kl_ops != NULL) {
		ls = list->kl_ops->klo_lock(list->kl_arg);
	} else {
		KERNEL_LOCK();
		ls = splhigh();
	}
	return ls;
}

static void
klist_unlock(struct klist *list, int ls)
{
	if (list->kl_ops != NULL) {
		list->kl_ops->klo_unlock(list->kl_arg, ls);
	} else {
		splx(ls);
		KERNEL_UNLOCK();
	}
}

static void
klist_mutex_assertlk(void *arg)
{
	struct mutex *mtx = arg;

	(void)mtx;

	MUTEX_ASSERT_LOCKED(mtx);
}

static int
klist_mutex_lock(void *arg)
{
	struct mutex *mtx = arg;

	mtx_enter(mtx);
	return 0;
}

static void
klist_mutex_unlock(void *arg, int s)
{
	struct mutex *mtx = arg;

	mtx_leave(mtx);
}

static const struct klistops mutex_klistops = {
	.klo_assertlk	= klist_mutex_assertlk,
	.klo_lock	= klist_mutex_lock,
	.klo_unlock	= klist_mutex_unlock,
};

void
klist_init_mutex(struct klist *klist, struct mutex *mtx)
{
	klist_init(klist, &mutex_klistops, mtx);
}

static void
klist_rwlock_assertlk(void *arg)
{
	struct rwlock *rwl = arg;

	(void)rwl;

	rw_assert_wrlock(rwl);
}

static int
klist_rwlock_lock(void *arg)
{
	struct rwlock *rwl = arg;

	rw_enter_write(rwl);
	return 0;
}

static void
klist_rwlock_unlock(void *arg, int s)
{
	struct rwlock *rwl = arg;

	rw_exit_write(rwl);
}

static const struct klistops rwlock_klistops = {
	.klo_assertlk	= klist_rwlock_assertlk,
	.klo_lock	= klist_rwlock_lock,
	.klo_unlock	= klist_rwlock_unlock,
};

void
klist_init_rwlock(struct klist *klist, struct rwlock *rwl)
{
	klist_init(klist, &rwlock_klistops, rwl);
}