1 /*- 2 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: src/sys/kern/kern_event.c,v 1.2.2.10 2004/04/04 07:03:14 cperciva Exp $ 27 */ 28 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/kernel.h> 32 #include <sys/proc.h> 33 #include <sys/malloc.h> 34 #include <sys/unistd.h> 35 #include <sys/file.h> 36 #include <sys/lock.h> 37 #include <sys/fcntl.h> 38 #include <sys/queue.h> 39 #include <sys/event.h> 40 #include <sys/eventvar.h> 41 #include <sys/protosw.h> 42 #include <sys/socket.h> 43 #include <sys/socketvar.h> 44 #include <sys/stat.h> 45 #include <sys/sysctl.h> 46 #include <sys/sysproto.h> 47 #include <sys/thread.h> 48 #include <sys/uio.h> 49 #include <sys/signalvar.h> 50 #include <sys/filio.h> 51 #include <sys/ktr.h> 52 53 #include <sys/thread2.h> 54 #include <sys/file2.h> 55 #include <sys/mplock2.h> 56 57 #define EVENT_REGISTER 1 58 #define EVENT_PROCESS 2 59 60 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); 61 62 struct kevent_copyin_args { 63 struct kevent_args *ka; 64 int pchanges; 65 }; 66 67 #define KNOTE_CACHE_MAX 8 68 69 struct knote_cache_list { 70 struct klist knote_cache; 71 int knote_cache_cnt; 72 } __cachealign; 73 74 static int kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, 75 struct knote *marker); 76 static int kqueue_read(struct file *fp, struct uio *uio, 77 struct ucred *cred, int flags); 78 static int kqueue_write(struct file *fp, struct uio *uio, 79 struct ucred *cred, int flags); 80 static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, 81 struct ucred *cred, struct sysmsg *msg); 82 static int kqueue_kqfilter(struct file *fp, struct knote *kn); 83 static int kqueue_stat(struct file *fp, struct stat *st, 84 struct ucred *cred); 85 static int kqueue_close(struct file *fp); 86 static void kqueue_wakeup(struct kqueue *kq); 87 static int filter_attach(struct knote *kn); 88 static int filter_event(struct knote *kn, long hint); 89 90 /* 91 * MPSAFE 92 */ 93 static struct fileops kqueueops = { 94 .fo_read = kqueue_read, 95 .fo_write = kqueue_write, 96 .fo_ioctl = kqueue_ioctl, 97 .fo_kqfilter = kqueue_kqfilter, 98 .fo_stat = kqueue_stat, 99 .fo_close = kqueue_close, 100 .fo_shutdown = nofo_shutdown 101 }; 102 103 static void knote_attach(struct knote *kn); 104 static void knote_drop(struct knote *kn); 105 static void knote_detach_and_drop(struct knote *kn); 106 static void knote_enqueue(struct knote *kn); 107 static void knote_dequeue(struct knote *kn); 108 static struct knote *knote_alloc(void); 109 static void knote_free(struct knote *kn); 110 111 static void precise_sleep_intr(systimer_t info, int in_ipi, 112 struct intrframe *frame); 113 static int precise_sleep(void *ident, int flags, const char *wmesg, 114 int us); 115 116 static void filt_kqdetach(struct knote *kn); 117 static int filt_kqueue(struct knote *kn, long hint); 118 static int filt_procattach(struct knote *kn); 119 static void filt_procdetach(struct knote *kn); 120 static int filt_proc(struct knote *kn, long hint); 121 static int filt_fileattach(struct knote *kn); 122 static void filt_timerexpire(void *knx); 123 static int filt_timerattach(struct knote *kn); 124 static void filt_timerdetach(struct knote *kn); 125 static int filt_timer(struct knote *kn, long hint); 126 static int filt_userattach(struct knote *kn); 127 static void filt_userdetach(struct knote *kn); 128 static int filt_user(struct knote *kn, long hint); 129 static void filt_usertouch(struct knote *kn, struct kevent *kev, 130 u_long type); 131 static int filt_fsattach(struct knote *kn); 132 static void filt_fsdetach(struct knote *kn); 133 static int filt_fs(struct knote *kn, long hint); 134 135 static struct filterops file_filtops = 136 { FILTEROP_ISFD | FILTEROP_MPSAFE, filt_fileattach, NULL, NULL }; 137 static struct filterops kqread_filtops = 138 { FILTEROP_ISFD | FILTEROP_MPSAFE, NULL, filt_kqdetach, filt_kqueue }; 139 static struct filterops proc_filtops = 140 { FILTEROP_MPSAFE, filt_procattach, filt_procdetach, filt_proc }; 141 static struct filterops timer_filtops = 142 { FILTEROP_MPSAFE, filt_timerattach, filt_timerdetach, filt_timer }; 143 static struct filterops user_filtops = 144 { FILTEROP_MPSAFE, filt_userattach, filt_userdetach, filt_user }; 145 static struct filterops fs_filtops = 146 { FILTEROP_MPSAFE, filt_fsattach, filt_fsdetach, filt_fs }; 147 148 static int kq_ncallouts = 0; 149 static int kq_calloutmax = (4 * 1024); 150 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, 151 &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); 152 static int kq_checkloop = 1000000; 153 SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW, 154 &kq_checkloop, 0, "Maximum number of loops for kqueue scan"); 155 static int kq_sleep_threshold = 20000; 156 SYSCTL_INT(_kern, OID_AUTO, kq_sleep_threshold, CTLFLAG_RW, 157 &kq_sleep_threshold, 0, "Minimum sleep duration without busy-looping"); 158 159 #define KNOTE_ACTIVATE(kn) do { \ 160 kn->kn_status |= KN_ACTIVE; \ 161 if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ 162 knote_enqueue(kn); \ 163 } while(0) 164 165 #define KN_HASHSIZE 64 /* XXX should be tunable */ 166 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) 167 168 extern struct filterops aio_filtops; 169 extern struct filterops sig_filtops; 170 171 /* 172 * Table for for all system-defined filters. 173 */ 174 static struct filterops *sysfilt_ops[] = { 175 &file_filtops, /* EVFILT_READ */ 176 &file_filtops, /* EVFILT_WRITE */ 177 &aio_filtops, /* EVFILT_AIO */ 178 &file_filtops, /* EVFILT_VNODE */ 179 &proc_filtops, /* EVFILT_PROC */ 180 &sig_filtops, /* EVFILT_SIGNAL */ 181 &timer_filtops, /* EVFILT_TIMER */ 182 &file_filtops, /* EVFILT_EXCEPT */ 183 &user_filtops, /* EVFILT_USER */ 184 &fs_filtops, /* EVFILT_FS */ 185 }; 186 187 static struct knote_cache_list knote_cache_lists[MAXCPU]; 188 189 /* 190 * Acquire a knote, return non-zero on success, 0 on failure. 191 * 192 * If we cannot acquire the knote we sleep and return 0. The knote 193 * may be stale on return in this case and the caller must restart 194 * whatever loop they are in. 195 * 196 * Related kq token must be held. 197 */ 198 static __inline int 199 knote_acquire(struct knote *kn) 200 { 201 if (kn->kn_status & KN_PROCESSING) { 202 kn->kn_status |= KN_WAITING | KN_REPROCESS; 203 tsleep(kn, 0, "kqepts", hz); 204 /* knote may be stale now */ 205 return(0); 206 } 207 kn->kn_status |= KN_PROCESSING; 208 return(1); 209 } 210 211 /* 212 * Release an acquired knote, clearing KN_PROCESSING and handling any 213 * KN_REPROCESS events. 214 * 215 * Caller must be holding the related kq token 216 * 217 * Non-zero is returned if the knote is destroyed or detached. 218 */ 219 static __inline int 220 knote_release(struct knote *kn) 221 { 222 int ret; 223 224 while (kn->kn_status & KN_REPROCESS) { 225 kn->kn_status &= ~KN_REPROCESS; 226 if (kn->kn_status & KN_WAITING) { 227 kn->kn_status &= ~KN_WAITING; 228 wakeup(kn); 229 } 230 if (kn->kn_status & KN_DELETING) { 231 knote_detach_and_drop(kn); 232 return(1); 233 /* NOT REACHED */ 234 } 235 if (filter_event(kn, 0)) 236 KNOTE_ACTIVATE(kn); 237 } 238 if (kn->kn_status & KN_DETACHED) 239 ret = 1; 240 else 241 ret = 0; 242 kn->kn_status &= ~KN_PROCESSING; 243 /* kn should not be accessed anymore */ 244 return ret; 245 } 246 247 static int 248 filt_fileattach(struct knote *kn) 249 { 250 return (fo_kqfilter(kn->kn_fp, kn)); 251 } 252 253 /* 254 * MPSAFE 255 */ 256 static int 257 kqueue_kqfilter(struct file *fp, struct knote *kn) 258 { 259 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; 260 261 if (kn->kn_filter != EVFILT_READ) 262 return (EOPNOTSUPP); 263 264 kn->kn_fop = &kqread_filtops; 265 knote_insert(&kq->kq_kqinfo.ki_note, kn); 266 return (0); 267 } 268 269 static void 270 filt_kqdetach(struct knote *kn) 271 { 272 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; 273 274 knote_remove(&kq->kq_kqinfo.ki_note, kn); 275 } 276 277 /*ARGSUSED*/ 278 static int 279 filt_kqueue(struct knote *kn, long hint) 280 { 281 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; 282 283 kn->kn_data = kq->kq_count; 284 return (kn->kn_data > 0); 285 } 286 287 static int 288 filt_procattach(struct knote *kn) 289 { 290 struct proc *p; 291 int immediate; 292 293 immediate = 0; 294 p = pfind(kn->kn_id); 295 if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { 296 p = zpfind(kn->kn_id); 297 immediate = 1; 298 } 299 if (p == NULL) { 300 return (ESRCH); 301 } 302 if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) { 303 if (p) 304 PRELE(p); 305 return (EACCES); 306 } 307 308 lwkt_gettoken(&p->p_token); 309 kn->kn_ptr.p_proc = p; 310 kn->kn_flags |= EV_CLEAR; /* automatically set */ 311 312 /* 313 * internal flag indicating registration done by kernel 314 */ 315 if (kn->kn_flags & EV_FLAG1) { 316 kn->kn_data = kn->kn_sdata; /* ppid */ 317 kn->kn_fflags = NOTE_CHILD; 318 kn->kn_flags &= ~EV_FLAG1; 319 } 320 321 knote_insert(&p->p_klist, kn); 322 323 /* 324 * Immediately activate any exit notes if the target process is a 325 * zombie. This is necessary to handle the case where the target 326 * process, e.g. a child, dies before the kevent is negistered. 327 */ 328 if (immediate && filt_proc(kn, NOTE_EXIT)) 329 KNOTE_ACTIVATE(kn); 330 lwkt_reltoken(&p->p_token); 331 PRELE(p); 332 333 return (0); 334 } 335 336 /* 337 * The knote may be attached to a different process, which may exit, 338 * leaving nothing for the knote to be attached to. So when the process 339 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so 340 * it will be deleted when read out. However, as part of the knote deletion, 341 * this routine is called, so a check is needed to avoid actually performing 342 * a detach, because the original process does not exist any more. 343 */ 344 static void 345 filt_procdetach(struct knote *kn) 346 { 347 struct proc *p; 348 349 if (kn->kn_status & KN_DETACHED) 350 return; 351 p = kn->kn_ptr.p_proc; 352 knote_remove(&p->p_klist, kn); 353 } 354 355 static int 356 filt_proc(struct knote *kn, long hint) 357 { 358 u_int event; 359 360 /* 361 * mask off extra data 362 */ 363 event = (u_int)hint & NOTE_PCTRLMASK; 364 365 /* 366 * if the user is interested in this event, record it. 367 */ 368 if (kn->kn_sfflags & event) 369 kn->kn_fflags |= event; 370 371 /* 372 * Process is gone, so flag the event as finished. Detach the 373 * knote from the process now because the process will be poof, 374 * gone later on. 375 */ 376 if (event == NOTE_EXIT) { 377 struct proc *p = kn->kn_ptr.p_proc; 378 if ((kn->kn_status & KN_DETACHED) == 0) { 379 PHOLD(p); 380 knote_remove(&p->p_klist, kn); 381 kn->kn_status |= KN_DETACHED; 382 kn->kn_data = p->p_xstat; 383 kn->kn_ptr.p_proc = NULL; 384 PRELE(p); 385 } 386 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 387 return (1); 388 } 389 390 /* 391 * process forked, and user wants to track the new process, 392 * so attach a new knote to it, and immediately report an 393 * event with the parent's pid. 394 */ 395 if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { 396 struct kevent kev; 397 int error; 398 399 /* 400 * register knote with new process. 401 */ 402 kev.ident = hint & NOTE_PDATAMASK; /* pid */ 403 kev.filter = kn->kn_filter; 404 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; 405 kev.fflags = kn->kn_sfflags; 406 kev.data = kn->kn_id; /* parent */ 407 kev.udata = kn->kn_kevent.udata; /* preserve udata */ 408 error = kqueue_register(kn->kn_kq, &kev); 409 if (error) 410 kn->kn_fflags |= NOTE_TRACKERR; 411 } 412 413 return (kn->kn_fflags != 0); 414 } 415 416 static void 417 filt_timerreset(struct knote *kn) 418 { 419 struct callout *calloutp; 420 struct timeval tv; 421 int tticks; 422 423 tv.tv_sec = kn->kn_sdata / 1000; 424 tv.tv_usec = (kn->kn_sdata % 1000) * 1000; 425 tticks = tvtohz_high(&tv); 426 calloutp = (struct callout *)kn->kn_hook; 427 callout_reset(calloutp, tticks, filt_timerexpire, kn); 428 } 429 430 /* 431 * The callout interlocks with callout_terminate() but can still 432 * race a deletion so if KN_DELETING is set we just don't touch 433 * the knote. 434 */ 435 static void 436 filt_timerexpire(void *knx) 437 { 438 struct knote *kn = knx; 439 struct kqueue *kq = kn->kn_kq; 440 441 lwkt_getpooltoken(kq); 442 443 /* 444 * Open knote_acquire(), since we can't sleep in callout, 445 * however, we do need to record this expiration. 446 */ 447 kn->kn_data++; 448 if (kn->kn_status & KN_PROCESSING) { 449 kn->kn_status |= KN_REPROCESS; 450 if ((kn->kn_status & KN_DELETING) == 0 && 451 (kn->kn_flags & EV_ONESHOT) == 0) 452 filt_timerreset(kn); 453 lwkt_relpooltoken(kq); 454 return; 455 } 456 KASSERT((kn->kn_status & KN_DELETING) == 0, 457 ("acquire a deleting knote %#x", kn->kn_status)); 458 kn->kn_status |= KN_PROCESSING; 459 460 KNOTE_ACTIVATE(kn); 461 if ((kn->kn_flags & EV_ONESHOT) == 0) 462 filt_timerreset(kn); 463 464 knote_release(kn); 465 466 lwkt_relpooltoken(kq); 467 } 468 469 /* 470 * data contains amount of time to sleep, in milliseconds 471 */ 472 static int 473 filt_timerattach(struct knote *kn) 474 { 475 struct callout *calloutp; 476 int prev_ncallouts; 477 478 prev_ncallouts = atomic_fetchadd_int(&kq_ncallouts, 1); 479 if (prev_ncallouts >= kq_calloutmax) { 480 atomic_subtract_int(&kq_ncallouts, 1); 481 kn->kn_hook = NULL; 482 return (ENOMEM); 483 } 484 485 kn->kn_flags |= EV_CLEAR; /* automatically set */ 486 calloutp = kmalloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK); 487 callout_init_mp(calloutp); 488 kn->kn_hook = (caddr_t)calloutp; 489 490 filt_timerreset(kn); 491 return (0); 492 } 493 494 /* 495 * This function is called with the knote flagged locked but it is 496 * still possible to race a callout event due to the callback blocking. 497 * We must call callout_terminate() instead of callout_stop() to deal 498 * with the race. 499 */ 500 static void 501 filt_timerdetach(struct knote *kn) 502 { 503 struct callout *calloutp; 504 505 calloutp = (struct callout *)kn->kn_hook; 506 callout_terminate(calloutp); 507 kn->kn_hook = NULL; 508 kfree(calloutp, M_KQUEUE); 509 atomic_subtract_int(&kq_ncallouts, 1); 510 } 511 512 static int 513 filt_timer(struct knote *kn, long hint) 514 { 515 return (kn->kn_data != 0); 516 } 517 518 /* 519 * EVFILT_USER 520 */ 521 static int 522 filt_userattach(struct knote *kn) 523 { 524 u_int ffctrl; 525 526 kn->kn_hook = NULL; 527 if (kn->kn_sfflags & NOTE_TRIGGER) 528 kn->kn_ptr.hookid = 1; 529 else 530 kn->kn_ptr.hookid = 0; 531 532 ffctrl = kn->kn_sfflags & NOTE_FFCTRLMASK; 533 kn->kn_sfflags &= NOTE_FFLAGSMASK; 534 switch (ffctrl) { 535 case NOTE_FFNOP: 536 break; 537 538 case NOTE_FFAND: 539 kn->kn_fflags &= kn->kn_sfflags; 540 break; 541 542 case NOTE_FFOR: 543 kn->kn_fflags |= kn->kn_sfflags; 544 break; 545 546 case NOTE_FFCOPY: 547 kn->kn_fflags = kn->kn_sfflags; 548 break; 549 550 default: 551 /* XXX Return error? */ 552 break; 553 } 554 /* We just happen to copy this value as well. Undocumented. */ 555 kn->kn_data = kn->kn_sdata; 556 557 return 0; 558 } 559 560 static void 561 filt_userdetach(struct knote *kn) 562 { 563 /* nothing to do */ 564 } 565 566 static int 567 filt_user(struct knote *kn, long hint) 568 { 569 return (kn->kn_ptr.hookid); 570 } 571 572 static void 573 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) 574 { 575 u_int ffctrl; 576 577 switch (type) { 578 case EVENT_REGISTER: 579 if (kev->fflags & NOTE_TRIGGER) 580 kn->kn_ptr.hookid = 1; 581 582 ffctrl = kev->fflags & NOTE_FFCTRLMASK; 583 kev->fflags &= NOTE_FFLAGSMASK; 584 switch (ffctrl) { 585 case NOTE_FFNOP: 586 break; 587 588 case NOTE_FFAND: 589 kn->kn_fflags &= kev->fflags; 590 break; 591 592 case NOTE_FFOR: 593 kn->kn_fflags |= kev->fflags; 594 break; 595 596 case NOTE_FFCOPY: 597 kn->kn_fflags = kev->fflags; 598 break; 599 600 default: 601 /* XXX Return error? */ 602 break; 603 } 604 /* We just happen to copy this value as well. Undocumented. */ 605 kn->kn_data = kev->data; 606 607 /* 608 * This is not the correct use of EV_CLEAR in an event 609 * modification, it should have been passed as a NOTE instead. 610 * But we need to maintain compatibility with Apple & FreeBSD. 611 * 612 * Note however that EV_CLEAR can still be used when doing 613 * the initial registration of the event and works as expected 614 * (clears the event on reception). 615 */ 616 if (kev->flags & EV_CLEAR) { 617 kn->kn_ptr.hookid = 0; 618 /* 619 * Clearing kn->kn_data is fine, since it gets set 620 * every time anyway. We just shouldn't clear 621 * kn->kn_fflags here, since that would limit the 622 * possible uses of this API. NOTE_FFAND or 623 * NOTE_FFCOPY should be used for explicitly clearing 624 * kn->kn_fflags. 625 */ 626 kn->kn_data = 0; 627 } 628 break; 629 630 case EVENT_PROCESS: 631 *kev = kn->kn_kevent; 632 kev->fflags = kn->kn_fflags; 633 kev->data = kn->kn_data; 634 if (kn->kn_flags & EV_CLEAR) { 635 kn->kn_ptr.hookid = 0; 636 /* kn_data, kn_fflags handled by parent */ 637 } 638 break; 639 640 default: 641 panic("filt_usertouch() - invalid type (%ld)", type); 642 break; 643 } 644 } 645 646 /* 647 * EVFILT_FS 648 */ 649 struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist); 650 651 static int 652 filt_fsattach(struct knote *kn) 653 { 654 kn->kn_flags |= EV_CLEAR; 655 knote_insert(&fs_klist, kn); 656 657 return (0); 658 } 659 660 static void 661 filt_fsdetach(struct knote *kn) 662 { 663 knote_remove(&fs_klist, kn); 664 } 665 666 static int 667 filt_fs(struct knote *kn, long hint) 668 { 669 kn->kn_fflags |= hint; 670 return (kn->kn_fflags != 0); 671 } 672 673 /* 674 * Initialize a kqueue. 675 * 676 * NOTE: The lwp/proc code initializes a kqueue for select/poll ops. 677 * 678 * MPSAFE 679 */ 680 void 681 kqueue_init(struct kqueue *kq, struct filedesc *fdp) 682 { 683 TAILQ_INIT(&kq->kq_knpend); 684 TAILQ_INIT(&kq->kq_knlist); 685 kq->kq_count = 0; 686 kq->kq_fdp = fdp; 687 SLIST_INIT(&kq->kq_kqinfo.ki_note); 688 } 689 690 /* 691 * Terminate a kqueue. Freeing the actual kq itself is left up to the 692 * caller (it might be embedded in a lwp so we don't do it here). 693 * 694 * The kq's knlist must be completely eradicated so block on any 695 * processing races. 696 */ 697 void 698 kqueue_terminate(struct kqueue *kq) 699 { 700 struct knote *kn; 701 702 lwkt_getpooltoken(kq); 703 while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) { 704 if (knote_acquire(kn)) 705 knote_detach_and_drop(kn); 706 } 707 lwkt_relpooltoken(kq); 708 709 if (kq->kq_knhash) { 710 hashdestroy(kq->kq_knhash, M_KQUEUE, kq->kq_knhashmask); 711 kq->kq_knhash = NULL; 712 kq->kq_knhashmask = 0; 713 } 714 } 715 716 /* 717 * MPSAFE 718 */ 719 int 720 sys_kqueue(struct kqueue_args *uap) 721 { 722 struct thread *td = curthread; 723 struct kqueue *kq; 724 struct file *fp; 725 int fd, error; 726 727 error = falloc(td->td_lwp, &fp, &fd); 728 if (error) 729 return (error); 730 fp->f_flag = FREAD | FWRITE; 731 fp->f_type = DTYPE_KQUEUE; 732 fp->f_ops = &kqueueops; 733 734 kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); 735 kqueue_init(kq, td->td_proc->p_fd); 736 fp->f_data = kq; 737 738 fsetfd(kq->kq_fdp, fp, fd); 739 uap->sysmsg_result = fd; 740 fdrop(fp); 741 return (error); 742 } 743 744 /* 745 * Copy 'count' items into the destination list pointed to by uap->eventlist. 746 */ 747 static int 748 kevent_copyout(void *arg, struct kevent *kevp, int count, int *res) 749 { 750 struct kevent_copyin_args *kap; 751 int error; 752 753 kap = (struct kevent_copyin_args *)arg; 754 755 error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp)); 756 if (error == 0) { 757 kap->ka->eventlist += count; 758 *res += count; 759 } else { 760 *res = -1; 761 } 762 763 return (error); 764 } 765 766 /* 767 * Copy at most 'max' items from the list pointed to by kap->changelist, 768 * return number of items in 'events'. 769 */ 770 static int 771 kevent_copyin(void *arg, struct kevent *kevp, int max, int *events) 772 { 773 struct kevent_copyin_args *kap; 774 int error, count; 775 776 kap = (struct kevent_copyin_args *)arg; 777 778 count = min(kap->ka->nchanges - kap->pchanges, max); 779 error = copyin(kap->ka->changelist, kevp, count * sizeof *kevp); 780 if (error == 0) { 781 kap->ka->changelist += count; 782 kap->pchanges += count; 783 *events = count; 784 } 785 786 return (error); 787 } 788 789 /* 790 * MPSAFE 791 */ 792 int 793 kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap, 794 k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn, 795 struct timespec *tsp_in, int flags) 796 { 797 struct kevent *kevp; 798 struct timespec *tsp, ats; 799 int i, n, total, error, nerrors = 0; 800 int lres; 801 int limit = kq_checkloop; 802 struct kevent kev[KQ_NEVENTS]; 803 struct knote marker; 804 struct lwkt_token *tok; 805 806 if (tsp_in == NULL || tsp_in->tv_sec || tsp_in->tv_nsec) 807 atomic_set_int(&curthread->td_mpflags, TDF_MP_BATCH_DEMARC); 808 809 tsp = tsp_in; 810 *res = 0; 811 812 for (;;) { 813 n = 0; 814 error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n); 815 if (error) 816 return error; 817 if (n == 0) 818 break; 819 for (i = 0; i < n; i++) { 820 kevp = &kev[i]; 821 kevp->flags &= ~EV_SYSFLAGS; 822 error = kqueue_register(kq, kevp); 823 824 /* 825 * If a registration returns an error we 826 * immediately post the error. The kevent() 827 * call itself will fail with the error if 828 * no space is available for posting. 829 * 830 * Such errors normally bypass the timeout/blocking 831 * code. However, if the copyoutfn function refuses 832 * to post the error (see sys_poll()), then we 833 * ignore it too. 834 */ 835 if (error || (kevp->flags & EV_RECEIPT)) { 836 kevp->flags = EV_ERROR; 837 kevp->data = error; 838 lres = *res; 839 kevent_copyoutfn(uap, kevp, 1, res); 840 if (*res < 0) { 841 return error; 842 } else if (lres != *res) { 843 nevents--; 844 nerrors++; 845 } 846 } 847 } 848 } 849 if (nerrors) 850 return 0; 851 852 /* 853 * Acquire/wait for events - setup timeout 854 */ 855 if (tsp != NULL) { 856 if (tsp->tv_sec || tsp->tv_nsec) { 857 getnanouptime(&ats); 858 timespecadd(tsp, &ats); /* tsp = target time */ 859 } 860 } 861 862 /* 863 * Loop as required. 864 * 865 * Collect as many events as we can. Sleeping on successive 866 * loops is disabled if copyoutfn has incremented (*res). 867 * 868 * The loop stops if an error occurs, all events have been 869 * scanned (the marker has been reached), or fewer than the 870 * maximum number of events is found. 871 * 872 * The copyoutfn function does not have to increment (*res) in 873 * order for the loop to continue. 874 * 875 * NOTE: doselect() usually passes 0x7FFFFFFF for nevents. 876 */ 877 total = 0; 878 error = 0; 879 marker.kn_filter = EVFILT_MARKER; 880 marker.kn_status = KN_PROCESSING; 881 tok = lwkt_token_pool_lookup(kq); 882 lwkt_gettoken(tok); 883 TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe); 884 lwkt_reltoken(tok); 885 while ((n = nevents - total) > 0) { 886 if (n > KQ_NEVENTS) 887 n = KQ_NEVENTS; 888 889 /* 890 * If no events are pending sleep until timeout (if any) 891 * or an event occurs. 892 * 893 * After the sleep completes the marker is moved to the 894 * end of the list, making any received events available 895 * to our scan. 896 */ 897 if (kq->kq_count == 0 && *res == 0) { 898 int timeout, ustimeout = 0; 899 900 if (tsp == NULL) { 901 timeout = 0; 902 } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { 903 error = EWOULDBLOCK; 904 break; 905 } else { 906 struct timespec atx = *tsp; 907 908 getnanouptime(&ats); 909 timespecsub(&atx, &ats); 910 if (atx.tv_sec < 0) { 911 error = EWOULDBLOCK; 912 break; 913 } else { 914 timeout = atx.tv_sec > 24 * 60 * 60 ? 915 24 * 60 * 60 * hz : 916 tstohz_high(&atx); 917 } 918 if (flags & KEVENT_TIMEOUT_PRECISE && 919 timeout != 0) { 920 if (atx.tv_sec == 0 && 921 atx.tv_nsec < kq_sleep_threshold) { 922 DELAY(atx.tv_nsec / 1000); 923 error = EWOULDBLOCK; 924 break; 925 } else if (atx.tv_sec < 2000) { 926 ustimeout = atx.tv_sec * 927 1000000 + atx.tv_nsec/1000; 928 } else { 929 ustimeout = 2000000000; 930 } 931 } 932 } 933 934 lwkt_gettoken(tok); 935 if (kq->kq_count == 0) { 936 kq->kq_sleep_cnt++; 937 if (__predict_false(kq->kq_sleep_cnt == 0)) { 938 /* 939 * Guard against possible wrapping. And 940 * set it to 2, so that kqueue_wakeup() 941 * can wake everyone up. 942 */ 943 kq->kq_sleep_cnt = 2; 944 } 945 if ((flags & KEVENT_TIMEOUT_PRECISE) && 946 timeout != 0) { 947 error = precise_sleep(kq, PCATCH, 948 "kqread", ustimeout); 949 } else { 950 error = tsleep(kq, PCATCH, "kqread", 951 timeout); 952 } 953 954 /* don't restart after signals... */ 955 if (error == ERESTART) 956 error = EINTR; 957 if (error) { 958 lwkt_reltoken(tok); 959 break; 960 } 961 962 TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe); 963 TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, 964 kn_tqe); 965 } 966 lwkt_reltoken(tok); 967 } 968 969 /* 970 * Process all received events 971 * Account for all non-spurious events in our total 972 */ 973 i = kqueue_scan(kq, kev, n, &marker); 974 if (i) { 975 lres = *res; 976 error = kevent_copyoutfn(uap, kev, i, res); 977 total += *res - lres; 978 if (error) 979 break; 980 } 981 if (limit && --limit == 0) 982 panic("kqueue: checkloop failed i=%d", i); 983 984 /* 985 * Normally when fewer events are returned than requested 986 * we can stop. However, if only spurious events were 987 * collected the copyout will not bump (*res) and we have 988 * to continue. 989 */ 990 if (i < n && *res) 991 break; 992 993 /* 994 * Deal with an edge case where spurious events can cause 995 * a loop to occur without moving the marker. This can 996 * prevent kqueue_scan() from picking up new events which 997 * race us. We must be sure to move the marker for this 998 * case. 999 * 1000 * NOTE: We do not want to move the marker if events 1001 * were scanned because normal kqueue operations 1002 * may reactivate events. Moving the marker in 1003 * that case could result in duplicates for the 1004 * same event. 1005 */ 1006 if (i == 0) { 1007 lwkt_gettoken(tok); 1008 TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe); 1009 TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe); 1010 lwkt_reltoken(tok); 1011 } 1012 } 1013 lwkt_gettoken(tok); 1014 TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe); 1015 lwkt_reltoken(tok); 1016 1017 /* Timeouts do not return EWOULDBLOCK. */ 1018 if (error == EWOULDBLOCK) 1019 error = 0; 1020 return error; 1021 } 1022 1023 /* 1024 * MPALMOSTSAFE 1025 */ 1026 int 1027 sys_kevent(struct kevent_args *uap) 1028 { 1029 struct thread *td = curthread; 1030 struct proc *p = td->td_proc; 1031 struct timespec ts, *tsp; 1032 struct kqueue *kq; 1033 struct file *fp = NULL; 1034 struct kevent_copyin_args *kap, ka; 1035 int error; 1036 1037 if (uap->timeout) { 1038 error = copyin(uap->timeout, &ts, sizeof(ts)); 1039 if (error) 1040 return (error); 1041 tsp = &ts; 1042 } else { 1043 tsp = NULL; 1044 } 1045 fp = holdfp(p->p_fd, uap->fd, -1); 1046 if (fp == NULL) 1047 return (EBADF); 1048 if (fp->f_type != DTYPE_KQUEUE) { 1049 fdrop(fp); 1050 return (EBADF); 1051 } 1052 1053 kq = (struct kqueue *)fp->f_data; 1054 1055 kap = &ka; 1056 kap->ka = uap; 1057 kap->pchanges = 0; 1058 1059 error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap, 1060 kevent_copyin, kevent_copyout, tsp, 0); 1061 1062 fdrop(fp); 1063 1064 return (error); 1065 } 1066 1067 int 1068 kqueue_register(struct kqueue *kq, struct kevent *kev) 1069 { 1070 struct filedesc *fdp = kq->kq_fdp; 1071 struct klist *list = NULL; 1072 struct filterops *fops; 1073 struct file *fp = NULL; 1074 struct knote *kn = NULL; 1075 struct thread *td; 1076 int error = 0; 1077 struct knote_cache_list *cache_list; 1078 1079 if (kev->filter < 0) { 1080 if (kev->filter + EVFILT_SYSCOUNT < 0) 1081 return (EINVAL); 1082 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ 1083 } else { 1084 /* 1085 * XXX 1086 * filter attach routine is responsible for insuring that 1087 * the identifier can be attached to it. 1088 */ 1089 return (EINVAL); 1090 } 1091 1092 if (fops->f_flags & FILTEROP_ISFD) { 1093 /* validate descriptor */ 1094 fp = holdfp(fdp, kev->ident, -1); 1095 if (fp == NULL) 1096 return (EBADF); 1097 } 1098 1099 cache_list = &knote_cache_lists[mycpuid]; 1100 if (SLIST_EMPTY(&cache_list->knote_cache)) { 1101 struct knote *new_kn; 1102 1103 new_kn = knote_alloc(); 1104 crit_enter(); 1105 SLIST_INSERT_HEAD(&cache_list->knote_cache, new_kn, kn_link); 1106 cache_list->knote_cache_cnt++; 1107 crit_exit(); 1108 } 1109 1110 td = curthread; 1111 lwkt_getpooltoken(kq); 1112 1113 /* 1114 * Make sure that only one thread can register event on this kqueue, 1115 * so that we would not suffer any race, even if the registration 1116 * blocked, i.e. kq token was released, and the kqueue was shared 1117 * between threads (this should be rare though). 1118 */ 1119 while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) { 1120 kq->kq_state |= KQ_REGWAIT; 1121 tsleep(&kq->kq_regtd, 0, "kqreg", 0); 1122 } 1123 if (__predict_false(kq->kq_regtd != NULL)) { 1124 /* Recursive calling of kqueue_register() */ 1125 td = NULL; 1126 } else { 1127 /* Owner of the kq_regtd, i.e. td != NULL */ 1128 kq->kq_regtd = td; 1129 } 1130 1131 if (fp != NULL) { 1132 list = &fp->f_klist; 1133 } else if (kq->kq_knhashmask) { 1134 list = &kq->kq_knhash[ 1135 KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; 1136 } 1137 if (list != NULL) { 1138 lwkt_getpooltoken(list); 1139 again: 1140 SLIST_FOREACH(kn, list, kn_link) { 1141 if (kn->kn_kq == kq && 1142 kn->kn_filter == kev->filter && 1143 kn->kn_id == kev->ident) { 1144 if (knote_acquire(kn) == 0) 1145 goto again; 1146 break; 1147 } 1148 } 1149 lwkt_relpooltoken(list); 1150 } 1151 1152 /* 1153 * NOTE: At this point if kn is non-NULL we will have acquired 1154 * it and set KN_PROCESSING. 1155 */ 1156 if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { 1157 error = ENOENT; 1158 goto done; 1159 } 1160 1161 /* 1162 * kn now contains the matching knote, or NULL if no match 1163 */ 1164 if (kev->flags & EV_ADD) { 1165 if (kn == NULL) { 1166 crit_enter(); 1167 kn = SLIST_FIRST(&cache_list->knote_cache); 1168 if (kn == NULL) { 1169 crit_exit(); 1170 kn = knote_alloc(); 1171 } else { 1172 SLIST_REMOVE_HEAD(&cache_list->knote_cache, 1173 kn_link); 1174 cache_list->knote_cache_cnt--; 1175 crit_exit(); 1176 } 1177 kn->kn_fp = fp; 1178 kn->kn_kq = kq; 1179 kn->kn_fop = fops; 1180 1181 /* 1182 * apply reference count to knote structure, and 1183 * do not release it at the end of this routine. 1184 */ 1185 fp = NULL; 1186 1187 kn->kn_sfflags = kev->fflags; 1188 kn->kn_sdata = kev->data; 1189 kev->fflags = 0; 1190 kev->data = 0; 1191 kn->kn_kevent = *kev; 1192 1193 /* 1194 * KN_PROCESSING prevents the knote from getting 1195 * ripped out from under us while we are trying 1196 * to attach it, in case the attach blocks. 1197 */ 1198 kn->kn_status = KN_PROCESSING; 1199 knote_attach(kn); 1200 if ((error = filter_attach(kn)) != 0) { 1201 kn->kn_status |= KN_DELETING | KN_REPROCESS; 1202 knote_drop(kn); 1203 goto done; 1204 } 1205 1206 /* 1207 * Interlock against close races which either tried 1208 * to remove our knote while we were blocked or missed 1209 * it entirely prior to our attachment. We do not 1210 * want to end up with a knote on a closed descriptor. 1211 */ 1212 if ((fops->f_flags & FILTEROP_ISFD) && 1213 checkfdclosed(fdp, kev->ident, kn->kn_fp)) { 1214 kn->kn_status |= KN_DELETING | KN_REPROCESS; 1215 } 1216 } else { 1217 /* 1218 * The user may change some filter values after the 1219 * initial EV_ADD, but doing so will not reset any 1220 * filter which have already been triggered. 1221 */ 1222 KKASSERT(kn->kn_status & KN_PROCESSING); 1223 if (fops == &user_filtops) { 1224 filt_usertouch(kn, kev, EVENT_REGISTER); 1225 } else { 1226 kn->kn_sfflags = kev->fflags; 1227 kn->kn_sdata = kev->data; 1228 kn->kn_kevent.udata = kev->udata; 1229 } 1230 } 1231 1232 /* 1233 * Execute the filter event to immediately activate the 1234 * knote if necessary. If reprocessing events are pending 1235 * due to blocking above we do not run the filter here 1236 * but instead let knote_release() do it. Otherwise we 1237 * might run the filter on a deleted event. 1238 */ 1239 if ((kn->kn_status & KN_REPROCESS) == 0) { 1240 if (filter_event(kn, 0)) 1241 KNOTE_ACTIVATE(kn); 1242 } 1243 } else if (kev->flags & EV_DELETE) { 1244 /* 1245 * Delete the existing knote 1246 */ 1247 knote_detach_and_drop(kn); 1248 goto done; 1249 } else { 1250 /* 1251 * Modify an existing event. 1252 * 1253 * The user may change some filter values after the 1254 * initial EV_ADD, but doing so will not reset any 1255 * filter which have already been triggered. 1256 */ 1257 KKASSERT(kn->kn_status & KN_PROCESSING); 1258 if (fops == &user_filtops) { 1259 filt_usertouch(kn, kev, EVENT_REGISTER); 1260 } else { 1261 kn->kn_sfflags = kev->fflags; 1262 kn->kn_sdata = kev->data; 1263 kn->kn_kevent.udata = kev->udata; 1264 } 1265 1266 /* 1267 * Execute the filter event to immediately activate the 1268 * knote if necessary. If reprocessing events are pending 1269 * due to blocking above we do not run the filter here 1270 * but instead let knote_release() do it. Otherwise we 1271 * might run the filter on a deleted event. 1272 */ 1273 if ((kn->kn_status & KN_REPROCESS) == 0) { 1274 if (filter_event(kn, 0)) 1275 KNOTE_ACTIVATE(kn); 1276 } 1277 } 1278 1279 /* 1280 * Disablement does not deactivate a knote here. 1281 */ 1282 if ((kev->flags & EV_DISABLE) && 1283 ((kn->kn_status & KN_DISABLED) == 0)) { 1284 kn->kn_status |= KN_DISABLED; 1285 } 1286 1287 /* 1288 * Re-enablement may have to immediately enqueue an active knote. 1289 */ 1290 if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { 1291 kn->kn_status &= ~KN_DISABLED; 1292 if ((kn->kn_status & KN_ACTIVE) && 1293 ((kn->kn_status & KN_QUEUED) == 0)) { 1294 knote_enqueue(kn); 1295 } 1296 } 1297 1298 /* 1299 * Handle any required reprocessing 1300 */ 1301 knote_release(kn); 1302 /* kn may be invalid now */ 1303 1304 done: 1305 if (td != NULL) { /* Owner of the kq_regtd */ 1306 kq->kq_regtd = NULL; 1307 if (__predict_false(kq->kq_state & KQ_REGWAIT)) { 1308 kq->kq_state &= ~KQ_REGWAIT; 1309 wakeup(&kq->kq_regtd); 1310 } 1311 } 1312 lwkt_relpooltoken(kq); 1313 if (fp != NULL) 1314 fdrop(fp); 1315 return (error); 1316 } 1317 1318 /* 1319 * Scan the kqueue, return the number of active events placed in kevp up 1320 * to count. 1321 * 1322 * Continuous mode events may get recycled, do not continue scanning past 1323 * marker unless no events have been collected. 1324 */ 1325 static int 1326 kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, 1327 struct knote *marker) 1328 { 1329 struct knote *kn, local_marker; 1330 int total; 1331 1332 total = 0; 1333 local_marker.kn_filter = EVFILT_MARKER; 1334 local_marker.kn_status = KN_PROCESSING; 1335 1336 lwkt_getpooltoken(kq); 1337 1338 /* 1339 * Collect events. 1340 */ 1341 TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe); 1342 while (count) { 1343 kn = TAILQ_NEXT(&local_marker, kn_tqe); 1344 if (kn->kn_filter == EVFILT_MARKER) { 1345 /* Marker reached, we are done */ 1346 if (kn == marker) 1347 break; 1348 1349 /* Move local marker past some other threads marker */ 1350 kn = TAILQ_NEXT(kn, kn_tqe); 1351 TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe); 1352 TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe); 1353 continue; 1354 } 1355 1356 /* 1357 * We can't skip a knote undergoing processing, otherwise 1358 * we risk not returning it when the user process expects 1359 * it should be returned. Sleep and retry. 1360 */ 1361 if (knote_acquire(kn) == 0) 1362 continue; 1363 1364 /* 1365 * Remove the event for processing. 1366 * 1367 * WARNING! We must leave KN_QUEUED set to prevent the 1368 * event from being KNOTE_ACTIVATE()d while 1369 * the queue state is in limbo, in case we 1370 * block. 1371 */ 1372 TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe); 1373 kq->kq_count--; 1374 1375 /* 1376 * We have to deal with an extremely important race against 1377 * file descriptor close()s here. The file descriptor can 1378 * disappear MPSAFE, and there is a small window of 1379 * opportunity between that and the call to knote_fdclose(). 1380 * 1381 * If we hit that window here while doselect or dopoll is 1382 * trying to delete a spurious event they will not be able 1383 * to match up the event against a knote and will go haywire. 1384 */ 1385 if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && 1386 checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) { 1387 kn->kn_status |= KN_DELETING | KN_REPROCESS; 1388 } 1389 1390 if (kn->kn_status & KN_DISABLED) { 1391 /* 1392 * If disabled we ensure the event is not queued 1393 * but leave its active bit set. On re-enablement 1394 * the event may be immediately triggered. 1395 */ 1396 kn->kn_status &= ~KN_QUEUED; 1397 } else if ((kn->kn_flags & EV_ONESHOT) == 0 && 1398 (kn->kn_status & KN_DELETING) == 0 && 1399 filter_event(kn, 0) == 0) { 1400 /* 1401 * If not running in one-shot mode and the event 1402 * is no longer present we ensure it is removed 1403 * from the queue and ignore it. 1404 */ 1405 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); 1406 } else { 1407 /* 1408 * Post the event 1409 */ 1410 if (kn->kn_fop == &user_filtops) 1411 filt_usertouch(kn, kevp, EVENT_PROCESS); 1412 else 1413 *kevp = kn->kn_kevent; 1414 ++kevp; 1415 ++total; 1416 --count; 1417 1418 if (kn->kn_flags & EV_ONESHOT) { 1419 kn->kn_status &= ~KN_QUEUED; 1420 kn->kn_status |= KN_DELETING | KN_REPROCESS; 1421 } else { 1422 if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { 1423 if (kn->kn_flags & EV_CLEAR) { 1424 kn->kn_data = 0; 1425 kn->kn_fflags = 0; 1426 } 1427 if (kn->kn_flags & EV_DISPATCH) { 1428 kn->kn_status |= KN_DISABLED; 1429 } 1430 kn->kn_status &= ~(KN_QUEUED | 1431 KN_ACTIVE); 1432 } else { 1433 TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe); 1434 kq->kq_count++; 1435 } 1436 } 1437 } 1438 1439 /* 1440 * Handle any post-processing states 1441 */ 1442 knote_release(kn); 1443 } 1444 TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe); 1445 1446 lwkt_relpooltoken(kq); 1447 return (total); 1448 } 1449 1450 /* 1451 * XXX 1452 * This could be expanded to call kqueue_scan, if desired. 1453 * 1454 * MPSAFE 1455 */ 1456 static int 1457 kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags) 1458 { 1459 return (ENXIO); 1460 } 1461 1462 /* 1463 * MPSAFE 1464 */ 1465 static int 1466 kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags) 1467 { 1468 return (ENXIO); 1469 } 1470 1471 /* 1472 * MPALMOSTSAFE 1473 */ 1474 static int 1475 kqueue_ioctl(struct file *fp, u_long com, caddr_t data, 1476 struct ucred *cred, struct sysmsg *msg) 1477 { 1478 struct kqueue *kq; 1479 int error; 1480 1481 kq = (struct kqueue *)fp->f_data; 1482 lwkt_getpooltoken(kq); 1483 switch(com) { 1484 case FIOASYNC: 1485 if (*(int *)data) 1486 kq->kq_state |= KQ_ASYNC; 1487 else 1488 kq->kq_state &= ~KQ_ASYNC; 1489 error = 0; 1490 break; 1491 case FIOSETOWN: 1492 error = fsetown(*(int *)data, &kq->kq_sigio); 1493 break; 1494 default: 1495 error = ENOTTY; 1496 break; 1497 } 1498 lwkt_relpooltoken(kq); 1499 return (error); 1500 } 1501 1502 /* 1503 * MPSAFE 1504 */ 1505 static int 1506 kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred) 1507 { 1508 struct kqueue *kq = (struct kqueue *)fp->f_data; 1509 1510 bzero((void *)st, sizeof(*st)); 1511 st->st_size = kq->kq_count; 1512 st->st_blksize = sizeof(struct kevent); 1513 st->st_mode = S_IFIFO; 1514 return (0); 1515 } 1516 1517 /* 1518 * MPSAFE 1519 */ 1520 static int 1521 kqueue_close(struct file *fp) 1522 { 1523 struct kqueue *kq = (struct kqueue *)fp->f_data; 1524 1525 kqueue_terminate(kq); 1526 1527 fp->f_data = NULL; 1528 funsetown(&kq->kq_sigio); 1529 1530 kfree(kq, M_KQUEUE); 1531 return (0); 1532 } 1533 1534 static void 1535 kqueue_wakeup(struct kqueue *kq) 1536 { 1537 if (kq->kq_sleep_cnt) { 1538 u_int sleep_cnt = kq->kq_sleep_cnt; 1539 1540 kq->kq_sleep_cnt = 0; 1541 if (sleep_cnt == 1) 1542 wakeup_one(kq); 1543 else 1544 wakeup(kq); 1545 } 1546 KNOTE(&kq->kq_kqinfo.ki_note, 0); 1547 } 1548 1549 /* 1550 * Calls filterops f_attach function, acquiring mplock if filter is not 1551 * marked as FILTEROP_MPSAFE. 1552 * 1553 * Caller must be holding the related kq token 1554 */ 1555 static int 1556 filter_attach(struct knote *kn) 1557 { 1558 int ret; 1559 1560 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { 1561 ret = kn->kn_fop->f_attach(kn); 1562 } else { 1563 get_mplock(); 1564 ret = kn->kn_fop->f_attach(kn); 1565 rel_mplock(); 1566 } 1567 return (ret); 1568 } 1569 1570 /* 1571 * Detach the knote and drop it, destroying the knote. 1572 * 1573 * Calls filterops f_detach function, acquiring mplock if filter is not 1574 * marked as FILTEROP_MPSAFE. 1575 * 1576 * Caller must be holding the related kq token 1577 */ 1578 static void 1579 knote_detach_and_drop(struct knote *kn) 1580 { 1581 kn->kn_status |= KN_DELETING | KN_REPROCESS; 1582 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { 1583 kn->kn_fop->f_detach(kn); 1584 } else { 1585 get_mplock(); 1586 kn->kn_fop->f_detach(kn); 1587 rel_mplock(); 1588 } 1589 knote_drop(kn); 1590 } 1591 1592 /* 1593 * Calls filterops f_event function, acquiring mplock if filter is not 1594 * marked as FILTEROP_MPSAFE. 1595 * 1596 * If the knote is in the middle of being created or deleted we cannot 1597 * safely call the filter op. 1598 * 1599 * Caller must be holding the related kq token 1600 */ 1601 static int 1602 filter_event(struct knote *kn, long hint) 1603 { 1604 int ret; 1605 1606 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { 1607 ret = kn->kn_fop->f_event(kn, hint); 1608 } else { 1609 get_mplock(); 1610 ret = kn->kn_fop->f_event(kn, hint); 1611 rel_mplock(); 1612 } 1613 return (ret); 1614 } 1615 1616 /* 1617 * Walk down a list of knotes, activating them if their event has triggered. 1618 * 1619 * If we encounter any knotes which are undergoing processing we just mark 1620 * them for reprocessing and do not try to [re]activate the knote. However, 1621 * if a hint is being passed we have to wait and that makes things a bit 1622 * sticky. 1623 */ 1624 void 1625 knote(struct klist *list, long hint) 1626 { 1627 struct kqueue *kq; 1628 struct knote *kn; 1629 struct knote *kntmp; 1630 1631 lwkt_getpooltoken(list); 1632 restart: 1633 SLIST_FOREACH(kn, list, kn_next) { 1634 kq = kn->kn_kq; 1635 lwkt_getpooltoken(kq); 1636 1637 /* temporary verification hack */ 1638 SLIST_FOREACH(kntmp, list, kn_next) { 1639 if (kn == kntmp) 1640 break; 1641 } 1642 if (kn != kntmp || kn->kn_kq != kq) { 1643 lwkt_relpooltoken(kq); 1644 goto restart; 1645 } 1646 1647 if (kn->kn_status & KN_PROCESSING) { 1648 /* 1649 * Someone else is processing the knote, ask the 1650 * other thread to reprocess it and don't mess 1651 * with it otherwise. 1652 */ 1653 if (hint == 0) { 1654 kn->kn_status |= KN_REPROCESS; 1655 lwkt_relpooltoken(kq); 1656 continue; 1657 } 1658 1659 /* 1660 * If the hint is non-zero we have to wait or risk 1661 * losing the state the caller is trying to update. 1662 * 1663 * XXX This is a real problem, certain process 1664 * and signal filters will bump kn_data for 1665 * already-processed notes more than once if 1666 * we restart the list scan. FIXME. 1667 */ 1668 kn->kn_status |= KN_WAITING | KN_REPROCESS; 1669 tsleep(kn, 0, "knotec", hz); 1670 lwkt_relpooltoken(kq); 1671 goto restart; 1672 } 1673 1674 /* 1675 * Become the reprocessing master ourselves. 1676 * 1677 * If hint is non-zero running the event is mandatory 1678 * when not deleting so do it whether reprocessing is 1679 * set or not. 1680 */ 1681 kn->kn_status |= KN_PROCESSING; 1682 if ((kn->kn_status & KN_DELETING) == 0) { 1683 if (filter_event(kn, hint)) 1684 KNOTE_ACTIVATE(kn); 1685 } 1686 if (knote_release(kn)) { 1687 lwkt_relpooltoken(kq); 1688 goto restart; 1689 } 1690 lwkt_relpooltoken(kq); 1691 } 1692 lwkt_relpooltoken(list); 1693 } 1694 1695 /* 1696 * Insert knote at head of klist. 1697 * 1698 * This function may only be called via a filter function and thus 1699 * kq_token should already be held and marked for processing. 1700 */ 1701 void 1702 knote_insert(struct klist *klist, struct knote *kn) 1703 { 1704 lwkt_getpooltoken(klist); 1705 KKASSERT(kn->kn_status & KN_PROCESSING); 1706 SLIST_INSERT_HEAD(klist, kn, kn_next); 1707 lwkt_relpooltoken(klist); 1708 } 1709 1710 /* 1711 * Remove knote from a klist 1712 * 1713 * This function may only be called via a filter function and thus 1714 * kq_token should already be held and marked for processing. 1715 */ 1716 void 1717 knote_remove(struct klist *klist, struct knote *kn) 1718 { 1719 lwkt_getpooltoken(klist); 1720 KKASSERT(kn->kn_status & KN_PROCESSING); 1721 SLIST_REMOVE(klist, kn, knote, kn_next); 1722 lwkt_relpooltoken(klist); 1723 } 1724 1725 void 1726 knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst, 1727 struct filterops *ops, void *hook) 1728 { 1729 struct kqueue *kq; 1730 struct knote *kn; 1731 1732 lwkt_getpooltoken(&src->ki_note); 1733 lwkt_getpooltoken(&dst->ki_note); 1734 while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) { 1735 kq = kn->kn_kq; 1736 lwkt_getpooltoken(kq); 1737 if (SLIST_FIRST(&src->ki_note) != kn || kn->kn_kq != kq) { 1738 lwkt_relpooltoken(kq); 1739 continue; 1740 } 1741 if (knote_acquire(kn)) { 1742 knote_remove(&src->ki_note, kn); 1743 kn->kn_fop = ops; 1744 kn->kn_hook = hook; 1745 knote_insert(&dst->ki_note, kn); 1746 knote_release(kn); 1747 /* kn may be invalid now */ 1748 } 1749 lwkt_relpooltoken(kq); 1750 } 1751 lwkt_relpooltoken(&dst->ki_note); 1752 lwkt_relpooltoken(&src->ki_note); 1753 } 1754 1755 /* 1756 * Remove all knotes referencing a specified fd 1757 */ 1758 void 1759 knote_fdclose(struct file *fp, struct filedesc *fdp, int fd) 1760 { 1761 struct kqueue *kq; 1762 struct knote *kn; 1763 struct knote *kntmp; 1764 1765 lwkt_getpooltoken(&fp->f_klist); 1766 restart: 1767 SLIST_FOREACH(kn, &fp->f_klist, kn_link) { 1768 if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) { 1769 kq = kn->kn_kq; 1770 lwkt_getpooltoken(kq); 1771 1772 /* temporary verification hack */ 1773 SLIST_FOREACH(kntmp, &fp->f_klist, kn_link) { 1774 if (kn == kntmp) 1775 break; 1776 } 1777 if (kn != kntmp || kn->kn_kq->kq_fdp != fdp || 1778 kn->kn_id != fd || kn->kn_kq != kq) { 1779 lwkt_relpooltoken(kq); 1780 goto restart; 1781 } 1782 if (knote_acquire(kn)) 1783 knote_detach_and_drop(kn); 1784 lwkt_relpooltoken(kq); 1785 goto restart; 1786 } 1787 } 1788 lwkt_relpooltoken(&fp->f_klist); 1789 } 1790 1791 /* 1792 * Low level attach function. 1793 * 1794 * The knote should already be marked for processing. 1795 * Caller must hold the related kq token. 1796 */ 1797 static void 1798 knote_attach(struct knote *kn) 1799 { 1800 struct klist *list; 1801 struct kqueue *kq = kn->kn_kq; 1802 1803 if (kn->kn_fop->f_flags & FILTEROP_ISFD) { 1804 KKASSERT(kn->kn_fp); 1805 list = &kn->kn_fp->f_klist; 1806 } else { 1807 if (kq->kq_knhashmask == 0) 1808 kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, 1809 &kq->kq_knhashmask); 1810 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; 1811 } 1812 lwkt_getpooltoken(list); 1813 SLIST_INSERT_HEAD(list, kn, kn_link); 1814 lwkt_relpooltoken(list); 1815 TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink); 1816 } 1817 1818 /* 1819 * Low level drop function. 1820 * 1821 * The knote should already be marked for processing. 1822 * Caller must hold the related kq token. 1823 */ 1824 static void 1825 knote_drop(struct knote *kn) 1826 { 1827 struct kqueue *kq; 1828 struct klist *list; 1829 1830 kq = kn->kn_kq; 1831 1832 if (kn->kn_fop->f_flags & FILTEROP_ISFD) 1833 list = &kn->kn_fp->f_klist; 1834 else 1835 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; 1836 1837 lwkt_getpooltoken(list); 1838 SLIST_REMOVE(list, kn, knote, kn_link); 1839 lwkt_relpooltoken(list); 1840 TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink); 1841 if (kn->kn_status & KN_QUEUED) 1842 knote_dequeue(kn); 1843 if (kn->kn_fop->f_flags & FILTEROP_ISFD) { 1844 fdrop(kn->kn_fp); 1845 kn->kn_fp = NULL; 1846 } 1847 knote_free(kn); 1848 } 1849 1850 /* 1851 * Low level enqueue function. 1852 * 1853 * The knote should already be marked for processing. 1854 * Caller must be holding the kq token 1855 */ 1856 static void 1857 knote_enqueue(struct knote *kn) 1858 { 1859 struct kqueue *kq = kn->kn_kq; 1860 1861 KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); 1862 TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe); 1863 kn->kn_status |= KN_QUEUED; 1864 ++kq->kq_count; 1865 1866 /* 1867 * Send SIGIO on request (typically set up as a mailbox signal) 1868 */ 1869 if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1) 1870 pgsigio(kq->kq_sigio, SIGIO, 0); 1871 1872 kqueue_wakeup(kq); 1873 } 1874 1875 /* 1876 * Low level dequeue function. 1877 * 1878 * The knote should already be marked for processing. 1879 * Caller must be holding the kq token 1880 */ 1881 static void 1882 knote_dequeue(struct knote *kn) 1883 { 1884 struct kqueue *kq = kn->kn_kq; 1885 1886 KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); 1887 TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe); 1888 kn->kn_status &= ~KN_QUEUED; 1889 kq->kq_count--; 1890 } 1891 1892 static struct knote * 1893 knote_alloc(void) 1894 { 1895 return kmalloc(sizeof(struct knote), M_KQUEUE, M_WAITOK); 1896 } 1897 1898 static void 1899 knote_free(struct knote *kn) 1900 { 1901 struct knote_cache_list *cache_list; 1902 1903 cache_list = &knote_cache_lists[mycpuid]; 1904 if (cache_list->knote_cache_cnt < KNOTE_CACHE_MAX) { 1905 crit_enter(); 1906 SLIST_INSERT_HEAD(&cache_list->knote_cache, kn, kn_link); 1907 cache_list->knote_cache_cnt++; 1908 crit_exit(); 1909 return; 1910 } 1911 kfree(kn, M_KQUEUE); 1912 } 1913 1914 struct sleepinfo { 1915 void *ident; 1916 int timedout; 1917 }; 1918 1919 static void 1920 precise_sleep_intr(systimer_t info, int in_ipi, struct intrframe *frame) 1921 { 1922 struct sleepinfo *si; 1923 1924 si = info->data; 1925 si->timedout = 1; 1926 wakeup(si->ident); 1927 } 1928 1929 static int 1930 precise_sleep(void *ident, int flags, const char *wmesg, int us) 1931 { 1932 struct systimer info; 1933 struct sleepinfo si = { 1934 .ident = ident, 1935 .timedout = 0, 1936 }; 1937 int r; 1938 1939 tsleep_interlock(ident, flags); 1940 systimer_init_oneshot(&info, precise_sleep_intr, &si, 1941 us == 0 ? 1 : us); 1942 r = tsleep(ident, flags | PINTERLOCKED, wmesg, 0); 1943 systimer_del(&info); 1944 if (si.timedout) 1945 r = EWOULDBLOCK; 1946 1947 return r; 1948 } 1949