/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_event.c,v 1.2.2.10 2004/04/04 07:03:14 cperciva Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/fcntl.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/thread.h>
#include <sys/uio.h>
#include <sys/signalvar.h>
#include <sys/filio.h>
#include <sys/ktr.h>

#include <sys/thread2.h>
#include <sys/file2.h>
#include <sys/mplock2.h>

#define EVENT_REGISTER	1
#define EVENT_PROCESS	2

MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

struct kevent_copyin_args {
	struct kevent_args	*ka;
	int			pchanges;
};

#define KNOTE_CACHE_MAX		8

struct knote_cache_list {
	struct klist		knote_cache;
	int			knote_cache_cnt;
} __cachealign;

static int	kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
		    struct knote *marker);
static int	kqueue_read(struct file *fp, struct uio *uio,
		    struct ucred *cred, int flags);
static int	kqueue_write(struct file *fp, struct uio *uio,
		    struct ucred *cred, int flags);
static int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
		    struct ucred *cred, struct sysmsg *msg);
static int	kqueue_kqfilter(struct file *fp, struct knote *kn);
static int	kqueue_stat(struct file *fp, struct stat *st,
		    struct ucred *cred);
static int	kqueue_close(struct file *fp);
static void	kqueue_wakeup(struct kqueue *kq);
static int	filter_attach(struct knote *kn);
static int	filter_event(struct knote *kn, long hint);

/*
 * MPSAFE
 */
static struct fileops kqueueops = {
	.fo_read = kqueue_read,
	.fo_write = kqueue_write,
	.fo_ioctl = kqueue_ioctl,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_shutdown = nofo_shutdown
};

static void	knote_attach(struct knote *kn);
static void	knote_drop(struct knote *kn);
static void	knote_detach_and_drop(struct knote *kn);
static void	knote_enqueue(struct knote *kn);
static void	knote_dequeue(struct knote *kn);
static struct	knote *knote_alloc(void);
static void	knote_free(struct knote *kn);

static void	precise_sleep_intr(systimer_t info, int in_ipi,
		    struct intrframe *frame);
static int	precise_sleep(void *ident, int flags, const char *wmesg,
		    int us);

static void	filt_kqdetach(struct knote *kn);
static int	filt_kqueue(struct knote *kn, long hint);
static int	filt_procattach(struct knote *kn);
static void	filt_procdetach(struct knote *kn);
static int	filt_proc(struct knote *kn, long hint);
static int	filt_fileattach(struct knote *kn);
static void	filt_timerexpire(void *knx);
static int	filt_timerattach(struct knote *kn);
static void	filt_timerdetach(struct knote *kn);
static int	filt_timer(struct knote *kn, long hint);
static int	filt_userattach(struct knote *kn);
static void	filt_userdetach(struct knote *kn);
static int	filt_user(struct knote *kn, long hint);
static void	filt_usertouch(struct knote *kn, struct kevent *kev,
		    u_long type);
static int	filt_fsattach(struct knote *kn);
static void	filt_fsdetach(struct knote *kn);
static int	filt_fs(struct knote *kn, long hint);

static struct filterops file_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE, filt_fileattach, NULL, NULL };
static struct filterops kqread_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE, NULL, filt_kqdetach, filt_kqueue };
static struct filterops proc_filtops =
	{ 0, filt_procattach, filt_procdetach, filt_proc };
static struct filterops timer_filtops =
	{ FILTEROP_MPSAFE, filt_timerattach, filt_timerdetach, filt_timer };
static struct filterops user_filtops =
	{ FILTEROP_MPSAFE, filt_userattach, filt_userdetach, filt_user };
static struct filterops fs_filtops =
	{ FILTEROP_MPSAFE, filt_fsattach, filt_fsdetach, filt_fs };

static int		kq_ncallouts = 0;
static int		kq_calloutmax = (4 * 1024);
SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
static int		kq_checkloop = 1000000;
SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
    &kq_checkloop, 0, "Maximum number of loops for kqueue scan");
static int		kq_sleep_threshold = 20000;
SYSCTL_INT(_kern, OID_AUTO, kq_sleep_threshold, CTLFLAG_RW,
    &kq_sleep_threshold, 0, "Minimum sleep duration without busy-looping");

#define KNOTE_ACTIVATE(kn) do { 					\
	kn->kn_status |= KN_ACTIVE;					\
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
		knote_enqueue(kn);					\
} while(0)

#define	KN_HASHSIZE		64		/* XXX should be tunable */
#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern struct filterops aio_filtops;
extern struct filterops sig_filtops;

/*
 * Table for all system-defined filters.
 */
static struct filterops *sysfilt_ops[] = {
	&file_filtops,			/* EVFILT_READ */
	&file_filtops,			/* EVFILT_WRITE */
	&aio_filtops,			/* EVFILT_AIO */
	&file_filtops,			/* EVFILT_VNODE */
	&proc_filtops,			/* EVFILT_PROC */
	&sig_filtops,			/* EVFILT_SIGNAL */
	&timer_filtops,			/* EVFILT_TIMER */
	&file_filtops,			/* EVFILT_EXCEPT */
	&user_filtops,			/* EVFILT_USER */
	&fs_filtops,			/* EVFILT_FS */
};

static struct knote_cache_list	knote_cache_lists[MAXCPU];

/*
 * Acquire a knote, return non-zero on success, 0 on failure.
 *
 * If we cannot acquire the knote we sleep and return 0.  The knote
 * may be stale on return in this case and the caller must restart
 * whatever loop they are in.
 *
 * Related kq token must be held.
 */
static __inline int
knote_acquire(struct knote *kn)
{
	if (kn->kn_status & KN_PROCESSING) {
		kn->kn_status |= KN_WAITING | KN_REPROCESS;
		tsleep(kn, 0, "kqepts", hz);
		/* knote may be stale now */
		return(0);
	}
	kn->kn_status |= KN_PROCESSING;
	return(1);
}

/*
 * Release an acquired knote, clearing KN_PROCESSING and handling any
 * KN_REPROCESS events.
 *
 * Caller must be holding the related kq token
 *
 * Non-zero is returned if the knote is destroyed or detached.
 */
static __inline int
knote_release(struct knote *kn)
{
	int ret;

	while (kn->kn_status & KN_REPROCESS) {
		kn->kn_status &= ~KN_REPROCESS;
		if (kn->kn_status & KN_WAITING) {
			kn->kn_status &= ~KN_WAITING;
			wakeup(kn);
		}
		if (kn->kn_status & KN_DELETING) {
			knote_detach_and_drop(kn);
			return(1);
			/* NOT REACHED */
		}
		if (filter_event(kn, 0))
			KNOTE_ACTIVATE(kn);
	}
	if (kn->kn_status & KN_DETACHED)
		ret = 1;
	else
		ret = 0;
	kn->kn_status &= ~KN_PROCESSING;
	/* kn should not be accessed anymore */
	return ret;
}

static int
filt_fileattach(struct knote *kn)
{
	return (fo_kqfilter(kn->kn_fp, kn));
}

/*
 * MPSAFE
 */
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EOPNOTSUPP);

	kn->kn_fop = &kqread_filtops;
	knote_insert(&kq->kq_kqinfo.ki_note, kn);
	return (0);
}

static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	knote_remove(&kq->kq_kqinfo.ki_note, kn);
}

/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

static int
filt_procattach(struct knote *kn)
{
	struct proc *p;
	int immediate;

	immediate = 0;
	p = pfind(kn->kn_id);
	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
		p = zpfind(kn->kn_id);
		immediate = 1;
	}
	if (p == NULL) {
		return (ESRCH);
	}
	if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
		if (p)
			PRELE(p);
		return (EACCES);
	}

	lwkt_gettoken(&p->p_token);
	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;		/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	knote_insert(&p->p_klist, kn);

	/*
	 * Immediately activate any exit notes if the target process is a
	 * zombie.  This is necessary to handle the case where the target
	 * process, e.g. a child, dies before the kevent is registered.
	 */
	if (immediate && filt_proc(kn, NOTE_EXIT))
		KNOTE_ACTIVATE(kn);
	lwkt_reltoken(&p->p_token);
	PRELE(p);

	return (0);
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process does not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;
	p = kn->kn_ptr.p_proc;
	knote_remove(&p->p_klist, kn);
}

static int
filt_proc(struct knote *kn, long hint)
{
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * Process is gone, so flag the event as finished.  Detach the
	 * knote from the process now because the process will be poof,
	 * gone later on.
	 */
	if (event == NOTE_EXIT) {
		struct proc *p = kn->kn_ptr.p_proc;
		if ((kn->kn_status & KN_DETACHED) == 0) {
			PHOLD(p);
			knote_remove(&p->p_klist, kn);
			kn->kn_status |= KN_DETACHED;
			kn->kn_data = p->p_xstat;
			kn->kn_ptr.p_proc = NULL;
			PRELE(p);
		}
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		error = kqueue_register(kn->kn_kq, &kev);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}
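/*
 * Illustrative userland sketch (not part of the original file): arming the
 * EVFILT_PROC filter handled above.  The child_pid and kqfd variables and
 * the error handling are hypothetical; EV_SET() and kevent(2) are the
 * standard interfaces.
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, child_pid, EVFILT_PROC, EV_ADD,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	if (kevent(kqfd, &kev, 1, NULL, 0, NULL) < 0)
 *		err(1, "kevent");
 *
 * With NOTE_TRACK set, filt_proc() above registers a knote against each
 * forked child via the EV_FLAG1 path and reports NOTE_CHILD for it, or
 * NOTE_TRACKERR if that registration fails.
 */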
static void
filt_timerreset(struct knote *kn)
{
	struct callout *calloutp;
	struct timeval tv;
	int tticks;

	tv.tv_sec = kn->kn_sdata / 1000;
	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
	tticks = tvtohz_high(&tv);
	calloutp = (struct callout *)kn->kn_hook;
	callout_reset(calloutp, tticks, filt_timerexpire, kn);
}

/*
 * The callout interlocks with callout_terminate() but can still
 * race a deletion so if KN_DELETING is set we just don't touch
 * the knote.
 */
static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	struct kqueue *kq = kn->kn_kq;

	lwkt_getpooltoken(kq);

	/*
	 * Open knote_acquire(), since we can't sleep in callout,
	 * however, we do need to record this expiration.
	 */
	kn->kn_data++;
	if (kn->kn_status & KN_PROCESSING) {
		kn->kn_status |= KN_REPROCESS;
		if ((kn->kn_status & KN_DELETING) == 0 &&
		    (kn->kn_flags & EV_ONESHOT) == 0)
			filt_timerreset(kn);
		lwkt_relpooltoken(kq);
		return;
	}
	KASSERT((kn->kn_status & KN_DELETING) == 0,
	    ("acquire a deleting knote %#x", kn->kn_status));
	kn->kn_status |= KN_PROCESSING;

	KNOTE_ACTIVATE(kn);
	if ((kn->kn_flags & EV_ONESHOT) == 0)
		filt_timerreset(kn);

	knote_release(kn);

	lwkt_relpooltoken(kq);
}

/*
 * data contains amount of time to sleep, in milliseconds
 */
static int
filt_timerattach(struct knote *kn)
{
	struct callout *calloutp;
	int prev_ncallouts;

	prev_ncallouts = atomic_fetchadd_int(&kq_ncallouts, 1);
	if (prev_ncallouts >= kq_calloutmax) {
		atomic_subtract_int(&kq_ncallouts, 1);
		kn->kn_hook = NULL;
		return (ENOMEM);
	}

	kn->kn_flags |= EV_CLEAR;		/* automatically set */
	calloutp = kmalloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
	callout_init_mp(calloutp);
	kn->kn_hook = (caddr_t)calloutp;

	filt_timerreset(kn);
	return (0);
}

/*
 * This function is called with the knote flagged locked but it is
 * still possible to race a callout event due to the callback blocking.
 * We must call callout_terminate() instead of callout_stop() to deal
 * with the race.
 */
static void
filt_timerdetach(struct knote *kn)
{
	struct callout *calloutp;

	calloutp = (struct callout *)kn->kn_hook;
	callout_terminate(calloutp);
	kfree(calloutp, M_KQUEUE);
	atomic_subtract_int(&kq_ncallouts, 1);
}

static int
filt_timer(struct knote *kn, long hint)
{
	return (kn->kn_data != 0);
}
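/*
 * Illustrative userland sketch (not part of the original file): kn_sdata is
 * interpreted above as a period in milliseconds, so a periodic 500ms timer
 * might be armed as follows (the identifier 1 and kqfd are arbitrary):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
 *	kevent(kqfd, &kev, 1, NULL, 0, NULL);
 *
 * EV_CLEAR is forced by filt_timerattach(), and kn_data accumulates the
 * number of expirations seen since the last scan.
 */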
/*
 * EVFILT_USER
 */
static int
filt_userattach(struct knote *kn)
{
	kn->kn_hook = NULL;
	if (kn->kn_fflags & NOTE_TRIGGER)
		kn->kn_ptr.hookid = 1;
	else
		kn->kn_ptr.hookid = 0;
	return 0;
}

static void
filt_userdetach(struct knote *kn)
{
	/* nothing to do */
}

static int
filt_user(struct knote *kn, long hint)
{
	return (kn->kn_ptr.hookid);
}

static void
filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
{
	u_int ffctrl;

	switch (type) {
	case EVENT_REGISTER:
		if (kev->fflags & NOTE_TRIGGER)
			kn->kn_ptr.hookid = 1;

		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
		kev->fflags &= NOTE_FFLAGSMASK;
		switch (ffctrl) {
		case NOTE_FFNOP:
			break;

		case NOTE_FFAND:
			kn->kn_sfflags &= kev->fflags;
			break;

		case NOTE_FFOR:
			kn->kn_sfflags |= kev->fflags;
			break;

		case NOTE_FFCOPY:
			kn->kn_sfflags = kev->fflags;
			break;

		default:
			/* XXX Return error? */
			break;
		}
		kn->kn_sdata = kev->data;

		/*
		 * This is not the correct use of EV_CLEAR in an event
		 * modification, it should have been passed as a NOTE instead.
		 * But we need to maintain compatibility with Apple & FreeBSD.
		 *
		 * Note however that EV_CLEAR can still be used when doing
		 * the initial registration of the event and works as expected
		 * (clears the event on reception).
		 */
		if (kev->flags & EV_CLEAR) {
			kn->kn_ptr.hookid = 0;
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
		break;

	case EVENT_PROCESS:
		*kev = kn->kn_kevent;
		kev->fflags = kn->kn_sfflags;
		kev->data = kn->kn_sdata;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_ptr.hookid = 0;
			/* kn_data, kn_fflags handled by parent */
		}
		break;

	default:
		panic("filt_usertouch() - invalid type (%ld)", type);
		break;
	}
}
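/*
 * Illustrative userland sketch (not part of the original file): a typical
 * EVFILT_USER round trip.  One thread registers the event and waits, another
 * triggers it; the identifier 1 and kqfd are arbitrary.
 *
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kqfd, &kev, 1, NULL, 0, NULL);
 *	...
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kqfd, &kev, 1, NULL, 0, NULL);
 *
 * The NOTE_TRIGGER path sets kn_ptr.hookid via filt_usertouch() above, which
 * is what filt_user() reports as the event being active.
 */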
/*
 * EVFILT_FS
 */
struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);

static int
filt_fsattach(struct knote *kn)
{
	kn->kn_flags |= EV_CLEAR;
	knote_insert(&fs_klist, kn);

	return (0);
}

static void
filt_fsdetach(struct knote *kn)
{
	knote_remove(&fs_klist, kn);
}

static int
filt_fs(struct knote *kn, long hint)
{
	kn->kn_fflags |= hint;
	return (kn->kn_fflags != 0);
}

/*
 * Initialize a kqueue.
 *
 * NOTE: The lwp/proc code initializes a kqueue for select/poll ops.
 *
 * MPSAFE
 */
void
kqueue_init(struct kqueue *kq, struct filedesc *fdp)
{
	TAILQ_INIT(&kq->kq_knpend);
	TAILQ_INIT(&kq->kq_knlist);
	kq->kq_count = 0;
	kq->kq_fdp = fdp;
	SLIST_INIT(&kq->kq_kqinfo.ki_note);
}

/*
 * Terminate a kqueue.  Freeing the actual kq itself is left up to the
 * caller (it might be embedded in a lwp so we don't do it here).
 *
 * The kq's knlist must be completely eradicated so block on any
 * processing races.
 */
void
kqueue_terminate(struct kqueue *kq)
{
	struct knote *kn;

	lwkt_getpooltoken(kq);
	while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
		if (knote_acquire(kn))
			knote_detach_and_drop(kn);
	}
	lwkt_relpooltoken(kq);

	if (kq->kq_knhash) {
		hashdestroy(kq->kq_knhash, M_KQUEUE, kq->kq_knhashmask);
		kq->kq_knhash = NULL;
		kq->kq_knhashmask = 0;
	}
}

/*
 * MPSAFE
 */
int
sys_kqueue(struct kqueue_args *uap)
{
	struct thread *td = curthread;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	error = falloc(td->td_lwp, &fp, &fd);
	if (error)
		return (error);
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;

	kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
	kqueue_init(kq, td->td_proc->p_fd);
	fp->f_data = kq;

	fsetfd(kq->kq_fdp, fp, fd);
	uap->sysmsg_result = fd;
	fdrop(fp);
	return (error);
}
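/*
 * Illustrative userland sketch (not part of the original file): the minimal
 * kqueue(2)/kevent(2) usage served by this syscall.  The monitored
 * descriptor fd is hypothetical.
 *
 *	int kqfd = kqueue();
 *	struct kevent kev, ev;
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kqfd, &kev, 1, NULL, 0, NULL);		(register only)
 *	kevent(kqfd, NULL, 0, &ev, 1, NULL);		(wait for one event)
 *
 * Both the registration pass and the wait/scan pass go through
 * kern_kevent() below.
 */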
/*
 * Copy 'count' items into the destination list pointed to by uap->eventlist.
 */
static int
kevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct kevent_copyin_args *kap;
	int error;

	kap = (struct kevent_copyin_args *)arg;

	error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp));
	if (error == 0) {
		kap->ka->eventlist += count;
		*res += count;
	} else {
		*res = -1;
	}

	return (error);
}

/*
 * Copy at most 'max' items from the list pointed to by kap->changelist,
 * return number of items in 'events'.
 */
static int
kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
{
	struct kevent_copyin_args *kap;
	int error, count;

	kap = (struct kevent_copyin_args *)arg;

	count = min(kap->ka->nchanges - kap->pchanges, max);
	error = copyin(kap->ka->changelist, kevp, count * sizeof *kevp);
	if (error == 0) {
		kap->ka->changelist += count;
		kap->pchanges += count;
		*events = count;
	}

	return (error);
}

/*
 * MPSAFE
 */
int
kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
	    k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn,
	    struct timespec *tsp_in, int flags)
{
	struct kevent *kevp;
	struct timespec *tsp, ats;
	int i, n, total, error, nerrors = 0;
	int lres;
	int limit = kq_checkloop;
	struct kevent kev[KQ_NEVENTS];
	struct knote marker;
	struct lwkt_token *tok;

	if (tsp_in == NULL || tsp_in->tv_sec || tsp_in->tv_nsec)
		atomic_set_int(&curthread->td_mpflags, TDF_MP_BATCH_DEMARC);

	tsp = tsp_in;
	*res = 0;

	for (;;) {
		n = 0;
		error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
		if (error)
			return error;
		if (n == 0)
			break;
		for (i = 0; i < n; i++) {
			kevp = &kev[i];
			kevp->flags &= ~EV_SYSFLAGS;
			error = kqueue_register(kq, kevp);

			/*
			 * If a registration returns an error we
			 * immediately post the error.  The kevent()
			 * call itself will fail with the error if
			 * no space is available for posting.
			 *
			 * Such errors normally bypass the timeout/blocking
			 * code.  However, if the copyoutfn function refuses
			 * to post the error (see sys_poll()), then we
			 * ignore it too.
			 */
			if (error || (kevp->flags & EV_RECEIPT)) {
				kevp->flags = EV_ERROR;
				kevp->data = error;
				lres = *res;
				kevent_copyoutfn(uap, kevp, 1, res);
				if (*res < 0) {
					return error;
				} else if (lres != *res) {
					nevents--;
					nerrors++;
				}
			}
		}
	}
	if (nerrors)
		return 0;

	/*
	 * Acquire/wait for events - setup timeout
	 */
	if (tsp != NULL) {
		if (tsp->tv_sec || tsp->tv_nsec) {
			getnanouptime(&ats);
			timespecadd(tsp, &ats);		/* tsp = target time */
		}
	}

	/*
	 * Loop as required.
	 *
	 * Collect as many events as we can.  Sleeping on successive
	 * loops is disabled if copyoutfn has incremented (*res).
	 *
	 * The loop stops if an error occurs, all events have been
	 * scanned (the marker has been reached), or fewer than the
	 * maximum number of events is found.
	 *
	 * The copyoutfn function does not have to increment (*res) in
	 * order for the loop to continue.
	 *
	 * NOTE: doselect() usually passes 0x7FFFFFFF for nevents.
	 */
	total = 0;
	error = 0;
	marker.kn_filter = EVFILT_MARKER;
	marker.kn_status = KN_PROCESSING;
	tok = lwkt_token_pool_lookup(kq);
	lwkt_gettoken(tok);
	TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
	lwkt_reltoken(tok);
	while ((n = nevents - total) > 0) {
		if (n > KQ_NEVENTS)
			n = KQ_NEVENTS;

		/*
		 * If no events are pending sleep until timeout (if any)
		 * or an event occurs.
		 *
		 * After the sleep completes the marker is moved to the
		 * end of the list, making any received events available
		 * to our scan.
		 */
		if (kq->kq_count == 0 && *res == 0) {
			int timeout, ustimeout = 0;

			if (tsp == NULL) {
				timeout = 0;
			} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				struct timespec atx = *tsp;

				getnanouptime(&ats);
				timespecsub(&atx, &ats);
				if (atx.tv_sec < 0) {
					error = EWOULDBLOCK;
					break;
				} else {
					timeout = atx.tv_sec > 24 * 60 * 60 ?
					    24 * 60 * 60 * hz :
					    tstohz_high(&atx);
				}
				if (flags & KEVENT_TIMEOUT_PRECISE &&
				    timeout != 0) {
					if (atx.tv_sec == 0 &&
					    atx.tv_nsec < kq_sleep_threshold) {
						DELAY(atx.tv_nsec / 1000);
						error = EWOULDBLOCK;
						break;
					} else if (atx.tv_sec < 2000) {
						ustimeout = atx.tv_sec *
						    1000000 + atx.tv_nsec/1000;
					} else {
						ustimeout = 2000000000;
					}
				}
			}

			lwkt_gettoken(tok);
			if (kq->kq_count == 0) {
				kq->kq_sleep_cnt++;
				if (__predict_false(kq->kq_sleep_cnt == 0)) {
					/*
					 * Guard against possible wrapping.  And
					 * set it to 2, so that kqueue_wakeup()
					 * can wake everyone up.
					 */
					kq->kq_sleep_cnt = 2;
				}
				if ((flags & KEVENT_TIMEOUT_PRECISE) &&
				    timeout != 0) {
					error = precise_sleep(kq, PCATCH,
					    "kqread", ustimeout);
				} else {
					error = tsleep(kq, PCATCH, "kqread",
					    timeout);
				}

				/* don't restart after signals... */
				if (error == ERESTART)
					error = EINTR;
				if (error) {
					lwkt_reltoken(tok);
					break;
				}

				TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
				TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker,
				    kn_tqe);
			}
			lwkt_reltoken(tok);
		}

		/*
		 * Process all received events
		 * Account for all non-spurious events in our total
		 */
		i = kqueue_scan(kq, kev, n, &marker);
		if (i) {
			lres = *res;
			error = kevent_copyoutfn(uap, kev, i, res);
			total += *res - lres;
			if (error)
				break;
		}
		if (limit && --limit == 0)
			panic("kqueue: checkloop failed i=%d", i);

		/*
		 * Normally when fewer events are returned than requested
		 * we can stop.  However, if only spurious events were
		 * collected the copyout will not bump (*res) and we have
		 * to continue.
		 */
		if (i < n && *res)
			break;

		/*
		 * Deal with an edge case where spurious events can cause
		 * a loop to occur without moving the marker.  This can
		 * prevent kqueue_scan() from picking up new events which
		 * race us.  We must be sure to move the marker for this
		 * case.
		 *
		 * NOTE: We do not want to move the marker if events
		 *	 were scanned because normal kqueue operations
		 *	 may reactivate events.  Moving the marker in
		 *	 that case could result in duplicates for the
		 *	 same event.
		 */
		if (i == 0) {
			lwkt_gettoken(tok);
			TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
			TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
			lwkt_reltoken(tok);
		}
	}
	lwkt_gettoken(tok);
	TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
	lwkt_reltoken(tok);

	/* Timeouts do not return EWOULDBLOCK. */
	if (error == EWOULDBLOCK)
		error = 0;
	return error;
}
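/*
 * Illustrative userland sketch (not part of the original file): the
 * EV_RECEIPT handling in the registration loop above lets a caller force
 * per-change status reporting without draining pending events, e.g.
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);
 *	n = kevent(kqfd, &kev, 1, &kev, 1, NULL);
 *
 * Each such change is copied back with EV_ERROR set and the error (or 0)
 * in data, and kern_kevent() then returns right after the registration
 * pass instead of entering the timeout/blocking phase.
 */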
/*
 * MPALMOSTSAFE
 */
int
sys_kevent(struct kevent_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct timespec ts, *tsp;
	struct kqueue *kq;
	struct file *fp = NULL;
	struct kevent_copyin_args *kap, ka;
	int error;

	if (uap->timeout) {
		error = copyin(uap->timeout, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else {
		tsp = NULL;
	}
	fp = holdfp(p->p_fd, uap->fd, -1);
	if (fp == NULL)
		return (EBADF);
	if (fp->f_type != DTYPE_KQUEUE) {
		fdrop(fp);
		return (EBADF);
	}

	kq = (struct kqueue *)fp->f_data;

	kap = &ka;
	kap->ka = uap;
	kap->pchanges = 0;

	error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap,
			    kevent_copyin, kevent_copyout, tsp, 0);

	fdrop(fp);

	return (error);
}

int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
	struct filedesc *fdp = kq->kq_fdp;
	struct klist *list = NULL;
	struct filterops *fops;
	struct file *fp = NULL;
	struct knote *kn = NULL;
	struct thread *td;
	int error = 0;
	struct knote_cache_list *cache_list;

	if (kev->filter < 0) {
		if (kev->filter + EVFILT_SYSCOUNT < 0)
			return (EINVAL);
		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
	} else {
		/*
		 * XXX
		 * filter attach routine is responsible for ensuring that
		 * the identifier can be attached to it.
		 */
		return (EINVAL);
	}

	if (fops->f_flags & FILTEROP_ISFD) {
		/* validate descriptor */
		fp = holdfp(fdp, kev->ident, -1);
		if (fp == NULL)
			return (EBADF);
	}

	cache_list = &knote_cache_lists[mycpuid];
	if (SLIST_EMPTY(&cache_list->knote_cache)) {
		struct knote *new_kn;

		new_kn = knote_alloc();
		crit_enter();
		SLIST_INSERT_HEAD(&cache_list->knote_cache, new_kn, kn_link);
		cache_list->knote_cache_cnt++;
		crit_exit();
	}

	td = curthread;
	lwkt_getpooltoken(kq);

	/*
	 * Make sure that only one thread can register event on this kqueue,
	 * so that we would not suffer any race, even if the registration
	 * blocked, i.e. kq token was released, and the kqueue was shared
	 * between threads (this should be rare though).
	 */
	while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) {
		kq->kq_state |= KQ_REGWAIT;
		tsleep(&kq->kq_regtd, 0, "kqreg", 0);
	}
	if (__predict_false(kq->kq_regtd != NULL)) {
		/* Recursive calling of kqueue_register() */
		td = NULL;
	} else {
		/* Owner of the kq_regtd, i.e. td != NULL */
		kq->kq_regtd = td;
	}

	if (fp != NULL) {
		list = &fp->f_klist;
	} else if (kq->kq_knhashmask) {
		list = &kq->kq_knhash[
		    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
	}
	if (list != NULL) {
		lwkt_getpooltoken(list);
again:
		SLIST_FOREACH(kn, list, kn_link) {
			if (kn->kn_kq == kq &&
			    kn->kn_filter == kev->filter &&
			    kn->kn_id == kev->ident) {
				if (knote_acquire(kn) == 0)
					goto again;
				break;
			}
		}
		lwkt_relpooltoken(list);
	}

	/*
	 * NOTE: At this point if kn is non-NULL we will have acquired
	 *	 it and set KN_PROCESSING.
	 */
	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		error = ENOENT;
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			crit_enter();
			kn = SLIST_FIRST(&cache_list->knote_cache);
			if (kn == NULL) {
				crit_exit();
				kn = knote_alloc();
			} else {
				SLIST_REMOVE_HEAD(&cache_list->knote_cache,
				    kn_link);
				cache_list->knote_cache_cnt--;
				crit_exit();
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			/*
			 * KN_PROCESSING prevents the knote from getting
			 * ripped out from under us while we are trying
			 * to attach it, in case the attach blocks.
			 */
			kn->kn_status = KN_PROCESSING;
			knote_attach(kn);
			if ((error = filter_attach(kn)) != 0) {
				kn->kn_status |= KN_DELETING | KN_REPROCESS;
				knote_drop(kn);
				goto done;
			}

			/*
			 * Interlock against close races which either tried
			 * to remove our knote while we were blocked or missed
			 * it entirely prior to our attachment.  We do not
			 * want to end up with a knote on a closed descriptor.
			 */
			if ((fops->f_flags & FILTEROP_ISFD) &&
			    checkfdclosed(fdp, kev->ident, kn->kn_fp)) {
				kn->kn_status |= KN_DELETING | KN_REPROCESS;
			}
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			KKASSERT(kn->kn_status & KN_PROCESSING);
			if (fops == &user_filtops) {
				filt_usertouch(kn, kev, EVENT_REGISTER);
			} else {
				kn->kn_sfflags = kev->fflags;
				kn->kn_sdata = kev->data;
				kn->kn_kevent.udata = kev->udata;
			}
		}

		/*
		 * Execute the filter event to immediately activate the
		 * knote if necessary.  If reprocessing events are pending
		 * due to blocking above we do not run the filter here
		 * but instead let knote_release() do it.  Otherwise we
		 * might run the filter on a deleted event.
		 */
		if ((kn->kn_status & KN_REPROCESS) == 0) {
			if (filter_event(kn, 0))
				KNOTE_ACTIVATE(kn);
		}
	} else if (kev->flags & EV_DELETE) {
		/*
		 * Delete the existing knote
		 */
		knote_detach_and_drop(kn);
		goto done;
	} else {
		/*
		 * Modify an existing event.
		 *
		 * The user may change some filter values after the
		 * initial EV_ADD, but doing so will not reset any
		 * filters which have already been triggered.
		 */
		KKASSERT(kn->kn_status & KN_PROCESSING);
		if (fops == &user_filtops) {
			filt_usertouch(kn, kev, EVENT_REGISTER);
		} else {
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}

		/*
		 * Execute the filter event to immediately activate the
		 * knote if necessary.  If reprocessing events are pending
		 * due to blocking above we do not run the filter here
		 * but instead let knote_release() do it.  Otherwise we
		 * might run the filter on a deleted event.
		 */
		if ((kn->kn_status & KN_REPROCESS) == 0) {
			if (filter_event(kn, 0))
				KNOTE_ACTIVATE(kn);
		}
	}

	/*
	 * Disablement does not deactivate a knote here.
	 */
	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		kn->kn_status |= KN_DISABLED;
	}

	/*
	 * Re-enablement may have to immediately enqueue an active knote.
	 */
	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0)) {
			knote_enqueue(kn);
		}
	}

	/*
	 * Handle any required reprocessing
	 */
	knote_release(kn);
	/* kn may be invalid now */

done:
	if (td != NULL) { /* Owner of the kq_regtd */
		kq->kq_regtd = NULL;
		if (__predict_false(kq->kq_state & KQ_REGWAIT)) {
			kq->kq_state &= ~KQ_REGWAIT;
			wakeup(&kq->kq_regtd);
		}
	}
	lwkt_relpooltoken(kq);
	if (fp != NULL)
		fdrop(fp);
	return (error);
}

/*
 * Scan the kqueue, return the number of active events placed in kevp up
 * to count.
 *
 * Continuous mode events may get recycled, do not continue scanning past
 * marker unless no events have been collected.
 */
static int
kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
	    struct knote *marker)
{
	struct knote *kn, local_marker;
	int total;

	total = 0;
	local_marker.kn_filter = EVFILT_MARKER;
	local_marker.kn_status = KN_PROCESSING;

	lwkt_getpooltoken(kq);

	/*
	 * Collect events.
	 */
	TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe);
	while (count) {
		kn = TAILQ_NEXT(&local_marker, kn_tqe);
		if (kn->kn_filter == EVFILT_MARKER) {
			/* Marker reached, we are done */
			if (kn == marker)
				break;

			/* Move local marker past some other threads marker */
			kn = TAILQ_NEXT(kn, kn_tqe);
			TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
			TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe);
			continue;
		}

		/*
		 * We can't skip a knote undergoing processing, otherwise
		 * we risk not returning it when the user process expects
		 * it should be returned.  Sleep and retry.
		 */
		if (knote_acquire(kn) == 0)
			continue;

		/*
		 * Remove the event for processing.
		 *
		 * WARNING!  We must leave KN_QUEUED set to prevent the
		 *	     event from being KNOTE_ACTIVATE()d while
		 *	     the queue state is in limbo, in case we
		 *	     block.
		 */
		TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
		kq->kq_count--;

		/*
		 * We have to deal with an extremely important race against
		 * file descriptor close()s here.  The file descriptor can
		 * disappear MPSAFE, and there is a small window of
		 * opportunity between that and the call to knote_fdclose().
		 *
		 * If we hit that window here while doselect or dopoll is
		 * trying to delete a spurious event they will not be able
		 * to match up the event against a knote and will go haywire.
		 */
		if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
		    checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
			kn->kn_status |= KN_DELETING | KN_REPROCESS;
		}

		if (kn->kn_status & KN_DISABLED) {
			/*
			 * If disabled we ensure the event is not queued
			 * but leave its active bit set.  On re-enablement
			 * the event may be immediately triggered.
			 */
			kn->kn_status &= ~KN_QUEUED;
		} else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
			   (kn->kn_status & KN_DELETING) == 0 &&
			   filter_event(kn, 0) == 0) {
			/*
			 * If not running in one-shot mode and the event
			 * is no longer present we ensure it is removed
			 * from the queue and ignore it.
			 */
			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
		} else {
			/*
			 * Post the event
			 */
			if (kn->kn_fop == &user_filtops)
				filt_usertouch(kn, kevp, EVENT_PROCESS);
			else
				*kevp = kn->kn_kevent;
			++kevp;
			++total;
			--count;

			if (kn->kn_flags & EV_ONESHOT) {
				kn->kn_status &= ~KN_QUEUED;
				kn->kn_status |= KN_DELETING | KN_REPROCESS;
			} else {
				if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
					if (kn->kn_flags & EV_CLEAR) {
						kn->kn_data = 0;
						kn->kn_fflags = 0;
					}
					if (kn->kn_flags & EV_DISPATCH) {
						kn->kn_status |= KN_DISABLED;
					}
					kn->kn_status &= ~(KN_QUEUED |
							   KN_ACTIVE);
				} else {
					TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
					kq->kq_count++;
				}
			}
		}

		/*
		 * Handle any post-processing states
		 */
		knote_release(kn);
	}
	TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);

	lwkt_relpooltoken(kq);
	return (total);
}
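/*
 * Illustrative note (not part of the original file): the EV_CLEAR/EV_DISPATCH
 * branch above is where the edge-triggered styles diverge.  A registration
 * such as
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, NULL);
 *
 * is delivered once and then left KN_DISABLED until re-enabled with
 * EV_ENABLE, while EV_CLEAR merely zeroes kn_data/kn_fflags after delivery.
 * Level-triggered knotes (neither flag) are requeued at the tail so they are
 * picked up again on the next scan.
 */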
/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 *
 * MPSAFE
 */
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	return (ENXIO);
}

/*
 * MPSAFE
 */
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	return (ENXIO);
}

/*
 * MPALMOSTSAFE
 */
static int
kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
	     struct ucred *cred, struct sysmsg *msg)
{
	struct kqueue *kq;
	int error;

	kq = (struct kqueue *)fp->f_data;
	lwkt_getpooltoken(kq);
	switch(com) {
	case FIOASYNC:
		if (*(int *)data)
			kq->kq_state |= KQ_ASYNC;
		else
			kq->kq_state &= ~KQ_ASYNC;
		error = 0;
		break;
	case FIOSETOWN:
		error = fsetown(*(int *)data, &kq->kq_sigio);
		break;
	default:
		error = ENOTTY;
		break;
	}
	lwkt_relpooltoken(kq);
	return (error);
}

/*
 * MPSAFE
 */
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;

	bzero((void *)st, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;
	return (0);
}

/*
 * MPSAFE
 */
static int
kqueue_close(struct file *fp)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;

	kqueue_terminate(kq);

	fp->f_data = NULL;
	funsetown(&kq->kq_sigio);

	kfree(kq, M_KQUEUE);
	return (0);
}

static void
kqueue_wakeup(struct kqueue *kq)
{
	if (kq->kq_sleep_cnt) {
		u_int sleep_cnt = kq->kq_sleep_cnt;

		kq->kq_sleep_cnt = 0;
		if (sleep_cnt == 1)
			wakeup_one(kq);
		else
			wakeup(kq);
	}
	KNOTE(&kq->kq_kqinfo.ki_note, 0);
}
/*
 * Calls filterops f_attach function, acquiring mplock if filter is not
 * marked as FILTEROP_MPSAFE.
 *
 * Caller must be holding the related kq token
 */
static int
filter_attach(struct knote *kn)
{
	int ret;

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		ret = kn->kn_fop->f_attach(kn);
	} else {
		get_mplock();
		ret = kn->kn_fop->f_attach(kn);
		rel_mplock();
	}
	return (ret);
}

/*
 * Detach the knote and drop it, destroying the knote.
 *
 * Calls filterops f_detach function, acquiring mplock if filter is not
 * marked as FILTEROP_MPSAFE.
 *
 * Caller must be holding the related kq token
 */
static void
knote_detach_and_drop(struct knote *kn)
{
	kn->kn_status |= KN_DELETING | KN_REPROCESS;
	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		kn->kn_fop->f_detach(kn);
	} else {
		get_mplock();
		kn->kn_fop->f_detach(kn);
		rel_mplock();
	}
	knote_drop(kn);
}

/*
 * Calls filterops f_event function, acquiring mplock if filter is not
 * marked as FILTEROP_MPSAFE.
 *
 * If the knote is in the middle of being created or deleted we cannot
 * safely call the filter op.
 *
 * Caller must be holding the related kq token
 */
static int
filter_event(struct knote *kn, long hint)
{
	int ret;

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		ret = kn->kn_fop->f_event(kn, hint);
	} else {
		get_mplock();
		ret = kn->kn_fop->f_event(kn, hint);
		rel_mplock();
	}
	return (ret);
}

/*
 * Walk down a list of knotes, activating them if their event has triggered.
 *
 * If we encounter any knotes which are undergoing processing we just mark
 * them for reprocessing and do not try to [re]activate the knote.  However,
 * if a hint is being passed we have to wait and that makes things a bit
 * sticky.
 */
void
knote(struct klist *list, long hint)
{
	struct kqueue *kq;
	struct knote *kn;
	struct knote *kntmp;

	lwkt_getpooltoken(list);
restart:
	SLIST_FOREACH(kn, list, kn_next) {
		kq = kn->kn_kq;
		lwkt_getpooltoken(kq);

		/* temporary verification hack */
		SLIST_FOREACH(kntmp, list, kn_next) {
			if (kn == kntmp)
				break;
		}
		if (kn != kntmp || kn->kn_kq != kq) {
			lwkt_relpooltoken(kq);
			goto restart;
		}

		if (kn->kn_status & KN_PROCESSING) {
			/*
			 * Someone else is processing the knote, ask the
			 * other thread to reprocess it and don't mess
			 * with it otherwise.
			 */
			if (hint == 0) {
				kn->kn_status |= KN_REPROCESS;
				lwkt_relpooltoken(kq);
				continue;
			}

			/*
			 * If the hint is non-zero we have to wait or risk
			 * losing the state the caller is trying to update.
			 *
			 * XXX This is a real problem, certain process
			 *     and signal filters will bump kn_data for
			 *     already-processed notes more than once if
			 *     we restart the list scan.  FIXME.
			 */
			kn->kn_status |= KN_WAITING | KN_REPROCESS;
			tsleep(kn, 0, "knotec", hz);
			lwkt_relpooltoken(kq);
			goto restart;
		}

		/*
		 * Become the reprocessing master ourselves.
		 *
		 * If hint is non-zero running the event is mandatory
		 * when not deleting so do it whether reprocessing is
		 * set or not.
		 */
		kn->kn_status |= KN_PROCESSING;
		if ((kn->kn_status & KN_DELETING) == 0) {
			if (filter_event(kn, hint))
				KNOTE_ACTIVATE(kn);
		}
		if (knote_release(kn)) {
			lwkt_relpooltoken(kq);
			goto restart;
		}
		lwkt_relpooltoken(kq);
	}
	lwkt_relpooltoken(list);
}
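/*
 * Illustrative sketch (not part of the original file): producers generally
 * do not call knote() directly; they use the KNOTE() macro, as
 * kqueue_wakeup() above does, against the klist embedded in their kqinfo.
 * A hypothetical driver interrupt path (sc, sc_token and sc_bytes_ready are
 * made-up names) might look like:
 *
 *	lwkt_gettoken(&sc->sc_token);
 *	sc->sc_bytes_ready += n;
 *	KNOTE(&sc->sc_kqinfo.ki_note, 0);
 *	lwkt_reltoken(&sc->sc_token);
 *
 * which lands here with hint == 0 and activates any knotes whose filter
 * reports the event as true.
 */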
/*
 * Insert knote at head of klist.
 *
 * This function may only be called via a filter function and thus
 * kq_token should already be held and marked for processing.
 */
void
knote_insert(struct klist *klist, struct knote *kn)
{
	lwkt_getpooltoken(klist);
	KKASSERT(kn->kn_status & KN_PROCESSING);
	SLIST_INSERT_HEAD(klist, kn, kn_next);
	lwkt_relpooltoken(klist);
}

/*
 * Remove knote from a klist
 *
 * This function may only be called via a filter function and thus
 * kq_token should already be held and marked for processing.
 */
void
knote_remove(struct klist *klist, struct knote *kn)
{
	lwkt_getpooltoken(klist);
	KKASSERT(kn->kn_status & KN_PROCESSING);
	SLIST_REMOVE(klist, kn, knote, kn_next);
	lwkt_relpooltoken(klist);
}

void
knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
		    struct filterops *ops, void *hook)
{
	struct kqueue *kq;
	struct knote *kn;

	lwkt_getpooltoken(&src->ki_note);
	lwkt_getpooltoken(&dst->ki_note);
	while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
		kq = kn->kn_kq;
		lwkt_getpooltoken(kq);
		if (SLIST_FIRST(&src->ki_note) != kn || kn->kn_kq != kq) {
			lwkt_relpooltoken(kq);
			continue;
		}
		if (knote_acquire(kn)) {
			knote_remove(&src->ki_note, kn);
			kn->kn_fop = ops;
			kn->kn_hook = hook;
			knote_insert(&dst->ki_note, kn);
			knote_release(kn);
			/* kn may be invalid now */
		}
		lwkt_relpooltoken(kq);
	}
	lwkt_relpooltoken(&dst->ki_note);
	lwkt_relpooltoken(&src->ki_note);
}

/*
 * Remove all knotes referencing a specified fd
 */
void
knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
{
	struct kqueue *kq;
	struct knote *kn;
	struct knote *kntmp;

	lwkt_getpooltoken(&fp->f_klist);
restart:
	SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
		if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
			kq = kn->kn_kq;
			lwkt_getpooltoken(kq);

			/* temporary verification hack */
			SLIST_FOREACH(kntmp, &fp->f_klist, kn_link) {
				if (kn == kntmp)
					break;
			}
			if (kn != kntmp || kn->kn_kq->kq_fdp != fdp ||
			    kn->kn_id != fd || kn->kn_kq != kq) {
				lwkt_relpooltoken(kq);
				goto restart;
			}
			if (knote_acquire(kn))
				knote_detach_and_drop(kn);
			lwkt_relpooltoken(kq);
			goto restart;
		}
	}
	lwkt_relpooltoken(&fp->f_klist);
}

/*
 * Low level attach function.
 *
 * The knote should already be marked for processing.
 * Caller must hold the related kq token.
 */
static void
knote_attach(struct knote *kn)
{
	struct klist *list;
	struct kqueue *kq = kn->kn_kq;

	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
		KKASSERT(kn->kn_fp);
		list = &kn->kn_fp->f_klist;
	} else {
		if (kq->kq_knhashmask == 0)
			kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
						 &kq->kq_knhashmask);
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}
	lwkt_getpooltoken(list);
	SLIST_INSERT_HEAD(list, kn, kn_link);
	lwkt_relpooltoken(list);
	TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
}

/*
 * Low level drop function.
 *
 * The knote should already be marked for processing.
 * Caller must hold the related kq token.
 */
static void
knote_drop(struct knote *kn)
{
	struct kqueue *kq;
	struct klist *list;

	kq = kn->kn_kq;

	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
		list = &kn->kn_fp->f_klist;
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];

	lwkt_getpooltoken(list);
	SLIST_REMOVE(list, kn, knote, kn_link);
	lwkt_relpooltoken(list);
	TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
		fdrop(kn->kn_fp);
		kn->kn_fp = NULL;
	}
	knote_free(kn);
}

/*
 * Low level enqueue function.
 *
 * The knote should already be marked for processing.
 * Caller must be holding the kq token
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
	TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	++kq->kq_count;

	/*
	 * Send SIGIO on request (typically set up as a mailbox signal)
	 */
	if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
		pgsigio(kq->kq_sigio, SIGIO, 0);

	kqueue_wakeup(kq);
}

/*
 * Low level dequeue function.
 *
 * The knote should already be marked for processing.
 * Caller must be holding the kq token
 */
static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
	TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
}

static struct knote *
knote_alloc(void)
{
	return kmalloc(sizeof(struct knote), M_KQUEUE, M_WAITOK);
}

static void
knote_free(struct knote *kn)
{
	struct knote_cache_list *cache_list;

	cache_list = &knote_cache_lists[mycpuid];
	if (cache_list->knote_cache_cnt < KNOTE_CACHE_MAX) {
		crit_enter();
		SLIST_INSERT_HEAD(&cache_list->knote_cache, kn, kn_link);
		cache_list->knote_cache_cnt++;
		crit_exit();
		return;
	}
	kfree(kn, M_KQUEUE);
}

struct sleepinfo {
	void	*ident;
	int	timedout;
};

static void
precise_sleep_intr(systimer_t info, int in_ipi, struct intrframe *frame)
{
	struct sleepinfo *si;

	si = info->data;
	si->timedout = 1;
	wakeup(si->ident);
}

static int
precise_sleep(void *ident, int flags, const char *wmesg, int us)
{
	struct systimer info;
	struct sleepinfo si = {
		.ident = ident,
		.timedout = 0,
	};
	int r;

	tsleep_interlock(ident, flags);
	systimer_init_oneshot(&info, precise_sleep_intr, &si,
	    us == 0 ? 1 : us);
	r = tsleep(ident, flags | PINTERLOCKED, wmesg, 0);
	systimer_del(&info);
	if (si.timedout)
		r = EWOULDBLOCK;

	return r;
}