/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_event.c,v 1.2.2.10 2004/04/04 07:03:14 cperciva Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/fcntl.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/thread.h>
#include <sys/uio.h>
#include <sys/signalvar.h>
#include <sys/filio.h>
#include <sys/ktr.h>

#include <sys/thread2.h>
#include <sys/file2.h>
#include <sys/mplock2.h>

#define EVENT_REGISTER	1
#define EVENT_PROCESS	2

MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

struct kevent_copyin_args {
	struct kevent_args	*ka;
	int			pchanges;
};

#define KNOTE_CACHE_MAX		8

struct knote_cache_list {
	struct klist		knote_cache;
	int			knote_cache_cnt;
} __cachealign;

static int	kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
		    struct knote *marker);
static int	kqueue_read(struct file *fp, struct uio *uio,
		    struct ucred *cred, int flags);
static int	kqueue_write(struct file *fp, struct uio *uio,
		    struct ucred *cred, int flags);
static int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
		    struct ucred *cred, struct sysmsg *msg);
static int	kqueue_kqfilter(struct file *fp, struct knote *kn);
static int	kqueue_stat(struct file *fp, struct stat *st,
		    struct ucred *cred);
static int	kqueue_close(struct file *fp);
static void	kqueue_wakeup(struct kqueue *kq);
static int	filter_attach(struct knote *kn);
static int	filter_event(struct knote *kn, long hint);

/*
 * MPSAFE
 */
static struct fileops kqueueops = {
	.fo_read = kqueue_read,
	.fo_write = kqueue_write,
	.fo_ioctl = kqueue_ioctl,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_shutdown = nofo_shutdown
};

static void	knote_attach(struct knote *kn);
static void	knote_drop(struct knote *kn);
static void	knote_detach_and_drop(struct knote *kn);
static void	knote_enqueue(struct knote *kn);
static void	knote_dequeue(struct knote *kn);
static struct knote *knote_alloc(void);
static void	knote_free(struct knote *kn);

static void	filt_kqdetach(struct knote *kn);
static int	filt_kqueue(struct knote *kn, long hint);
static int	filt_procattach(struct knote *kn);
static void	filt_procdetach(struct knote *kn);
static int	filt_proc(struct knote *kn, long hint);
static int	filt_fileattach(struct knote *kn);
static void	filt_timerexpire(void *knx);
static int	filt_timerattach(struct knote *kn);
static void	filt_timerdetach(struct knote *kn);
static int	filt_timer(struct knote *kn, long hint);
static int	filt_userattach(struct knote *kn);
static void	filt_userdetach(struct knote *kn);
static int	filt_user(struct knote *kn, long hint);
static void	filt_usertouch(struct knote *kn, struct kevent *kev,
		    u_long type);
static int	filt_fsattach(struct knote *kn);
static void	filt_fsdetach(struct knote *kn);
static int	filt_fs(struct knote *kn, long hint);

static struct filterops file_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE, filt_fileattach, NULL, NULL };
static struct filterops kqread_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE, NULL, filt_kqdetach, filt_kqueue };
static struct filterops proc_filtops =
	{ 0, filt_procattach, filt_procdetach, filt_proc };
static struct filterops timer_filtops =
	{ FILTEROP_MPSAFE, filt_timerattach, filt_timerdetach, filt_timer };
static struct filterops user_filtops =
	{ FILTEROP_MPSAFE, filt_userattach, filt_userdetach, filt_user };
static struct filterops fs_filtops =
	{ FILTEROP_MPSAFE, filt_fsattach, filt_fsdetach, filt_fs };

static int	kq_ncallouts = 0;
static int	kq_calloutmax = (4 * 1024);
SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
static int	kq_checkloop = 1000000;
SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
    &kq_checkloop, 0, "Maximum number of loops for kqueue scan");

#define KNOTE_ACTIVATE(kn) do {					\
	kn->kn_status |= KN_ACTIVE;				\
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)	\
		knote_enqueue(kn);				\
} while(0)

#define KN_HASHSIZE		64	/* XXX should be tunable */
#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern struct filterops aio_filtops;
extern struct filterops sig_filtops;

/*
 * Table for all system-defined filters.
 */
static struct filterops *sysfilt_ops[] = {
	&file_filtops,			/* EVFILT_READ */
	&file_filtops,			/* EVFILT_WRITE */
	&aio_filtops,			/* EVFILT_AIO */
	&file_filtops,			/* EVFILT_VNODE */
	&proc_filtops,			/* EVFILT_PROC */
	&sig_filtops,			/* EVFILT_SIGNAL */
	&timer_filtops,			/* EVFILT_TIMER */
	&file_filtops,			/* EVFILT_EXCEPT */
	&user_filtops,			/* EVFILT_USER */
	&fs_filtops,			/* EVFILT_FS */
};

static struct knote_cache_list	knote_cache_lists[MAXCPU];
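
/*
 * System filters are identified by negative EVFILT_* constants and map
 * to a 0-based sysfilt_ops[] index via one's complement.  For example,
 * with EVFILT_READ defined as (-1), ~EVFILT_READ == 0 selects
 * file_filtops; the bounds check lives in kqueue_register() below.
 */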

/*
 * Acquire a knote, return non-zero on success, 0 on failure.
 *
 * If we cannot acquire the knote we sleep and return 0.  The knote
 * may be stale on return in this case and the caller must restart
 * whatever loop they are in.
 *
 * Related kq token must be held.
 */
static __inline int
knote_acquire(struct knote *kn)
{
	if (kn->kn_status & KN_PROCESSING) {
		kn->kn_status |= KN_WAITING | KN_REPROCESS;
		tsleep(kn, 0, "kqepts", hz);
		/* knote may be stale now */
		return(0);
	}
	kn->kn_status |= KN_PROCESSING;
	return(1);
}

/*
 * Release an acquired knote, clearing KN_PROCESSING and handling any
 * KN_REPROCESS events.
 *
 * Caller must be holding the related kq token
 *
 * Non-zero is returned if the knote is destroyed or detached.
 */
static __inline int
knote_release(struct knote *kn)
{
	int ret;

	while (kn->kn_status & KN_REPROCESS) {
		kn->kn_status &= ~KN_REPROCESS;
		if (kn->kn_status & KN_WAITING) {
			kn->kn_status &= ~KN_WAITING;
			wakeup(kn);
		}
		if (kn->kn_status & KN_DELETING) {
			knote_detach_and_drop(kn);
			return(1);
			/* NOT REACHED */
		}
		if (filter_event(kn, 0))
			KNOTE_ACTIVATE(kn);
	}
	if (kn->kn_status & KN_DETACHED)
		ret = 1;
	else
		ret = 0;
	kn->kn_status &= ~KN_PROCESSING;
	/* kn should not be accessed anymore */
	return ret;
}
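
/*
 * Because knote_acquire() can sleep and leave the knote stale, callers
 * restart their list scan on failure.  A sketch of the pattern (it is
 * used verbatim in kqueue_register() below):
 *
 *	again:
 *		SLIST_FOREACH(kn, list, kn_link) {
 *			if (kn matches the requested kq/filter/ident) {
 *				if (knote_acquire(kn) == 0)
 *					goto again;
 *				break;
 *			}
 *		}
 */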

static int
filt_fileattach(struct knote *kn)
{
	return (fo_kqfilter(kn->kn_fp, kn));
}

/*
 * MPSAFE
 */
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EOPNOTSUPP);

	kn->kn_fop = &kqread_filtops;
	knote_insert(&kq->kq_kqinfo.ki_note, kn);
	return (0);
}

static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	knote_remove(&kq->kq_kqinfo.ki_note, kn);
}

/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

static int
filt_procattach(struct knote *kn)
{
	struct proc *p;
	int immediate;

	immediate = 0;
	p = pfind(kn->kn_id);
	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
		p = zpfind(kn->kn_id);
		immediate = 1;
	}
	if (p == NULL) {
		return (ESRCH);
	}
	if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
		if (p)
			PRELE(p);
		return (EACCES);
	}

	lwkt_gettoken(&p->p_token);
	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	knote_insert(&p->p_klist, kn);

	/*
	 * Immediately activate any exit notes if the target process is a
	 * zombie.  This is necessary to handle the case where the target
	 * process, e.g. a child, dies before the kevent is registered.
	 */
	if (immediate && filt_proc(kn, NOTE_EXIT))
		KNOTE_ACTIVATE(kn);
	lwkt_reltoken(&p->p_token);
	PRELE(p);

	return (0);
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process does not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;
	p = kn->kn_ptr.p_proc;
	knote_remove(&p->p_klist, kn);
}

static int
filt_proc(struct knote *kn, long hint)
{
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * Process is gone, so flag the event as finished.  Detach the
	 * knote from the process now because the process will soon be
	 * gone.
	 */
	if (event == NOTE_EXIT) {
		struct proc *p = kn->kn_ptr.p_proc;
		if ((kn->kn_status & KN_DETACHED) == 0) {
			PHOLD(p);
			knote_remove(&p->p_klist, kn);
			kn->kn_status |= KN_DETACHED;
			kn->kn_data = p->p_xstat;
			kn->kn_ptr.p_proc = NULL;
			PRELE(p);
		}
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		error = kqueue_register(kn->kn_kq, &kev);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}
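
/*
 * For example, a userland consumer might watch for a child's exit with
 * EVFILT_PROC/NOTE_EXIT (a minimal sketch, error handling omitted):
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, child_pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	(register)
 *	kevent(kq, NULL, 0, &kev, 1, NULL);	(wait; kev.data holds the
 *						 exit status from p_xstat)
 *
 * The zombie check in filt_procattach() above keeps this race-free even
 * if the child exits before the registration completes.
 */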

static void
filt_timerreset(struct knote *kn)
{
	struct callout *calloutp;
	struct timeval tv;
	int tticks;

	tv.tv_sec = kn->kn_sdata / 1000;
	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
	tticks = tvtohz_high(&tv);
	calloutp = (struct callout *)kn->kn_hook;
	callout_reset(calloutp, tticks, filt_timerexpire, kn);
}

/*
 * The callout interlocks with callout_terminate() but can still
 * race a deletion so if KN_DELETING is set we just don't touch
 * the knote.
 */
static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	struct kqueue *kq = kn->kn_kq;

	lwkt_getpooltoken(kq);

	/*
	 * Open-code knote_acquire(), since we can't sleep in a callout.
	 * However, we still need to record this expiration.
	 */
	kn->kn_data++;
	if (kn->kn_status & KN_PROCESSING) {
		kn->kn_status |= KN_REPROCESS;
		if ((kn->kn_status & KN_DELETING) == 0 &&
		    (kn->kn_flags & EV_ONESHOT) == 0)
			filt_timerreset(kn);
		lwkt_relpooltoken(kq);
		return;
	}
	KASSERT((kn->kn_status & KN_DELETING) == 0,
	    ("acquire a deleting knote %#x", kn->kn_status));
	kn->kn_status |= KN_PROCESSING;

	KNOTE_ACTIVATE(kn);
	if ((kn->kn_flags & EV_ONESHOT) == 0)
		filt_timerreset(kn);

	knote_release(kn);

	lwkt_relpooltoken(kq);
}

/*
 * data contains amount of time to sleep, in milliseconds
 */
static int
filt_timerattach(struct knote *kn)
{
	struct callout *calloutp;
	int prev_ncallouts;

	prev_ncallouts = atomic_fetchadd_int(&kq_ncallouts, 1);
	if (prev_ncallouts >= kq_calloutmax) {
		atomic_subtract_int(&kq_ncallouts, 1);
		kn->kn_hook = NULL;
		return (ENOMEM);
	}

	kn->kn_flags |= EV_CLEAR;	/* automatically set */
	calloutp = kmalloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
	callout_init_mp(calloutp);
	kn->kn_hook = (caddr_t)calloutp;

	filt_timerreset(kn);
	return (0);
}

/*
 * This function is called with the knote flagged locked but it is
 * still possible to race a callout event due to the callback blocking.
 * We must call callout_terminate() instead of callout_stop() to deal
 * with the race.
 */
static void
filt_timerdetach(struct knote *kn)
{
	struct callout *calloutp;

	calloutp = (struct callout *)kn->kn_hook;
	callout_terminate(calloutp);
	kfree(calloutp, M_KQUEUE);
	atomic_subtract_int(&kq_ncallouts, 1);
}

static int
filt_timer(struct knote *kn, long hint)
{
	return (kn->kn_data != 0);
}
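
/*
 * For example, a periodic 500ms timer might be armed from userland as
 * follows (a minimal sketch, error handling omitted):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * Each delivery reports the number of expirations since the last read
 * in kev.data; EV_CLEAR behavior is implied because filt_timerattach()
 * forces the flag.
 */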

/*
 * EVFILT_USER
 */
static int
filt_userattach(struct knote *kn)
{
	kn->kn_hook = NULL;
	if (kn->kn_fflags & NOTE_TRIGGER)
		kn->kn_ptr.hookid = 1;
	else
		kn->kn_ptr.hookid = 0;
	return 0;
}

static void
filt_userdetach(struct knote *kn)
{
	/* nothing to do */
}

static int
filt_user(struct knote *kn, long hint)
{
	return (kn->kn_ptr.hookid);
}

static void
filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
{
	u_int ffctrl;

	switch (type) {
	case EVENT_REGISTER:
		if (kev->fflags & NOTE_TRIGGER)
			kn->kn_ptr.hookid = 1;

		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
		kev->fflags &= NOTE_FFLAGSMASK;
		switch (ffctrl) {
		case NOTE_FFNOP:
			break;

		case NOTE_FFAND:
			kn->kn_sfflags &= kev->fflags;
			break;

		case NOTE_FFOR:
			kn->kn_sfflags |= kev->fflags;
			break;

		case NOTE_FFCOPY:
			kn->kn_sfflags = kev->fflags;
			break;

		default:
			/* XXX Return error? */
			break;
		}
		kn->kn_sdata = kev->data;

		/*
		 * This is not the correct use of EV_CLEAR in an event
		 * modification, it should have been passed as a NOTE instead.
		 * But we need to maintain compatibility with Apple & FreeBSD.
		 *
		 * Note however that EV_CLEAR can still be used when doing
		 * the initial registration of the event and works as expected
		 * (clears the event on reception).
		 */
		if (kev->flags & EV_CLEAR) {
			kn->kn_ptr.hookid = 0;
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
		break;

	case EVENT_PROCESS:
		*kev = kn->kn_kevent;
		kev->fflags = kn->kn_sfflags;
		kev->data = kn->kn_sdata;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_ptr.hookid = 0;
			/* kn_data, kn_fflags handled by parent */
		}
		break;

	default:
		panic("filt_usertouch() - invalid type (%lu)", type);
		break;
	}
}
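
/*
 * For example, one thread might wait on a user event that another
 * thread triggers (a minimal sketch, error handling omitted):
 *
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	(register)
 *
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	(trigger)
 *
 * The trigger path lands in filt_usertouch(EVENT_REGISTER) above, which
 * sets hookid and thereby makes filt_user() report the knote as active.
 */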

/*
 * EVFILT_FS
 */
struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);

static int
filt_fsattach(struct knote *kn)
{
	kn->kn_flags |= EV_CLEAR;
	knote_insert(&fs_klist, kn);

	return (0);
}

static void
filt_fsdetach(struct knote *kn)
{
	knote_remove(&fs_klist, kn);
}

static int
filt_fs(struct knote *kn, long hint)
{
	kn->kn_fflags |= hint;
	return (kn->kn_fflags != 0);
}

/*
 * Initialize a kqueue.
 *
 * NOTE: The lwp/proc code initializes a kqueue for select/poll ops.
 *
 * MPSAFE
 */
void
kqueue_init(struct kqueue *kq, struct filedesc *fdp)
{
	TAILQ_INIT(&kq->kq_knpend);
	TAILQ_INIT(&kq->kq_knlist);
	kq->kq_count = 0;
	kq->kq_fdp = fdp;
	SLIST_INIT(&kq->kq_kqinfo.ki_note);
}

/*
 * Terminate a kqueue.  Freeing the actual kq itself is left up to the
 * caller (it might be embedded in a lwp so we don't do it here).
 *
 * The kq's knlist must be completely eradicated so block on any
 * processing races.
 */
void
kqueue_terminate(struct kqueue *kq)
{
	struct knote *kn;

	lwkt_getpooltoken(kq);
	while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
		if (knote_acquire(kn))
			knote_detach_and_drop(kn);
	}
	lwkt_relpooltoken(kq);

	if (kq->kq_knhash) {
		hashdestroy(kq->kq_knhash, M_KQUEUE, kq->kq_knhashmask);
		kq->kq_knhash = NULL;
		kq->kq_knhashmask = 0;
	}
}

/*
 * MPSAFE
 */
int
sys_kqueue(struct kqueue_args *uap)
{
	struct thread *td = curthread;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	error = falloc(td->td_lwp, &fp, &fd);
	if (error)
		return (error);
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;

	kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
	kqueue_init(kq, td->td_proc->p_fd);
	fp->f_data = kq;

	fsetfd(kq->kq_fdp, fp, fd);
	uap->sysmsg_result = fd;
	fdrop(fp);
	return (error);
}

/*
 * Copy 'count' items into the destination list pointed to by uap->eventlist.
 */
static int
kevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct kevent_copyin_args *kap;
	int error;

	kap = (struct kevent_copyin_args *)arg;

	error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp));
	if (error == 0) {
		kap->ka->eventlist += count;
		*res += count;
	} else {
		*res = -1;
	}

	return (error);
}

/*
 * Copy at most 'max' items from the list pointed to by kap->changelist,
 * return number of items in 'events'.
 */
static int
kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
{
	struct kevent_copyin_args *kap;
	int error, count;

	kap = (struct kevent_copyin_args *)arg;

	count = min(kap->ka->nchanges - kap->pchanges, max);
	error = copyin(kap->ka->changelist, kevp, count * sizeof *kevp);
	if (error == 0) {
		kap->ka->changelist += count;
		kap->pchanges += count;
		*events = count;
	}

	return (error);
}

/*
 * MPSAFE
 */
int
kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
	    k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn,
	    struct timespec *tsp_in)
{
	struct kevent *kevp;
	struct timespec *tsp, ats;
	int i, n, total, error, nerrors = 0;
	int lres;
	int limit = kq_checkloop;
	struct kevent kev[KQ_NEVENTS];
	struct knote marker;
	struct lwkt_token *tok;

	if (tsp_in == NULL || tsp_in->tv_sec || tsp_in->tv_nsec)
		atomic_set_int(&curthread->td_mpflags, TDF_MP_BATCH_DEMARC);

	tsp = tsp_in;
	*res = 0;

	for (;;) {
		n = 0;
		error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
		if (error)
			return error;
		if (n == 0)
			break;
		for (i = 0; i < n; i++) {
			kevp = &kev[i];
			kevp->flags &= ~EV_SYSFLAGS;
			error = kqueue_register(kq, kevp);

			/*
			 * If a registration returns an error we
			 * immediately post the error.  The kevent()
			 * call itself will fail with the error if
			 * no space is available for posting.
			 *
			 * Such errors normally bypass the timeout/blocking
			 * code.  However, if the copyoutfn function refuses
			 * to post the error (see sys_poll()), then we
			 * ignore it too.
			 */
			if (error || (kevp->flags & EV_RECEIPT)) {
				kevp->flags = EV_ERROR;
				kevp->data = error;
				lres = *res;
				kevent_copyoutfn(uap, kevp, 1, res);
				if (*res < 0) {
					return error;
				} else if (lres != *res) {
					nevents--;
					nerrors++;
				}
			}
		}
	}
	if (nerrors)
		return 0;

	/*
	 * Acquire/wait for events - setup timeout
	 */
	if (tsp != NULL) {
		if (tsp->tv_sec || tsp->tv_nsec) {
			getnanouptime(&ats);
			timespecadd(tsp, &ats);		/* tsp = target time */
		}
	}

	/*
	 * Loop as required.
	 *
	 * Collect as many events as we can.  Sleeping on successive
	 * loops is disabled if copyoutfn has incremented (*res).
	 *
	 * The loop stops if an error occurs, all events have been
	 * scanned (the marker has been reached), or fewer than the
	 * maximum number of events is found.
	 *
	 * The copyoutfn function does not have to increment (*res) in
	 * order for the loop to continue.
	 *
	 * NOTE: doselect() usually passes 0x7FFFFFFF for nevents.
	 */
	total = 0;
	error = 0;
	marker.kn_filter = EVFILT_MARKER;
	marker.kn_status = KN_PROCESSING;
	tok = lwkt_token_pool_lookup(kq);
	lwkt_gettoken(tok);
	TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
	lwkt_reltoken(tok);
	while ((n = nevents - total) > 0) {
		if (n > KQ_NEVENTS)
			n = KQ_NEVENTS;

		/*
		 * If no events are pending sleep until timeout (if any)
		 * or an event occurs.
		 *
		 * After the sleep completes the marker is moved to the
		 * end of the list, making any received events available
		 * to our scan.
		 */
		if (kq->kq_count == 0 && *res == 0) {
			int timeout;

			if (tsp == NULL) {
				timeout = 0;
			} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				struct timespec atx = *tsp;

				getnanouptime(&ats);
				timespecsub(&atx, &ats);
				if (atx.tv_sec < 0) {
					error = EWOULDBLOCK;
					break;
				} else {
					timeout = atx.tv_sec > 24 * 60 * 60 ?
					    24 * 60 * 60 * hz :
					    tstohz_high(&atx);
				}
			}

			lwkt_gettoken(tok);
			if (kq->kq_count == 0) {
				kq->kq_sleep_cnt++;
				if (__predict_false(kq->kq_sleep_cnt == 0)) {
					/*
					 * Guard against possible wrapping.  And
					 * set it to 2, so that kqueue_wakeup()
					 * can wake everyone up.
					 */
					kq->kq_sleep_cnt = 2;
				}
				error = tsleep(kq, PCATCH, "kqread", timeout);

				/* don't restart after signals... */
				if (error == ERESTART)
					error = EINTR;
				if (error) {
					lwkt_reltoken(tok);
					break;
				}

				TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
				TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker,
				    kn_tqe);
			}
			lwkt_reltoken(tok);
		}

		/*
		 * Process all received events
		 * Account for all non-spurious events in our total
		 */
		i = kqueue_scan(kq, kev, n, &marker);
		if (i) {
			lres = *res;
			error = kevent_copyoutfn(uap, kev, i, res);
			total += *res - lres;
			if (error)
				break;
		}
		if (limit && --limit == 0)
			panic("kqueue: checkloop failed i=%d", i);

		/*
		 * Normally when fewer events are returned than requested
		 * we can stop.  However, if only spurious events were
		 * collected the copyout will not bump (*res) and we have
		 * to continue.
		 */
		if (i < n && *res)
			break;

		/*
		 * Deal with an edge case where spurious events can cause
		 * a loop to occur without moving the marker.  This can
		 * prevent kqueue_scan() from picking up new events which
		 * race us.  We must be sure to move the marker for this
		 * case.
		 *
		 * NOTE: We do not want to move the marker if events
		 *	 were scanned because normal kqueue operations
		 *	 may reactivate events.  Moving the marker in
		 *	 that case could result in duplicates for the
		 *	 same event.
		 */
		if (i == 0) {
			lwkt_gettoken(tok);
			TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
			TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
			lwkt_reltoken(tok);
		}
	}
	lwkt_gettoken(tok);
	TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
	lwkt_reltoken(tok);

	/* Timeouts do not return EWOULDBLOCK. */
	if (error == EWOULDBLOCK)
		error = 0;
	return error;
}
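
/*
 * The EV_RECEIPT handling above supports callers that want per-change
 * status without draining pending events, e.g. (sketch):
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);
 *	n = kevent(kq, &kev, 1, &kev, 1, NULL);
 *
 * The change slot is echoed back with EV_ERROR set and kev.data holding
 * either 0 or the errno from registration.
 */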

/*
 * MPALMOSTSAFE
 */
int
sys_kevent(struct kevent_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct timespec ts, *tsp;
	struct kqueue *kq;
	struct file *fp = NULL;
	struct kevent_copyin_args *kap, ka;
	int error;

	if (uap->timeout) {
		error = copyin(uap->timeout, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else {
		tsp = NULL;
	}
	fp = holdfp(p->p_fd, uap->fd, -1);
	if (fp == NULL)
		return (EBADF);
	if (fp->f_type != DTYPE_KQUEUE) {
		fdrop(fp);
		return (EBADF);
	}

	kq = (struct kqueue *)fp->f_data;

	kap = &ka;
	kap->ka = uap;
	kap->pchanges = 0;

	error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap,
	    kevent_copyin, kevent_copyout, tsp);

	fdrop(fp);

	return (error);
}
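
/*
 * Putting the two syscalls together, the canonical userland pattern is
 * (a minimal sketch, error handling omitted):
 *
 *	#include <sys/event.h>
 *
 *	struct kevent chg, ev;
 *	int kq, n;
 *
 *	kq = kqueue();
 *	EV_SET(&chg, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	n = kevent(kq, &chg, 1, NULL, 0, NULL);	(register only)
 *	n = kevent(kq, NULL, 0, &ev, 1, NULL);	(block for one event)
 *
 * Both steps can also be combined into a single kevent() call, since
 * kern_kevent() processes the changelist before scanning for events.
 */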

int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
	struct filedesc *fdp = kq->kq_fdp;
	struct klist *list = NULL;
	struct filterops *fops;
	struct file *fp = NULL;
	struct knote *kn = NULL;
	struct thread *td;
	int error = 0;
	struct knote_cache_list *cache_list;

	if (kev->filter < 0) {
		if (kev->filter + EVFILT_SYSCOUNT < 0)
			return (EINVAL);
		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
	} else {
		/*
		 * XXX
		 * filter attach routine is responsible for ensuring that
		 * the identifier can be attached to it.
		 */
		return (EINVAL);
	}

	if (fops->f_flags & FILTEROP_ISFD) {
		/* validate descriptor */
		fp = holdfp(fdp, kev->ident, -1);
		if (fp == NULL)
			return (EBADF);
	}

	cache_list = &knote_cache_lists[mycpuid];
	if (SLIST_EMPTY(&cache_list->knote_cache)) {
		struct knote *new_kn;

		new_kn = knote_alloc();
		crit_enter();
		SLIST_INSERT_HEAD(&cache_list->knote_cache, new_kn, kn_link);
		cache_list->knote_cache_cnt++;
		crit_exit();
	}

	td = curthread;
	lwkt_getpooltoken(kq);

	/*
	 * Make sure that only one thread can register event on this kqueue,
	 * so that we would not suffer any race, even if the registration
	 * blocked, i.e. kq token was released, and the kqueue was shared
	 * between threads (this should be rare though).
	 */
	while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) {
		kq->kq_state |= KQ_REGWAIT;
		tsleep(&kq->kq_regtd, 0, "kqreg", 0);
	}
	if (__predict_false(kq->kq_regtd != NULL)) {
		/* Recursive calling of kqueue_register() */
		td = NULL;
	} else {
		/* Owner of the kq_regtd, i.e. td != NULL */
		kq->kq_regtd = td;
	}

	if (fp != NULL) {
		list = &fp->f_klist;
	} else if (kq->kq_knhashmask) {
		list = &kq->kq_knhash[
		    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
	}
	if (list != NULL) {
		lwkt_getpooltoken(list);
again:
		SLIST_FOREACH(kn, list, kn_link) {
			if (kn->kn_kq == kq &&
			    kn->kn_filter == kev->filter &&
			    kn->kn_id == kev->ident) {
				if (knote_acquire(kn) == 0)
					goto again;
				break;
			}
		}
		lwkt_relpooltoken(list);
	}

	/*
	 * NOTE: At this point if kn is non-NULL we will have acquired
	 *	 it and set KN_PROCESSING.
	 */
	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		error = ENOENT;
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			crit_enter();
			kn = SLIST_FIRST(&cache_list->knote_cache);
			if (kn == NULL) {
				crit_exit();
				kn = knote_alloc();
			} else {
				SLIST_REMOVE_HEAD(&cache_list->knote_cache,
				    kn_link);
				cache_list->knote_cache_cnt--;
				crit_exit();
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			/*
			 * KN_PROCESSING prevents the knote from getting
			 * ripped out from under us while we are trying
			 * to attach it, in case the attach blocks.
			 */
			kn->kn_status = KN_PROCESSING;
			knote_attach(kn);
			if ((error = filter_attach(kn)) != 0) {
				kn->kn_status |= KN_DELETING | KN_REPROCESS;
				knote_drop(kn);
				goto done;
			}

			/*
			 * Interlock against close races which either tried
			 * to remove our knote while we were blocked or missed
			 * it entirely prior to our attachment.  We do not
			 * want to end up with a knote on a closed descriptor.
			 */
			if ((fops->f_flags & FILTEROP_ISFD) &&
			    checkfdclosed(fdp, kev->ident, kn->kn_fp)) {
				kn->kn_status |= KN_DELETING | KN_REPROCESS;
			}
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			KKASSERT(kn->kn_status & KN_PROCESSING);
			if (fops == &user_filtops) {
				filt_usertouch(kn, kev, EVENT_REGISTER);
			} else {
				kn->kn_sfflags = kev->fflags;
				kn->kn_sdata = kev->data;
				kn->kn_kevent.udata = kev->udata;
			}
		}

		/*
		 * Execute the filter event to immediately activate the
		 * knote if necessary.  If reprocessing events are pending
		 * due to blocking above we do not run the filter here
		 * but instead let knote_release() do it.  Otherwise we
		 * might run the filter on a deleted event.
		 */
		if ((kn->kn_status & KN_REPROCESS) == 0) {
			if (filter_event(kn, 0))
				KNOTE_ACTIVATE(kn);
		}
	} else if (kev->flags & EV_DELETE) {
		/*
		 * Delete the existing knote
		 */
		knote_detach_and_drop(kn);
		goto done;
	} else {
		/*
		 * Modify an existing event.
		 *
		 * The user may change some filter values after the
		 * initial EV_ADD, but doing so will not reset any
		 * filters which have already been triggered.
		 */
		KKASSERT(kn->kn_status & KN_PROCESSING);
		if (fops == &user_filtops) {
			filt_usertouch(kn, kev, EVENT_REGISTER);
		} else {
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}

		/*
		 * Execute the filter event to immediately activate the
		 * knote if necessary.  If reprocessing events are pending
		 * due to blocking above we do not run the filter here
		 * but instead let knote_release() do it.  Otherwise we
		 * might run the filter on a deleted event.
		 */
		if ((kn->kn_status & KN_REPROCESS) == 0) {
			if (filter_event(kn, 0))
				KNOTE_ACTIVATE(kn);
		}
	}

	/*
	 * Disablement does not deactivate a knote here.
	 */
	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		kn->kn_status |= KN_DISABLED;
	}

	/*
	 * Re-enablement may have to immediately enqueue an active knote.
	 */
	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0)) {
			knote_enqueue(kn);
		}
	}

	/*
	 * Handle any required reprocessing
	 */
	knote_release(kn);
	/* kn may be invalid now */

done:
	if (td != NULL) {	/* Owner of the kq_regtd */
		kq->kq_regtd = NULL;
		if (__predict_false(kq->kq_state & KQ_REGWAIT)) {
			kq->kq_state &= ~KQ_REGWAIT;
			wakeup(&kq->kq_regtd);
		}
	}
	lwkt_relpooltoken(kq);
	if (fp != NULL)
		fdrop(fp);
	return (error);
}

/*
 * Scan the kqueue, return the number of active events placed in kevp up
 * to count.
 *
 * Continuous mode events may get recycled, do not continue scanning past
 * marker unless no events have been collected.
 */
static int
kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
	    struct knote *marker)
{
	struct knote *kn, local_marker;
	int total;

	total = 0;
	local_marker.kn_filter = EVFILT_MARKER;
	local_marker.kn_status = KN_PROCESSING;

	lwkt_getpooltoken(kq);

	/*
	 * Collect events.
	 */
	TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe);
	while (count) {
		kn = TAILQ_NEXT(&local_marker, kn_tqe);
		if (kn->kn_filter == EVFILT_MARKER) {
			/* Marker reached, we are done */
			if (kn == marker)
				break;

			/* Move local marker past some other thread's marker */
			kn = TAILQ_NEXT(kn, kn_tqe);
			TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
			TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe);
			continue;
		}

		/*
		 * We can't skip a knote undergoing processing, otherwise
		 * we risk not returning it when the user process expects
		 * it should be returned.  Sleep and retry.
		 */
		if (knote_acquire(kn) == 0)
			continue;

		/*
		 * Remove the event for processing.
		 *
		 * WARNING!  We must leave KN_QUEUED set to prevent the
		 *	     event from being KNOTE_ACTIVATE()d while
		 *	     the queue state is in limbo, in case we
		 *	     block.
		 */
		TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
		kq->kq_count--;

		/*
		 * We have to deal with an extremely important race against
		 * file descriptor close()s here.  The file descriptor can
		 * disappear MPSAFE, and there is a small window of
		 * opportunity between that and the call to knote_fdclose().
		 *
		 * If we hit that window here while doselect or dopoll is
		 * trying to delete a spurious event they will not be able
		 * to match up the event against a knote and will go haywire.
		 */
		if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
		    checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
			kn->kn_status |= KN_DELETING | KN_REPROCESS;
		}

		if (kn->kn_status & KN_DISABLED) {
			/*
			 * If disabled we ensure the event is not queued
			 * but leave its active bit set.  On re-enablement
			 * the event may be immediately triggered.
			 */
			kn->kn_status &= ~KN_QUEUED;
		} else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
			   (kn->kn_status & KN_DELETING) == 0 &&
			   filter_event(kn, 0) == 0) {
			/*
			 * If not running in one-shot mode and the event
			 * is no longer present we ensure it is removed
			 * from the queue and ignore it.
			 */
			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
		} else {
			/*
			 * Post the event
			 */
			if (kn->kn_fop == &user_filtops)
				filt_usertouch(kn, kevp, EVENT_PROCESS);
			else
				*kevp = kn->kn_kevent;
			++kevp;
			++total;
			--count;

			if (kn->kn_flags & EV_ONESHOT) {
				kn->kn_status &= ~KN_QUEUED;
				kn->kn_status |= KN_DELETING | KN_REPROCESS;
			} else {
				if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
					if (kn->kn_flags & EV_CLEAR) {
						kn->kn_data = 0;
						kn->kn_fflags = 0;
					}
					if (kn->kn_flags & EV_DISPATCH) {
						kn->kn_status |= KN_DISABLED;
					}
					kn->kn_status &= ~(KN_QUEUED |
							   KN_ACTIVE);
				} else {
					TAILQ_INSERT_TAIL(&kq->kq_knpend, kn,
					    kn_tqe);
					kq->kq_count++;
				}
			}
		}

		/*
		 * Handle any post-processing states
		 */
		knote_release(kn);
	}
	TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);

	lwkt_relpooltoken(kq);
	return (total);
}
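
/*
 * The delivery-mode logic above yields the usual kqueue semantics, e.g.:
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *		level-triggered: the knote is requeued and reported on
 *		every scan while the condition persists.
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *		edge-triggered: kn_data/kn_fflags are zeroed on delivery.
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, NULL);
 *		delivered once, then disabled until re-armed with EV_ENABLE.
 */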

/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 *
 * MPSAFE
 */
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	return (ENXIO);
}

/*
 * MPSAFE
 */
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	return (ENXIO);
}

/*
 * MPALMOSTSAFE
 */
static int
kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
	     struct ucred *cred, struct sysmsg *msg)
{
	struct kqueue *kq;
	int error;

	kq = (struct kqueue *)fp->f_data;
	lwkt_getpooltoken(kq);
	switch(com) {
	case FIOASYNC:
		if (*(int *)data)
			kq->kq_state |= KQ_ASYNC;
		else
			kq->kq_state &= ~KQ_ASYNC;
		error = 0;
		break;
	case FIOSETOWN:
		error = fsetown(*(int *)data, &kq->kq_sigio);
		break;
	default:
		error = ENOTTY;
		break;
	}
	lwkt_relpooltoken(kq);
	return (error);
}

/*
 * MPSAFE
 */
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;

	bzero((void *)st, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;
	return (0);
}

/*
 * MPSAFE
 */
static int
kqueue_close(struct file *fp)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;

	kqueue_terminate(kq);

	fp->f_data = NULL;
	funsetown(&kq->kq_sigio);

	kfree(kq, M_KQUEUE);
	return (0);
}

static void
kqueue_wakeup(struct kqueue *kq)
{
	if (kq->kq_sleep_cnt) {
		u_int sleep_cnt = kq->kq_sleep_cnt;

		kq->kq_sleep_cnt = 0;
		if (sleep_cnt == 1)
			wakeup_one(kq);
		else
			wakeup(kq);
	}
	KNOTE(&kq->kq_kqinfo.ki_note, 0);
}

/*
 * Calls filterops f_attach function, acquiring mplock if filter is not
 * marked as FILTEROP_MPSAFE.
 *
 * Caller must be holding the related kq token
 */
static int
filter_attach(struct knote *kn)
{
	int ret;

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		ret = kn->kn_fop->f_attach(kn);
	} else {
		get_mplock();
		ret = kn->kn_fop->f_attach(kn);
		rel_mplock();
	}
	return (ret);
}

/*
 * Detach the knote and drop it, destroying the knote.
 *
 * Calls filterops f_detach function, acquiring mplock if filter is not
 * marked as FILTEROP_MPSAFE.
 *
 * Caller must be holding the related kq token
 */
static void
knote_detach_and_drop(struct knote *kn)
{
	kn->kn_status |= KN_DELETING | KN_REPROCESS;
	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		kn->kn_fop->f_detach(kn);
	} else {
		get_mplock();
		kn->kn_fop->f_detach(kn);
		rel_mplock();
	}
	knote_drop(kn);
}

/*
 * Calls filterops f_event function, acquiring mplock if filter is not
 * marked as FILTEROP_MPSAFE.
 *
 * If the knote is in the middle of being created or deleted we cannot
 * safely call the filter op.
 *
 * Caller must be holding the related kq token
 */
static int
filter_event(struct knote *kn, long hint)
{
	int ret;

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		ret = kn->kn_fop->f_event(kn, hint);
	} else {
		get_mplock();
		ret = kn->kn_fop->f_event(kn, hint);
		rel_mplock();
	}
	return (ret);
}
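
/*
 * An in-kernel event source follows the pattern visible in this file:
 * declare a struct filterops (flagged FILTEROP_MPSAFE if its callbacks
 * need no mplock), knote_insert() the knote from f_attach, and call
 * KNOTE() on the klist whenever state changes so that knote() runs each
 * registered f_event.  A sketch, with illustrative names only:
 *
 *	static struct filterops foo_filtops =
 *	    { FILTEROP_MPSAFE, filt_fooattach, filt_foodetach, filt_foo };
 *
 *	... and on each state change:
 *
 *	KNOTE(&foo_kqinfo.ki_note, 0);
 */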

/*
 * Walk down a list of knotes, activating them if their event has triggered.
 *
 * If we encounter any knotes which are undergoing processing we just mark
 * them for reprocessing and do not try to [re]activate the knote.  However,
 * if a hint is being passed we have to wait and that makes things a bit
 * sticky.
 */
void
knote(struct klist *list, long hint)
{
	struct kqueue *kq;
	struct knote *kn;
	struct knote *kntmp;

	lwkt_getpooltoken(list);
restart:
	SLIST_FOREACH(kn, list, kn_next) {
		kq = kn->kn_kq;
		lwkt_getpooltoken(kq);

		/* temporary verification hack */
		SLIST_FOREACH(kntmp, list, kn_next) {
			if (kn == kntmp)
				break;
		}
		if (kn != kntmp || kn->kn_kq != kq) {
			lwkt_relpooltoken(kq);
			goto restart;
		}

		if (kn->kn_status & KN_PROCESSING) {
			/*
			 * Someone else is processing the knote, ask the
			 * other thread to reprocess it and don't mess
			 * with it otherwise.
			 */
			if (hint == 0) {
				kn->kn_status |= KN_REPROCESS;
				lwkt_relpooltoken(kq);
				continue;
			}

			/*
			 * If the hint is non-zero we have to wait or risk
			 * losing the state the caller is trying to update.
			 *
			 * XXX This is a real problem, certain process
			 *     and signal filters will bump kn_data for
			 *     already-processed notes more than once if
			 *     we restart the list scan.  FIXME.
			 */
			kn->kn_status |= KN_WAITING | KN_REPROCESS;
			tsleep(kn, 0, "knotec", hz);
			lwkt_relpooltoken(kq);
			goto restart;
		}

		/*
		 * Become the reprocessing master ourselves.
		 *
		 * If hint is non-zero running the event is mandatory
		 * when not deleting so do it whether reprocessing is
		 * set or not.
		 */
		kn->kn_status |= KN_PROCESSING;
		if ((kn->kn_status & KN_DELETING) == 0) {
			if (filter_event(kn, hint))
				KNOTE_ACTIVATE(kn);
		}
		if (knote_release(kn)) {
			lwkt_relpooltoken(kq);
			goto restart;
		}
		lwkt_relpooltoken(kq);
	}
	lwkt_relpooltoken(list);
}

/*
 * Insert knote at head of klist.
 *
 * This function may only be called via a filter function and thus
 * kq_token should already be held and marked for processing.
 */
void
knote_insert(struct klist *klist, struct knote *kn)
{
	lwkt_getpooltoken(klist);
	KKASSERT(kn->kn_status & KN_PROCESSING);
	SLIST_INSERT_HEAD(klist, kn, kn_next);
	lwkt_relpooltoken(klist);
}

/*
 * Remove knote from a klist
 *
 * This function may only be called via a filter function and thus
 * kq_token should already be held and marked for processing.
 */
void
knote_remove(struct klist *klist, struct knote *kn)
{
	lwkt_getpooltoken(klist);
	KKASSERT(kn->kn_status & KN_PROCESSING);
	SLIST_REMOVE(klist, kn, knote, kn_next);
	lwkt_relpooltoken(klist);
}

void
knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
		    struct filterops *ops, void *hook)
{
	struct kqueue *kq;
	struct knote *kn;

	lwkt_getpooltoken(&src->ki_note);
	lwkt_getpooltoken(&dst->ki_note);
	while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
		kq = kn->kn_kq;
		lwkt_getpooltoken(kq);
		if (SLIST_FIRST(&src->ki_note) != kn || kn->kn_kq != kq) {
			lwkt_relpooltoken(kq);
			continue;
		}
		if (knote_acquire(kn)) {
			knote_remove(&src->ki_note, kn);
			kn->kn_fop = ops;
			kn->kn_hook = hook;
			knote_insert(&dst->ki_note, kn);
			knote_release(kn);
			/* kn may be invalid now */
		}
		lwkt_relpooltoken(kq);
	}
	lwkt_relpooltoken(&dst->ki_note);
	lwkt_relpooltoken(&src->ki_note);
}

/*
 * Remove all knotes referencing a specified fd
 */
void
knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
{
	struct kqueue *kq;
	struct knote *kn;
	struct knote *kntmp;

	lwkt_getpooltoken(&fp->f_klist);
restart:
	SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
		if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
			kq = kn->kn_kq;
			lwkt_getpooltoken(kq);

			/* temporary verification hack */
			SLIST_FOREACH(kntmp, &fp->f_klist, kn_link) {
				if (kn == kntmp)
					break;
			}
			if (kn != kntmp || kn->kn_kq->kq_fdp != fdp ||
			    kn->kn_id != fd || kn->kn_kq != kq) {
				lwkt_relpooltoken(kq);
				goto restart;
			}
			if (knote_acquire(kn))
				knote_detach_and_drop(kn);
			lwkt_relpooltoken(kq);
			goto restart;
		}
	}
	lwkt_relpooltoken(&fp->f_klist);
}

/*
 * Low level attach function.
 *
 * The knote should already be marked for processing.
 * Caller must hold the related kq token.
 */
static void
knote_attach(struct knote *kn)
{
	struct klist *list;
	struct kqueue *kq = kn->kn_kq;

	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
		KKASSERT(kn->kn_fp);
		list = &kn->kn_fp->f_klist;
	} else {
		if (kq->kq_knhashmask == 0)
			kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
						 &kq->kq_knhashmask);
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}
	lwkt_getpooltoken(list);
	SLIST_INSERT_HEAD(list, kn, kn_link);
	lwkt_relpooltoken(list);
	TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
}

/*
 * Low level drop function.
 *
 * The knote should already be marked for processing.
 * Caller must hold the related kq token.
 */
static void
knote_drop(struct knote *kn)
{
	struct kqueue *kq;
	struct klist *list;

	kq = kn->kn_kq;

	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
		list = &kn->kn_fp->f_klist;
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];

	lwkt_getpooltoken(list);
	SLIST_REMOVE(list, kn, knote, kn_link);
	lwkt_relpooltoken(list);
	TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
		fdrop(kn->kn_fp);
		kn->kn_fp = NULL;
	}
	knote_free(kn);
}

/*
 * Low level enqueue function.
 *
 * The knote should already be marked for processing.
 * Caller must be holding the kq token
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
	TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	++kq->kq_count;

	/*
	 * Send SIGIO on request (typically set up as a mailbox signal)
	 */
	if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
		pgsigio(kq->kq_sigio, SIGIO, 0);

	kqueue_wakeup(kq);
}

/*
 * Low level dequeue function.
 *
 * The knote should already be marked for processing.
 * Caller must be holding the kq token
 */
static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
	TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
}

/*
 * Raw knote allocation; the per-cpu cache in front of this is managed
 * by kqueue_register() and knote_free().
 */
static struct knote *
knote_alloc(void)
{
	return kmalloc(sizeof(struct knote), M_KQUEUE, M_WAITOK);
}

/*
 * Release a knote, returning it to this cpu's cache if the cache is
 * not already full, otherwise freeing the memory.
 */
static void
knote_free(struct knote *kn)
{
	struct knote_cache_list *cache_list;

	cache_list = &knote_cache_lists[mycpuid];
	if (cache_list->knote_cache_cnt < KNOTE_CACHE_MAX) {
		crit_enter();
		SLIST_INSERT_HEAD(&cache_list->knote_cache, kn, kn_link);
		cache_list->knote_cache_cnt++;
		crit_exit();
		return;
	}
	kfree(kn, M_KQUEUE);
}