xref: /dragonfly/sys/kern/kern_event.c (revision a4fe36f1)
1 /*-
2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD: src/sys/kern/kern_event.c,v 1.2.2.10 2004/04/04 07:03:14 cperciva Exp $
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/proc.h>
33 #include <sys/malloc.h>
34 #include <sys/unistd.h>
35 #include <sys/file.h>
36 #include <sys/lock.h>
37 #include <sys/fcntl.h>
38 #include <sys/queue.h>
39 #include <sys/event.h>
40 #include <sys/eventvar.h>
41 #include <sys/protosw.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/stat.h>
45 #include <sys/sysctl.h>
46 #include <sys/sysproto.h>
47 #include <sys/thread.h>
48 #include <sys/uio.h>
49 #include <sys/signalvar.h>
50 #include <sys/filio.h>
51 #include <sys/ktr.h>
52 
53 #include <sys/thread2.h>
54 #include <sys/file2.h>
55 #include <sys/mplock2.h>
56 
57 #define EVENT_REGISTER	1
58 #define EVENT_PROCESS	2
59 
60 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
61 
62 struct kevent_copyin_args {
63 	struct kevent_args	*ka;
64 	int			pchanges;
65 };
66 
67 #define KNOTE_CACHE_MAX		8
68 
69 struct knote_cache_list {
70 	struct klist		knote_cache;
71 	int			knote_cache_cnt;
72 } __cachealign;
73 
74 static int	kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
75 		    struct knote *marker);
76 static int 	kqueue_read(struct file *fp, struct uio *uio,
77 		    struct ucred *cred, int flags);
78 static int	kqueue_write(struct file *fp, struct uio *uio,
79 		    struct ucred *cred, int flags);
80 static int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
81 		    struct ucred *cred, struct sysmsg *msg);
82 static int 	kqueue_kqfilter(struct file *fp, struct knote *kn);
83 static int 	kqueue_stat(struct file *fp, struct stat *st,
84 		    struct ucred *cred);
85 static int 	kqueue_close(struct file *fp);
86 static void	kqueue_wakeup(struct kqueue *kq);
87 static int	filter_attach(struct knote *kn);
88 static int	filter_event(struct knote *kn, long hint);
89 
90 /*
91  * MPSAFE
92  */
93 static struct fileops kqueueops = {
94 	.fo_read = kqueue_read,
95 	.fo_write = kqueue_write,
96 	.fo_ioctl = kqueue_ioctl,
97 	.fo_kqfilter = kqueue_kqfilter,
98 	.fo_stat = kqueue_stat,
99 	.fo_close = kqueue_close,
100 	.fo_shutdown = nofo_shutdown
101 };
102 
103 static void 	knote_attach(struct knote *kn);
104 static void 	knote_drop(struct knote *kn);
105 static void	knote_detach_and_drop(struct knote *kn);
106 static void 	knote_enqueue(struct knote *kn);
107 static void 	knote_dequeue(struct knote *kn);
108 static struct 	knote *knote_alloc(void);
109 static void 	knote_free(struct knote *kn);
110 
111 static void	filt_kqdetach(struct knote *kn);
112 static int	filt_kqueue(struct knote *kn, long hint);
113 static int	filt_procattach(struct knote *kn);
114 static void	filt_procdetach(struct knote *kn);
115 static int	filt_proc(struct knote *kn, long hint);
116 static int	filt_fileattach(struct knote *kn);
117 static void	filt_timerexpire(void *knx);
118 static int	filt_timerattach(struct knote *kn);
119 static void	filt_timerdetach(struct knote *kn);
120 static int	filt_timer(struct knote *kn, long hint);
121 static int	filt_userattach(struct knote *kn);
122 static void	filt_userdetach(struct knote *kn);
123 static int	filt_user(struct knote *kn, long hint);
124 static void	filt_usertouch(struct knote *kn, struct kevent *kev,
125 				u_long type);
126 static int	filt_fsattach(struct knote *kn);
127 static void	filt_fsdetach(struct knote *kn);
128 static int	filt_fs(struct knote *kn, long hint);
129 
130 static struct filterops file_filtops =
131 	{ FILTEROP_ISFD | FILTEROP_MPSAFE, filt_fileattach, NULL, NULL };
132 static struct filterops kqread_filtops =
133 	{ FILTEROP_ISFD | FILTEROP_MPSAFE, NULL, filt_kqdetach, filt_kqueue };
134 static struct filterops proc_filtops =
135 	{ 0, filt_procattach, filt_procdetach, filt_proc };
136 static struct filterops timer_filtops =
137 	{ FILTEROP_MPSAFE, filt_timerattach, filt_timerdetach, filt_timer };
138 static struct filterops user_filtops =
139 	{ FILTEROP_MPSAFE, filt_userattach, filt_userdetach, filt_user };
140 static struct filterops fs_filtops =
141 	{ FILTEROP_MPSAFE, filt_fsattach, filt_fsdetach, filt_fs };
142 
143 static int 		kq_ncallouts = 0;
144 static int 		kq_calloutmax = (4 * 1024);
145 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
146     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
147 static int		kq_checkloop = 1000000;
148 SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
149     &kq_checkloop, 0, "Maximum number of loops for kqueue scan");
150 
151 #define KNOTE_ACTIVATE(kn) do { 					\
152 	kn->kn_status |= KN_ACTIVE;					\
153 	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
154 		knote_enqueue(kn);					\
155 } while(0)
156 
157 #define	KN_HASHSIZE		64		/* XXX should be tunable */
158 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
159 
160 extern struct filterops aio_filtops;
161 extern struct filterops sig_filtops;
162 
163 /*
164  * Table for all system-defined filters.
165  */
166 static struct filterops *sysfilt_ops[] = {
167 	&file_filtops,			/* EVFILT_READ */
168 	&file_filtops,			/* EVFILT_WRITE */
169 	&aio_filtops,			/* EVFILT_AIO */
170 	&file_filtops,			/* EVFILT_VNODE */
171 	&proc_filtops,			/* EVFILT_PROC */
172 	&sig_filtops,			/* EVFILT_SIGNAL */
173 	&timer_filtops,			/* EVFILT_TIMER */
174 	&file_filtops,			/* EVFILT_EXCEPT */
175 	&user_filtops,			/* EVFILT_USER */
176 	&fs_filtops,			/* EVFILT_FS */
177 };
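/*
 * Note (illustrative, not part of the original source): user-visible filters
 * are negative constants (EVFILT_READ == -1, EVFILT_WRITE == -2, ...).
 * kqueue_register() converts them to 0-based indices into the table above
 * with sysfilt_ops[~kev->filter], e.g.:
 *
 *	~(-1) == 0  ->  file_filtops	(EVFILT_READ)
 *	~(-7) == 6  ->  timer_filtops	(EVFILT_TIMER)
 */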
178 
179 static struct knote_cache_list	knote_cache_lists[MAXCPU];
180 
181 /*
182  * Acquire a knote, return non-zero on success, 0 on failure.
183  *
184  * If we cannot acquire the knote we sleep and return 0.  The knote
185  * may be stale on return in this case and the caller must restart
186  * whatever loop they are in.
187  *
188  * Related kq token must be held.
189  */
190 static __inline int
191 knote_acquire(struct knote *kn)
192 {
193 	if (kn->kn_status & KN_PROCESSING) {
194 		kn->kn_status |= KN_WAITING | KN_REPROCESS;
195 		tsleep(kn, 0, "kqepts", hz);
196 		/* knote may be stale now */
197 		return(0);
198 	}
199 	kn->kn_status |= KN_PROCESSING;
200 	return(1);
201 }
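/*
 * Illustrative sketch (not part of the original code): a typical caller
 * loops over a klist and restarts the whole scan whenever knote_acquire()
 * fails, because the knote may have been freed or moved while we slept.
 * This is the same idiom as the "again:" loop in kqueue_register() below;
 * kn_matches() is a hypothetical predicate standing in for the caller's
 * match test.
 *
 *	again:
 *		SLIST_FOREACH(kn, list, kn_link) {
 *			if (kn_matches(kn)) {
 *				if (knote_acquire(kn) == 0)
 *					goto again;	// kn may be stale
 *				break;
 *			}
 *		}
 */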
202 
203 /*
204  * Release an acquired knote, clearing KN_PROCESSING and handling any
205  * KN_REPROCESS events.
206  *
207  * Caller must be holding the related kq token
208  *
209  * Non-zero is returned if the knote is destroyed or detached.
210  */
211 static __inline int
212 knote_release(struct knote *kn)
213 {
214 	int ret;
215 
216 	while (kn->kn_status & KN_REPROCESS) {
217 		kn->kn_status &= ~KN_REPROCESS;
218 		if (kn->kn_status & KN_WAITING) {
219 			kn->kn_status &= ~KN_WAITING;
220 			wakeup(kn);
221 		}
222 		if (kn->kn_status & KN_DELETING) {
223 			knote_detach_and_drop(kn);
224 			return(1);
225 			/* NOT REACHED */
226 		}
227 		if (filter_event(kn, 0))
228 			KNOTE_ACTIVATE(kn);
229 	}
230 	if (kn->kn_status & KN_DETACHED)
231 		ret = 1;
232 	else
233 		ret = 0;
234 	kn->kn_status &= ~KN_PROCESSING;
235 	/* kn should not be accessed anymore */
236 	return ret;
237 }
238 
239 static int
240 filt_fileattach(struct knote *kn)
241 {
242 	return (fo_kqfilter(kn->kn_fp, kn));
243 }
244 
245 /*
246  * MPSAFE
247  */
248 static int
249 kqueue_kqfilter(struct file *fp, struct knote *kn)
250 {
251 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
252 
253 	if (kn->kn_filter != EVFILT_READ)
254 		return (EOPNOTSUPP);
255 
256 	kn->kn_fop = &kqread_filtops;
257 	knote_insert(&kq->kq_kqinfo.ki_note, kn);
258 	return (0);
259 }
260 
261 static void
262 filt_kqdetach(struct knote *kn)
263 {
264 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
265 
266 	knote_remove(&kq->kq_kqinfo.ki_note, kn);
267 }
268 
269 /*ARGSUSED*/
270 static int
271 filt_kqueue(struct knote *kn, long hint)
272 {
273 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
274 
275 	kn->kn_data = kq->kq_count;
276 	return (kn->kn_data > 0);
277 }
278 
279 static int
280 filt_procattach(struct knote *kn)
281 {
282 	struct proc *p;
283 	int immediate;
284 
285 	immediate = 0;
286 	p = pfind(kn->kn_id);
287 	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
288 		p = zpfind(kn->kn_id);
289 		immediate = 1;
290 	}
291 	if (p == NULL) {
292 		return (ESRCH);
293 	}
294 	if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
295 		if (p)
296 			PRELE(p);
297 		return (EACCES);
298 	}
299 
300 	lwkt_gettoken(&p->p_token);
301 	kn->kn_ptr.p_proc = p;
302 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
303 
304 	/*
305 	 * internal flag indicating registration done by kernel
306 	 */
307 	if (kn->kn_flags & EV_FLAG1) {
308 		kn->kn_data = kn->kn_sdata;		/* ppid */
309 		kn->kn_fflags = NOTE_CHILD;
310 		kn->kn_flags &= ~EV_FLAG1;
311 	}
312 
313 	knote_insert(&p->p_klist, kn);
314 
315 	/*
316 	 * Immediately activate any exit notes if the target process is a
317 	 * zombie.  This is necessary to handle the case where the target
318 	 * process, e.g. a child, dies before the kevent is registered.
319 	 */
320 	if (immediate && filt_proc(kn, NOTE_EXIT))
321 		KNOTE_ACTIVATE(kn);
322 	lwkt_reltoken(&p->p_token);
323 	PRELE(p);
324 
325 	return (0);
326 }
327 
328 /*
329  * The knote may be attached to a different process, which may exit,
330  * leaving nothing for the knote to be attached to.  So when the process
331  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
332  * it will be deleted when read out.  However, as part of the knote deletion,
333  * this routine is called, so a check is needed to avoid actually performing
334  * a detach, because the original process does not exist any more.
335  */
336 static void
337 filt_procdetach(struct knote *kn)
338 {
339 	struct proc *p;
340 
341 	if (kn->kn_status & KN_DETACHED)
342 		return;
343 	p = kn->kn_ptr.p_proc;
344 	knote_remove(&p->p_klist, kn);
345 }
346 
347 static int
348 filt_proc(struct knote *kn, long hint)
349 {
350 	u_int event;
351 
352 	/*
353 	 * mask off extra data
354 	 */
355 	event = (u_int)hint & NOTE_PCTRLMASK;
356 
357 	/*
358 	 * if the user is interested in this event, record it.
359 	 */
360 	if (kn->kn_sfflags & event)
361 		kn->kn_fflags |= event;
362 
363 	/*
364 	 * Process is gone, so flag the event as finished.  Detach the
365 	 * knote from the process now because the process will be poof,
366 	 * gone later on.
367 	 */
368 	if (event == NOTE_EXIT) {
369 		struct proc *p = kn->kn_ptr.p_proc;
370 		if ((kn->kn_status & KN_DETACHED) == 0) {
371 			PHOLD(p);
372 			knote_remove(&p->p_klist, kn);
373 			kn->kn_status |= KN_DETACHED;
374 			kn->kn_data = p->p_xstat;
375 			kn->kn_ptr.p_proc = NULL;
376 			PRELE(p);
377 		}
378 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
379 		return (1);
380 	}
381 
382 	/*
383 	 * process forked, and user wants to track the new process,
384 	 * so attach a new knote to it, and immediately report an
385 	 * event with the parent's pid.
386 	 */
387 	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
388 		struct kevent kev;
389 		int error;
390 
391 		/*
392 		 * register knote with new process.
393 		 */
394 		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
395 		kev.filter = kn->kn_filter;
396 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
397 		kev.fflags = kn->kn_sfflags;
398 		kev.data = kn->kn_id;			/* parent */
399 		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
400 		error = kqueue_register(kn->kn_kq, &kev);
401 		if (error)
402 			kn->kn_fflags |= NOTE_TRACKERR;
403 	}
404 
405 	return (kn->kn_fflags != 0);
406 }
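/*
 * Userland usage sketch (illustrative only, not part of this file): watch a
 * child process for exit and fork, letting NOTE_TRACK attach knotes to any
 * children it creates.  'child_pid' and 'kq' are assumed to already exist.
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, child_pid, EVFILT_PROC, EV_ADD | EV_ENABLE,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 *	// On NOTE_EXIT, kev.data carries the exit status (p_xstat above);
 *	// on NOTE_CHILD (delivered for tracked children), kev.data is the
 *	// parent's pid, matching the EV_FLAG1 path in filt_procattach().
 */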
407 
408 static void
409 filt_timerreset(struct knote *kn)
410 {
411 	struct callout *calloutp;
412 	struct timeval tv;
413 	int tticks;
414 
415 	tv.tv_sec = kn->kn_sdata / 1000;
416 	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
417 	tticks = tvtohz_high(&tv);
418 	calloutp = (struct callout *)kn->kn_hook;
419 	callout_reset(calloutp, tticks, filt_timerexpire, kn);
420 }
421 
422 /*
423  * The callout interlocks with callout_terminate() but can still
424  * race a deletion, so if KN_DELETING is set we just don't touch
425  * the knote.
426  */
427 static void
428 filt_timerexpire(void *knx)
429 {
430 	struct knote *kn = knx;
431 	struct kqueue *kq = kn->kn_kq;
432 
433 	lwkt_getpooltoken(kq);
434 
435 	/*
436 	 * Open-code knote_acquire() here, since we can't sleep in a
437 	 * callout; however, we do need to record this expiration.
438 	 */
439 	kn->kn_data++;
440 	if (kn->kn_status & KN_PROCESSING) {
441 		kn->kn_status |= KN_REPROCESS;
442 		if ((kn->kn_status & KN_DELETING) == 0 &&
443 		    (kn->kn_flags & EV_ONESHOT) == 0)
444 			filt_timerreset(kn);
445 		lwkt_relpooltoken(kq);
446 		return;
447 	}
448 	KASSERT((kn->kn_status & KN_DELETING) == 0,
449 	    ("acquire a deleting knote %#x", kn->kn_status));
450 	kn->kn_status |= KN_PROCESSING;
451 
452 	KNOTE_ACTIVATE(kn);
453 	if ((kn->kn_flags & EV_ONESHOT) == 0)
454 		filt_timerreset(kn);
455 
456 	knote_release(kn);
457 
458 	lwkt_relpooltoken(kq);
459 }
460 
461 /*
462  * data contains amount of time to sleep, in milliseconds
463  */
464 static int
465 filt_timerattach(struct knote *kn)
466 {
467 	struct callout *calloutp;
468 	int prev_ncallouts;
469 
470 	prev_ncallouts = atomic_fetchadd_int(&kq_ncallouts, 1);
471 	if (prev_ncallouts >= kq_calloutmax) {
472 		atomic_subtract_int(&kq_ncallouts, 1);
473 		kn->kn_hook = NULL;
474 		return (ENOMEM);
475 	}
476 
477 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
478 	calloutp = kmalloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
479 	callout_init_mp(calloutp);
480 	kn->kn_hook = (caddr_t)calloutp;
481 
482 	filt_timerreset(kn);
483 	return (0);
484 }
485 
486 /*
487  * This function is called with the knote flagged locked but it is
488  * still possible to race a callout event due to the callback blocking.
489  * We must call callout_terminate() instead of callout_stop() to deal
490  * with the race.
491  */
492 static void
493 filt_timerdetach(struct knote *kn)
494 {
495 	struct callout *calloutp;
496 
497 	calloutp = (struct callout *)kn->kn_hook;
498 	callout_terminate(calloutp);
499 	kfree(calloutp, M_KQUEUE);
500 	atomic_subtract_int(&kq_ncallouts, 1);
501 }
502 
503 static int
504 filt_timer(struct knote *kn, long hint)
505 {
506 
507 	return (kn->kn_data != 0);
508 }
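/*
 * Userland usage sketch (illustrative only): arm a periodic 500ms timer.
 * As noted above, kev.data / kn_sdata is interpreted in milliseconds, and
 * EV_CLEAR is set automatically so the expiration count is reset after each
 * retrieval.  'kq' is assumed to be an existing kqueue descriptor.
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 *	// Each retrieved event reports the number of expirations since the
 *	// last retrieval in kev.data (kn_data is incremented in
 *	// filt_timerexpire() above).
 */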
509 
510 /*
511  * EVFILT_USER
512  */
513 static int
514 filt_userattach(struct knote *kn)
515 {
516 	kn->kn_hook = NULL;
517 	if (kn->kn_fflags & NOTE_TRIGGER)
518 		kn->kn_ptr.hookid = 1;
519 	else
520 		kn->kn_ptr.hookid = 0;
521 	return 0;
522 }
523 
524 static void
525 filt_userdetach(struct knote *kn)
526 {
527 	/* nothing to do */
528 }
529 
530 static int
531 filt_user(struct knote *kn, long hint)
532 {
533 	return (kn->kn_ptr.hookid);
534 }
535 
536 static void
537 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
538 {
539 	u_int ffctrl;
540 
541 	switch (type) {
542 	case EVENT_REGISTER:
543 		if (kev->fflags & NOTE_TRIGGER)
544 			kn->kn_ptr.hookid = 1;
545 
546 		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
547 		kev->fflags &= NOTE_FFLAGSMASK;
548 		switch (ffctrl) {
549 		case NOTE_FFNOP:
550 			break;
551 
552 		case NOTE_FFAND:
553 			kn->kn_sfflags &= kev->fflags;
554 			break;
555 
556 		case NOTE_FFOR:
557 			kn->kn_sfflags |= kev->fflags;
558 			break;
559 
560 		case NOTE_FFCOPY:
561 			kn->kn_sfflags = kev->fflags;
562 			break;
563 
564 		default:
565 			/* XXX Return error? */
566 			break;
567 		}
568 		kn->kn_sdata = kev->data;
569 
570 		/*
571 		 * This is not the correct use of EV_CLEAR in an event
572 		 * modification; it should have been passed as a NOTE instead.
573 		 * But we need to maintain compatibility with Apple & FreeBSD.
574 		 *
575 		 * Note however that EV_CLEAR can still be used when doing
576 		 * the initial registration of the event and works as expected
577 		 * (clears the event on reception).
578 		 */
579 		if (kev->flags & EV_CLEAR) {
580 			kn->kn_ptr.hookid = 0;
581 			kn->kn_data = 0;
582 			kn->kn_fflags = 0;
583 		}
584 		break;
585 
586 	case EVENT_PROCESS:
587 		*kev = kn->kn_kevent;
588 		kev->fflags = kn->kn_sfflags;
589 		kev->data = kn->kn_sdata;
590 		if (kn->kn_flags & EV_CLEAR) {
591 			kn->kn_ptr.hookid = 0;
592 			/* kn_data, kn_fflags handled by parent */
593 		}
594 		break;
595 
596 	default:
597 		panic("filt_usertouch() - invalid type (%lu)", type);
598 		break;
599 	}
600 }
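/*
 * Userland usage sketch (illustrative only): EVFILT_USER events are
 * registered once and then fired with NOTE_TRIGGER, typically from another
 * thread.  'kq' is assumed to be a kqueue descriptor shared by both sides.
 *
 *	struct kevent kev;
 *
 *	// registration (e.g. in the event-loop thread)
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// trigger (e.g. from a worker thread); handled by the
 *	// EVENT_REGISTER case above, which sets kn_ptr.hookid
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */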
601 
602 /*
603  * EVFILT_FS
604  */
605 struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);
606 
607 static int
608 filt_fsattach(struct knote *kn)
609 {
610 	kn->kn_flags |= EV_CLEAR;
611 	knote_insert(&fs_klist, kn);
612 
613 	return (0);
614 }
615 
616 static void
617 filt_fsdetach(struct knote *kn)
618 {
619 	knote_remove(&fs_klist, kn);
620 }
621 
622 static int
623 filt_fs(struct knote *kn, long hint)
624 {
625 	kn->kn_fflags |= hint;
626 	return (kn->kn_fflags != 0);
627 }
628 
629 /*
630  * Initialize a kqueue.
631  *
632  * NOTE: The lwp/proc code initializes a kqueue for select/poll ops.
633  *
634  * MPSAFE
635  */
636 void
637 kqueue_init(struct kqueue *kq, struct filedesc *fdp)
638 {
639 	TAILQ_INIT(&kq->kq_knpend);
640 	TAILQ_INIT(&kq->kq_knlist);
641 	kq->kq_count = 0;
642 	kq->kq_fdp = fdp;
643 	SLIST_INIT(&kq->kq_kqinfo.ki_note);
644 }
645 
646 /*
647  * Terminate a kqueue.  Freeing the actual kq itself is left up to the
648  * caller (it might be embedded in a lwp so we don't do it here).
649  *
650  * The kq's knlist must be completely eradicated so block on any
651  * processing races.
652  */
653 void
654 kqueue_terminate(struct kqueue *kq)
655 {
656 	struct knote *kn;
657 
658 	lwkt_getpooltoken(kq);
659 	while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
660 		if (knote_acquire(kn))
661 			knote_detach_and_drop(kn);
662 	}
663 	lwkt_relpooltoken(kq);
664 
665 	if (kq->kq_knhash) {
666 		hashdestroy(kq->kq_knhash, M_KQUEUE, kq->kq_knhashmask);
667 		kq->kq_knhash = NULL;
668 		kq->kq_knhashmask = 0;
669 	}
670 }
671 
672 /*
673  * MPSAFE
674  */
675 int
676 sys_kqueue(struct kqueue_args *uap)
677 {
678 	struct thread *td = curthread;
679 	struct kqueue *kq;
680 	struct file *fp;
681 	int fd, error;
682 
683 	error = falloc(td->td_lwp, &fp, &fd);
684 	if (error)
685 		return (error);
686 	fp->f_flag = FREAD | FWRITE;
687 	fp->f_type = DTYPE_KQUEUE;
688 	fp->f_ops = &kqueueops;
689 
690 	kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
691 	kqueue_init(kq, td->td_proc->p_fd);
692 	fp->f_data = kq;
693 
694 	fsetfd(kq->kq_fdp, fp, fd);
695 	uap->sysmsg_result = fd;
696 	fdrop(fp);
697 	return (error);
698 }
699 
700 /*
701  * Copy 'count' items into the destination list pointed to by uap->eventlist.
702  */
703 static int
704 kevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
705 {
706 	struct kevent_copyin_args *kap;
707 	int error;
708 
709 	kap = (struct kevent_copyin_args *)arg;
710 
711 	error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp));
712 	if (error == 0) {
713 		kap->ka->eventlist += count;
714 		*res += count;
715 	} else {
716 		*res = -1;
717 	}
718 
719 	return (error);
720 }
721 
722 /*
723  * Copy at most 'max' items from the list pointed to by kap->changelist,
724  * return number of items in 'events'.
725  */
726 static int
727 kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
728 {
729 	struct kevent_copyin_args *kap;
730 	int error, count;
731 
732 	kap = (struct kevent_copyin_args *)arg;
733 
734 	count = min(kap->ka->nchanges - kap->pchanges, max);
735 	error = copyin(kap->ka->changelist, kevp, count * sizeof *kevp);
736 	if (error == 0) {
737 		kap->ka->changelist += count;
738 		kap->pchanges += count;
739 		*events = count;
740 	}
741 
742 	return (error);
743 }
744 
745 /*
746  * MPSAFE
747  */
748 int
749 kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
750 	    k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn,
751 	    struct timespec *tsp_in)
752 {
753 	struct kevent *kevp;
754 	struct timespec *tsp, ats;
755 	int i, n, total, error, nerrors = 0;
756 	int lres;
757 	int limit = kq_checkloop;
758 	struct kevent kev[KQ_NEVENTS];
759 	struct knote marker;
760 	struct lwkt_token *tok;
761 
762 	if (tsp_in == NULL || tsp_in->tv_sec || tsp_in->tv_nsec)
763 		atomic_set_int(&curthread->td_mpflags, TDF_MP_BATCH_DEMARC);
764 
765 	tsp = tsp_in;
766 	*res = 0;
767 
768 	for (;;) {
769 		n = 0;
770 		error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
771 		if (error)
772 			return error;
773 		if (n == 0)
774 			break;
775 		for (i = 0; i < n; i++) {
776 			kevp = &kev[i];
777 			kevp->flags &= ~EV_SYSFLAGS;
778 			error = kqueue_register(kq, kevp);
779 
780 			/*
781 			 * If a registration returns an error we
782 			 * immediately post the error.  The kevent()
783 			 * call itself will fail with the error if
784 			 * no space is available for posting.
785 			 *
786 			 * Such errors normally bypass the timeout/blocking
787 			 * code.  However, if the copyoutfn function refuses
788 			 * to post the error (see sys_poll()), then we
789 			 * ignore it too.
790 			 */
791 			if (error || (kevp->flags & EV_RECEIPT)) {
792 				kevp->flags = EV_ERROR;
793 				kevp->data = error;
794 				lres = *res;
795 				kevent_copyoutfn(uap, kevp, 1, res);
796 				if (*res < 0) {
797 					return error;
798 				} else if (lres != *res) {
799 					nevents--;
800 					nerrors++;
801 				}
802 			}
803 		}
804 	}
805 	if (nerrors)
806 		return 0;
807 
808 	/*
809 	 * Acquire/wait for events - setup timeout
810 	 */
811 	if (tsp != NULL) {
812 		if (tsp->tv_sec || tsp->tv_nsec) {
813 			getnanouptime(&ats);
814 			timespecadd(tsp, &ats);		/* tsp = target time */
815 		}
816 	}
817 
818 	/*
819 	 * Loop as required.
820 	 *
821 	 * Collect as many events as we can. Sleeping on successive
822 	 * loops is disabled if copyoutfn has incremented (*res).
823 	 *
824 	 * The loop stops if an error occurs, all events have been
825 	 * scanned (the marker has been reached), or fewer than the
826 	 * maximum number of events is found.
827 	 *
828 	 * The copyoutfn function does not have to increment (*res) in
829 	 * order for the loop to continue.
830 	 *
831 	 * NOTE: doselect() usually passes 0x7FFFFFFF for nevents.
832 	 */
833 	total = 0;
834 	error = 0;
835 	marker.kn_filter = EVFILT_MARKER;
836 	marker.kn_status = KN_PROCESSING;
837 	tok = lwkt_token_pool_lookup(kq);
838 	lwkt_gettoken(tok);
839 	TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
840 	lwkt_reltoken(tok);
841 	while ((n = nevents - total) > 0) {
842 		if (n > KQ_NEVENTS)
843 			n = KQ_NEVENTS;
844 
845 		/*
846 		 * If no events are pending sleep until timeout (if any)
847 		 * or an event occurs.
848 		 *
849 		 * After the sleep completes the marker is moved to the
850 		 * end of the list, making any received events available
851 		 * to our scan.
852 		 */
853 		if (kq->kq_count == 0 && *res == 0) {
854 			int timeout;
855 
856 			if (tsp == NULL) {
857 				timeout = 0;
858 			} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
859 				error = EWOULDBLOCK;
860 				break;
861 			} else {
862 				struct timespec atx = *tsp;
863 
864 				getnanouptime(&ats);
865 				timespecsub(&atx, &ats);
866 				if (atx.tv_sec < 0) {
867 					error = EWOULDBLOCK;
868 					break;
869 				} else {
870 					timeout = atx.tv_sec > 24 * 60 * 60 ?
871 					    24 * 60 * 60 * hz :
872 					    tstohz_high(&atx);
873 				}
874 			}
875 
876 			lwkt_gettoken(tok);
877 			if (kq->kq_count == 0) {
878 				kq->kq_sleep_cnt++;
879 				if (__predict_false(kq->kq_sleep_cnt == 0)) {
880 					/*
881 					 * Guard against possible wrapping.  And
882 					 * set it to 2, so that kqueue_wakeup()
883 					 * can wake everyone up.
884 					 */
885 					kq->kq_sleep_cnt = 2;
886 				}
887 				error = tsleep(kq, PCATCH, "kqread", timeout);
888 
889 				/* don't restart after signals... */
890 				if (error == ERESTART)
891 					error = EINTR;
892 				if (error) {
893 					lwkt_reltoken(tok);
894 					break;
895 				}
896 
897 				TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
898 				TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker,
899 				    kn_tqe);
900 			}
901 			lwkt_reltoken(tok);
902 		}
903 
904 		/*
905 		 * Process all received events
906 		 * Account for all non-spurious events in our total
907 		 */
908 		i = kqueue_scan(kq, kev, n, &marker);
909 		if (i) {
910 			lres = *res;
911 			error = kevent_copyoutfn(uap, kev, i, res);
912 			total += *res - lres;
913 			if (error)
914 				break;
915 		}
916 		if (limit && --limit == 0)
917 			panic("kqueue: checkloop failed i=%d", i);
918 
919 		/*
920 		 * Normally when fewer events are returned than requested
921 		 * we can stop.  However, if only spurious events were
922 		 * collected the copyout will not bump (*res) and we have
923 		 * to continue.
924 		 */
925 		if (i < n && *res)
926 			break;
927 
928 		/*
929 		 * Deal with an edge case where spurious events can cause
930 		 * a loop to occur without moving the marker.  This can
931 		 * prevent kqueue_scan() from picking up new events which
932 		 * race us.  We must be sure to move the marker for this
933 		 * case.
934 		 *
935 		 * NOTE: We do not want to move the marker if events
936 		 *	 were scanned because normal kqueue operations
937 		 *	 may reactivate events.  Moving the marker in
938 		 *	 that case could result in duplicates for the
939 		 *	 same event.
940 		 */
941 		if (i == 0) {
942 			lwkt_gettoken(tok);
943 			TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
944 			TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
945 			lwkt_reltoken(tok);
946 		}
947 	}
948 	lwkt_gettoken(tok);
949 	TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
950 	lwkt_reltoken(tok);
951 
952 	/* Timeouts do not return EWOULDBLOCK. */
953 	if (error == EWOULDBLOCK)
954 		error = 0;
955 	return error;
956 }
957 
958 /*
959  * MPALMOSTSAFE
960  */
961 int
962 sys_kevent(struct kevent_args *uap)
963 {
964 	struct thread *td = curthread;
965 	struct proc *p = td->td_proc;
966 	struct timespec ts, *tsp;
967 	struct kqueue *kq;
968 	struct file *fp = NULL;
969 	struct kevent_copyin_args *kap, ka;
970 	int error;
971 
972 	if (uap->timeout) {
973 		error = copyin(uap->timeout, &ts, sizeof(ts));
974 		if (error)
975 			return (error);
976 		tsp = &ts;
977 	} else {
978 		tsp = NULL;
979 	}
980 	fp = holdfp(p->p_fd, uap->fd, -1);
981 	if (fp == NULL)
982 		return (EBADF);
983 	if (fp->f_type != DTYPE_KQUEUE) {
984 		fdrop(fp);
985 		return (EBADF);
986 	}
987 
988 	kq = (struct kqueue *)fp->f_data;
989 
990 	kap = &ka;
991 	kap->ka = uap;
992 	kap->pchanges = 0;
993 
994 	error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap,
995 			    kevent_copyin, kevent_copyout, tsp);
996 
997 	fdrop(fp);
998 
999 	return (error);
1000 }
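/*
 * Userland usage sketch (illustrative only) tying the above together:
 * create a kqueue, register a read filter on an existing descriptor 'fd'
 * (assumed), and block for up to one second.  Assumes <sys/event.h>,
 * <err.h> and <stdio.h> are included.
 *
 *	struct kevent change, event;
 *	struct timespec ts = { 1, 0 };
 *	int kq, n;
 *
 *	if ((kq = kqueue()) == -1)
 *		err(1, "kqueue");
 *	EV_SET(&change, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *	n = kevent(kq, &change, 1, &event, 1, &ts);
 *	if (n == -1)
 *		err(1, "kevent");
 *	else if (n == 0)
 *		printf("timeout\n");	// EWOULDBLOCK is mapped to 0 above
 *	else
 *		printf("fd %d readable, %jd bytes\n",
 *		    (int)event.ident, (intmax_t)event.data);
 */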
1001 
1002 int
1003 kqueue_register(struct kqueue *kq, struct kevent *kev)
1004 {
1005 	struct filedesc *fdp = kq->kq_fdp;
1006 	struct klist *list = NULL;
1007 	struct filterops *fops;
1008 	struct file *fp = NULL;
1009 	struct knote *kn = NULL;
1010 	struct thread *td;
1011 	int error = 0;
1012 	struct knote_cache_list *cache_list;
1013 
1014 	if (kev->filter < 0) {
1015 		if (kev->filter + EVFILT_SYSCOUNT < 0)
1016 			return (EINVAL);
1017 		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
1018 	} else {
1019 		/*
1020 		 * XXX
1021 		 * filter attach routine is responsible for ensuring that
1022 		 * the identifier can be attached to it.
1023 		 */
1024 		return (EINVAL);
1025 	}
1026 
1027 	if (fops->f_flags & FILTEROP_ISFD) {
1028 		/* validate descriptor */
1029 		fp = holdfp(fdp, kev->ident, -1);
1030 		if (fp == NULL)
1031 			return (EBADF);
1032 	}
1033 
1034 	cache_list = &knote_cache_lists[mycpuid];
1035 	if (SLIST_EMPTY(&cache_list->knote_cache)) {
1036 		struct knote *new_kn;
1037 
1038 		new_kn = knote_alloc();
1039 		crit_enter();
1040 		SLIST_INSERT_HEAD(&cache_list->knote_cache, new_kn, kn_link);
1041 		cache_list->knote_cache_cnt++;
1042 		crit_exit();
1043 	}
1044 
1045 	td = curthread;
1046 	lwkt_getpooltoken(kq);
1047 
1048 	/*
1049 	 * Make sure that only one thread can register events on this kqueue
1050 	 * at a time, so that we do not race even if the registration blocks
1051 	 * (i.e. the kq token is released) and the kqueue is shared between
1052 	 * threads (this should be rare though).
1053 	 */
1054 	while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) {
1055 		kq->kq_state |= KQ_REGWAIT;
1056 		tsleep(&kq->kq_regtd, 0, "kqreg", 0);
1057 	}
1058 	if (__predict_false(kq->kq_regtd != NULL)) {
1059 		/* Recursive calling of kqueue_register() */
1060 		td = NULL;
1061 	} else {
1062 		/* Owner of the kq_regtd, i.e. td != NULL */
1063 		kq->kq_regtd = td;
1064 	}
1065 
1066 	if (fp != NULL) {
1067 		list = &fp->f_klist;
1068 	} else if (kq->kq_knhashmask) {
1069 		list = &kq->kq_knhash[
1070 		    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1071 	}
1072 	if (list != NULL) {
1073 		lwkt_getpooltoken(list);
1074 again:
1075 		SLIST_FOREACH(kn, list, kn_link) {
1076 			if (kn->kn_kq == kq &&
1077 			    kn->kn_filter == kev->filter &&
1078 			    kn->kn_id == kev->ident) {
1079 				if (knote_acquire(kn) == 0)
1080 					goto again;
1081 				break;
1082 			}
1083 		}
1084 		lwkt_relpooltoken(list);
1085 	}
1086 
1087 	/*
1088 	 * NOTE: At this point if kn is non-NULL we will have acquired
1089 	 *	 it and set KN_PROCESSING.
1090 	 */
1091 	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
1092 		error = ENOENT;
1093 		goto done;
1094 	}
1095 
1096 	/*
1097 	 * kn now contains the matching knote, or NULL if no match
1098 	 */
1099 	if (kev->flags & EV_ADD) {
1100 		if (kn == NULL) {
1101 			crit_enter();
1102 			kn = SLIST_FIRST(&cache_list->knote_cache);
1103 			if (kn == NULL) {
1104 				crit_exit();
1105 				kn = knote_alloc();
1106 			} else {
1107 				SLIST_REMOVE_HEAD(&cache_list->knote_cache,
1108 				    kn_link);
1109 				cache_list->knote_cache_cnt--;
1110 				crit_exit();
1111 			}
1112 			kn->kn_fp = fp;
1113 			kn->kn_kq = kq;
1114 			kn->kn_fop = fops;
1115 
1116 			/*
1117 			 * apply reference count to knote structure, and
1118 			 * do not release it at the end of this routine.
1119 			 */
1120 			fp = NULL;
1121 
1122 			kn->kn_sfflags = kev->fflags;
1123 			kn->kn_sdata = kev->data;
1124 			kev->fflags = 0;
1125 			kev->data = 0;
1126 			kn->kn_kevent = *kev;
1127 
1128 			/*
1129 			 * KN_PROCESSING prevents the knote from getting
1130 			 * ripped out from under us while we are trying
1131 			 * to attach it, in case the attach blocks.
1132 			 */
1133 			kn->kn_status = KN_PROCESSING;
1134 			knote_attach(kn);
1135 			if ((error = filter_attach(kn)) != 0) {
1136 				kn->kn_status |= KN_DELETING | KN_REPROCESS;
1137 				knote_drop(kn);
1138 				goto done;
1139 			}
1140 
1141 			/*
1142 			 * Interlock against close races which either tried
1143 			 * to remove our knote while we were blocked or missed
1144 			 * it entirely prior to our attachment.  We do not
1145 			 * want to end up with a knote on a closed descriptor.
1146 			 */
1147 			if ((fops->f_flags & FILTEROP_ISFD) &&
1148 			    checkfdclosed(fdp, kev->ident, kn->kn_fp)) {
1149 				kn->kn_status |= KN_DELETING | KN_REPROCESS;
1150 			}
1151 		} else {
1152 			/*
1153 			 * The user may change some filter values after the
1154 			 * initial EV_ADD, but doing so will not reset any
1155 			 * filters which have already been triggered.
1156 			 */
1157 			KKASSERT(kn->kn_status & KN_PROCESSING);
1158 			if (fops == &user_filtops) {
1159 				filt_usertouch(kn, kev, EVENT_REGISTER);
1160 			} else {
1161 				kn->kn_sfflags = kev->fflags;
1162 				kn->kn_sdata = kev->data;
1163 				kn->kn_kevent.udata = kev->udata;
1164 			}
1165 		}
1166 
1167 		/*
1168 		 * Execute the filter event to immediately activate the
1169 		 * knote if necessary.  If reprocessing events are pending
1170 		 * due to blocking above we do not run the filter here
1171 		 * but instead let knote_release() do it.  Otherwise we
1172 		 * might run the filter on a deleted event.
1173 		 */
1174 		if ((kn->kn_status & KN_REPROCESS) == 0) {
1175 			if (filter_event(kn, 0))
1176 				KNOTE_ACTIVATE(kn);
1177 		}
1178 	} else if (kev->flags & EV_DELETE) {
1179 		/*
1180 		 * Delete the existing knote
1181 		 */
1182 		knote_detach_and_drop(kn);
1183 		goto done;
1184 	} else {
1185 		/*
1186 		 * Modify an existing event.
1187 		 *
1188 		 * The user may change some filter values after the
1189 		 * initial EV_ADD, but doing so will not reset any
1190 		 * filters which have already been triggered.
1191 		 */
1192 		KKASSERT(kn->kn_status & KN_PROCESSING);
1193 		if (fops == &user_filtops) {
1194 			filt_usertouch(kn, kev, EVENT_REGISTER);
1195 		} else {
1196 			kn->kn_sfflags = kev->fflags;
1197 			kn->kn_sdata = kev->data;
1198 			kn->kn_kevent.udata = kev->udata;
1199 		}
1200 
1201 		/*
1202 		 * Execute the filter event to immediately activate the
1203 		 * knote if necessary.  If reprocessing events are pending
1204 		 * due to blocking above we do not run the filter here
1205 		 * but instead let knote_release() do it.  Otherwise we
1206 		 * might run the filter on a deleted event.
1207 		 */
1208 		if ((kn->kn_status & KN_REPROCESS) == 0) {
1209 			if (filter_event(kn, 0))
1210 				KNOTE_ACTIVATE(kn);
1211 		}
1212 	}
1213 
1214 	/*
1215 	 * Disablement does not deactivate a knote here.
1216 	 */
1217 	if ((kev->flags & EV_DISABLE) &&
1218 	    ((kn->kn_status & KN_DISABLED) == 0)) {
1219 		kn->kn_status |= KN_DISABLED;
1220 	}
1221 
1222 	/*
1223 	 * Re-enablement may have to immediately enqueue an active knote.
1224 	 */
1225 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
1226 		kn->kn_status &= ~KN_DISABLED;
1227 		if ((kn->kn_status & KN_ACTIVE) &&
1228 		    ((kn->kn_status & KN_QUEUED) == 0)) {
1229 			knote_enqueue(kn);
1230 		}
1231 	}
1232 
1233 	/*
1234 	 * Handle any required reprocessing
1235 	 */
1236 	knote_release(kn);
1237 	/* kn may be invalid now */
1238 
1239 done:
1240 	if (td != NULL) { /* Owner of the kq_regtd */
1241 		kq->kq_regtd = NULL;
1242 		if (__predict_false(kq->kq_state & KQ_REGWAIT)) {
1243 			kq->kq_state &= ~KQ_REGWAIT;
1244 			wakeup(&kq->kq_regtd);
1245 		}
1246 	}
1247 	lwkt_relpooltoken(kq);
1248 	if (fp != NULL)
1249 		fdrop(fp);
1250 	return (error);
1251 }
1252 
1253 /*
1254  * Scan the kqueue, return the number of active events placed in kevp up
1255  * to count.
1256  *
1257  * Continuous mode events may get recycled, do not continue scanning past
1258  * marker unless no events have been collected.
1259  */
1260 static int
1261 kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
1262             struct knote *marker)
1263 {
1264 	struct knote *kn, local_marker;
1265 	int total;
1266 
1267 	total = 0;
1268 	local_marker.kn_filter = EVFILT_MARKER;
1269 	local_marker.kn_status = KN_PROCESSING;
1270 
1271 	lwkt_getpooltoken(kq);
1272 
1273 	/*
1274 	 * Collect events.
1275 	 */
1276 	TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe);
1277 	while (count) {
1278 		kn = TAILQ_NEXT(&local_marker, kn_tqe);
1279 		if (kn->kn_filter == EVFILT_MARKER) {
1280 			/* Marker reached, we are done */
1281 			if (kn == marker)
1282 				break;
1283 
1284 			/* Move local marker past some other thread's marker */
1285 			kn = TAILQ_NEXT(kn, kn_tqe);
1286 			TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
1287 			TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe);
1288 			continue;
1289 		}
1290 
1291 		/*
1292 		 * We can't skip a knote undergoing processing, otherwise
1293 		 * we risk not returning it when the user process expects
1294 		 * it should be returned.  Sleep and retry.
1295 		 */
1296 		if (knote_acquire(kn) == 0)
1297 			continue;
1298 
1299 		/*
1300 		 * Remove the event for processing.
1301 		 *
1302 		 * WARNING!  We must leave KN_QUEUED set to prevent the
1303 		 *	     event from being KNOTE_ACTIVATE()d while
1304 		 *	     the queue state is in limbo, in case we
1305 		 *	     block.
1306 		 */
1307 		TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
1308 		kq->kq_count--;
1309 
1310 		/*
1311 		 * We have to deal with an extremely important race against
1312 		 * file descriptor close()s here.  The file descriptor can
1313 		 * disappear MPSAFE, and there is a small window of
1314 		 * opportunity between that and the call to knote_fdclose().
1315 		 *
1316 		 * If we hit that window here while doselect or dopoll is
1317 		 * trying to delete a spurious event they will not be able
1318 		 * to match up the event against a knote and will go haywire.
1319 		 */
1320 		if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
1321 		    checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
1322 			kn->kn_status |= KN_DELETING | KN_REPROCESS;
1323 		}
1324 
1325 		if (kn->kn_status & KN_DISABLED) {
1326 			/*
1327 			 * If disabled we ensure the event is not queued
1328 			 * but leave its active bit set.  On re-enablement
1329 			 * the event may be immediately triggered.
1330 			 */
1331 			kn->kn_status &= ~KN_QUEUED;
1332 		} else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
1333 			   (kn->kn_status & KN_DELETING) == 0 &&
1334 			   filter_event(kn, 0) == 0) {
1335 			/*
1336 			 * If not running in one-shot mode and the event
1337 			 * is no longer present we ensure it is removed
1338 			 * from the queue and ignore it.
1339 			 */
1340 			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1341 		} else {
1342 			/*
1343 			 * Post the event
1344 			 */
1345 			if (kn->kn_fop == &user_filtops)
1346 				filt_usertouch(kn, kevp, EVENT_PROCESS);
1347 			else
1348 				*kevp = kn->kn_kevent;
1349 			++kevp;
1350 			++total;
1351 			--count;
1352 
1353 			if (kn->kn_flags & EV_ONESHOT) {
1354 				kn->kn_status &= ~KN_QUEUED;
1355 				kn->kn_status |= KN_DELETING | KN_REPROCESS;
1356 			} else {
1357 				if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
1358 					if (kn->kn_flags & EV_CLEAR) {
1359 						kn->kn_data = 0;
1360 						kn->kn_fflags = 0;
1361 					}
1362 					if (kn->kn_flags & EV_DISPATCH) {
1363 						kn->kn_status |= KN_DISABLED;
1364 					}
1365 					kn->kn_status &= ~(KN_QUEUED |
1366 							   KN_ACTIVE);
1367 				} else {
1368 					TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
1369 					kq->kq_count++;
1370 				}
1371 			}
1372 		}
1373 
1374 		/*
1375 		 * Handle any post-processing states
1376 		 */
1377 		knote_release(kn);
1378 	}
1379 	TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
1380 
1381 	lwkt_relpooltoken(kq);
1382 	return (total);
1383 }
1384 
1385 /*
1386  * XXX
1387  * This could be expanded to call kqueue_scan, if desired.
1388  *
1389  * MPSAFE
1390  */
1391 static int
1392 kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
1393 {
1394 	return (ENXIO);
1395 }
1396 
1397 /*
1398  * MPSAFE
1399  */
1400 static int
1401 kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
1402 {
1403 	return (ENXIO);
1404 }
1405 
1406 /*
1407  * MPALMOSTSAFE
1408  */
1409 static int
1410 kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
1411 	     struct ucred *cred, struct sysmsg *msg)
1412 {
1413 	struct kqueue *kq;
1414 	int error;
1415 
1416 	kq = (struct kqueue *)fp->f_data;
1417 	lwkt_getpooltoken(kq);
1418 	switch(com) {
1419 	case FIOASYNC:
1420 		if (*(int *)data)
1421 			kq->kq_state |= KQ_ASYNC;
1422 		else
1423 			kq->kq_state &= ~KQ_ASYNC;
1424 		error = 0;
1425 		break;
1426 	case FIOSETOWN:
1427 		error = fsetown(*(int *)data, &kq->kq_sigio);
1428 		break;
1429 	default:
1430 		error = ENOTTY;
1431 		break;
1432 	}
1433 	lwkt_relpooltoken(kq);
1434 	return (error);
1435 }
1436 
1437 /*
1438  * MPSAFE
1439  */
1440 static int
1441 kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
1442 {
1443 	struct kqueue *kq = (struct kqueue *)fp->f_data;
1444 
1445 	bzero((void *)st, sizeof(*st));
1446 	st->st_size = kq->kq_count;
1447 	st->st_blksize = sizeof(struct kevent);
1448 	st->st_mode = S_IFIFO;
1449 	return (0);
1450 }
1451 
1452 /*
1453  * MPSAFE
1454  */
1455 static int
1456 kqueue_close(struct file *fp)
1457 {
1458 	struct kqueue *kq = (struct kqueue *)fp->f_data;
1459 
1460 	kqueue_terminate(kq);
1461 
1462 	fp->f_data = NULL;
1463 	funsetown(&kq->kq_sigio);
1464 
1465 	kfree(kq, M_KQUEUE);
1466 	return (0);
1467 }
1468 
1469 static void
1470 kqueue_wakeup(struct kqueue *kq)
1471 {
1472 	if (kq->kq_sleep_cnt) {
1473 		u_int sleep_cnt = kq->kq_sleep_cnt;
1474 
1475 		kq->kq_sleep_cnt = 0;
1476 		if (sleep_cnt == 1)
1477 			wakeup_one(kq);
1478 		else
1479 			wakeup(kq);
1480 	}
1481 	KNOTE(&kq->kq_kqinfo.ki_note, 0);
1482 }
1483 
1484 /*
1485  * Calls filterops f_attach function, acquiring mplock if filter is not
1486  * marked as FILTEROP_MPSAFE.
1487  *
1488  * Caller must be holding the related kq token
1489  */
1490 static int
1491 filter_attach(struct knote *kn)
1492 {
1493 	int ret;
1494 
1495 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1496 		ret = kn->kn_fop->f_attach(kn);
1497 	} else {
1498 		get_mplock();
1499 		ret = kn->kn_fop->f_attach(kn);
1500 		rel_mplock();
1501 	}
1502 	return (ret);
1503 }
1504 
1505 /*
1506  * Detach the knote and drop it, destroying the knote.
1507  *
1508  * Calls filterops f_detach function, acquiring mplock if filter is not
1509  * marked as FILTEROP_MPSAFE.
1510  *
1511  * Caller must be holding the related kq token
1512  */
1513 static void
1514 knote_detach_and_drop(struct knote *kn)
1515 {
1516 	kn->kn_status |= KN_DELETING | KN_REPROCESS;
1517 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1518 		kn->kn_fop->f_detach(kn);
1519 	} else {
1520 		get_mplock();
1521 		kn->kn_fop->f_detach(kn);
1522 		rel_mplock();
1523 	}
1524 	knote_drop(kn);
1525 }
1526 
1527 /*
1528  * Calls filterops f_event function, acquiring mplock if filter is not
1529  * marked as FILTEROP_MPSAFE.
1530  *
1531  * If the knote is in the middle of being created or deleted we cannot
1532  * safely call the filter op.
1533  *
1534  * Caller must be holding the related kq token
1535  */
1536 static int
1537 filter_event(struct knote *kn, long hint)
1538 {
1539 	int ret;
1540 
1541 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1542 		ret = kn->kn_fop->f_event(kn, hint);
1543 	} else {
1544 		get_mplock();
1545 		ret = kn->kn_fop->f_event(kn, hint);
1546 		rel_mplock();
1547 	}
1548 	return (ret);
1549 }
1550 
1551 /*
1552  * Walk down a list of knotes, activating them if their event has triggered.
1553  *
1554  * If we encounter any knotes which are undergoing processing we just mark
1555  * them for reprocessing and do not try to [re]activate the knote.  However,
1556  * if a hint is being passed we have to wait and that makes things a bit
1557  * sticky.
1558  */
1559 void
1560 knote(struct klist *list, long hint)
1561 {
1562 	struct kqueue *kq;
1563 	struct knote *kn;
1564 	struct knote *kntmp;
1565 
1566 	lwkt_getpooltoken(list);
1567 restart:
1568 	SLIST_FOREACH(kn, list, kn_next) {
1569 		kq = kn->kn_kq;
1570 		lwkt_getpooltoken(kq);
1571 
1572 		/* temporary verification hack */
1573 		SLIST_FOREACH(kntmp, list, kn_next) {
1574 			if (kn == kntmp)
1575 				break;
1576 		}
1577 		if (kn != kntmp || kn->kn_kq != kq) {
1578 			lwkt_relpooltoken(kq);
1579 			goto restart;
1580 		}
1581 
1582 		if (kn->kn_status & KN_PROCESSING) {
1583 			/*
1584 			 * Someone else is processing the knote, ask the
1585 			 * other thread to reprocess it and don't mess
1586 			 * with it otherwise.
1587 			 */
1588 			if (hint == 0) {
1589 				kn->kn_status |= KN_REPROCESS;
1590 				lwkt_relpooltoken(kq);
1591 				continue;
1592 			}
1593 
1594 			/*
1595 			 * If the hint is non-zero we have to wait or risk
1596 			 * losing the state the caller is trying to update.
1597 			 *
1598 			 * XXX This is a real problem, certain process
1599 			 *     and signal filters will bump kn_data for
1600 			 *     already-processed notes more than once if
1601 			 *     we restart the list scan.  FIXME.
1602 			 */
1603 			kn->kn_status |= KN_WAITING | KN_REPROCESS;
1604 			tsleep(kn, 0, "knotec", hz);
1605 			lwkt_relpooltoken(kq);
1606 			goto restart;
1607 		}
1608 
1609 		/*
1610 		 * Become the reprocessing master ourselves.
1611 		 *
1612 		 * If hint is non-zero running the event is mandatory
1613 		 * when not deleting so do it whether reprocessing is
1614 		 * set or not.
1615 		 */
1616 		kn->kn_status |= KN_PROCESSING;
1617 		if ((kn->kn_status & KN_DELETING) == 0) {
1618 			if (filter_event(kn, hint))
1619 				KNOTE_ACTIVATE(kn);
1620 		}
1621 		if (knote_release(kn)) {
1622 			lwkt_relpooltoken(kq);
1623 			goto restart;
1624 		}
1625 		lwkt_relpooltoken(kq);
1626 	}
1627 	lwkt_relpooltoken(list);
1628 }
1629 
1630 /*
1631  * Insert knote at head of klist.
1632  *
1633  * This function may only be called via a filter function and thus
1634  * kq_token should already be held and marked for processing.
1635  */
1636 void
1637 knote_insert(struct klist *klist, struct knote *kn)
1638 {
1639 	lwkt_getpooltoken(klist);
1640 	KKASSERT(kn->kn_status & KN_PROCESSING);
1641 	SLIST_INSERT_HEAD(klist, kn, kn_next);
1642 	lwkt_relpooltoken(klist);
1643 }
1644 
1645 /*
1646  * Remove knote from a klist
1647  *
1648  * This function may only be called via a filter function and thus
1649  * kq_token should already be held and marked for processing.
1650  */
1651 void
1652 knote_remove(struct klist *klist, struct knote *kn)
1653 {
1654 	lwkt_getpooltoken(klist);
1655 	KKASSERT(kn->kn_status & KN_PROCESSING);
1656 	SLIST_REMOVE(klist, kn, knote, kn_next);
1657 	lwkt_relpooltoken(klist);
1658 }
1659 
1660 void
1661 knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
1662 		    struct filterops *ops, void *hook)
1663 {
1664 	struct kqueue *kq;
1665 	struct knote *kn;
1666 
1667 	lwkt_getpooltoken(&src->ki_note);
1668 	lwkt_getpooltoken(&dst->ki_note);
1669 	while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
1670 		kq = kn->kn_kq;
1671 		lwkt_getpooltoken(kq);
1672 		if (SLIST_FIRST(&src->ki_note) != kn || kn->kn_kq != kq) {
1673 			lwkt_relpooltoken(kq);
1674 			continue;
1675 		}
1676 		if (knote_acquire(kn)) {
1677 			knote_remove(&src->ki_note, kn);
1678 			kn->kn_fop = ops;
1679 			kn->kn_hook = hook;
1680 			knote_insert(&dst->ki_note, kn);
1681 			knote_release(kn);
1682 			/* kn may be invalid now */
1683 		}
1684 		lwkt_relpooltoken(kq);
1685 	}
1686 	lwkt_relpooltoken(&dst->ki_note);
1687 	lwkt_relpooltoken(&src->ki_note);
1688 }
1689 
1690 /*
1691  * Remove all knotes referencing a specified fd
1692  */
1693 void
1694 knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
1695 {
1696 	struct kqueue *kq;
1697 	struct knote *kn;
1698 	struct knote *kntmp;
1699 
1700 	lwkt_getpooltoken(&fp->f_klist);
1701 restart:
1702 	SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
1703 		if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
1704 			kq = kn->kn_kq;
1705 			lwkt_getpooltoken(kq);
1706 
1707 			/* temporary verification hack */
1708 			SLIST_FOREACH(kntmp, &fp->f_klist, kn_link) {
1709 				if (kn == kntmp)
1710 					break;
1711 			}
1712 			if (kn != kntmp || kn->kn_kq->kq_fdp != fdp ||
1713 			    kn->kn_id != fd || kn->kn_kq != kq) {
1714 				lwkt_relpooltoken(kq);
1715 				goto restart;
1716 			}
1717 			if (knote_acquire(kn))
1718 				knote_detach_and_drop(kn);
1719 			lwkt_relpooltoken(kq);
1720 			goto restart;
1721 		}
1722 	}
1723 	lwkt_relpooltoken(&fp->f_klist);
1724 }
1725 
1726 /*
1727  * Low level attach function.
1728  *
1729  * The knote should already be marked for processing.
1730  * Caller must hold the related kq token.
1731  */
1732 static void
1733 knote_attach(struct knote *kn)
1734 {
1735 	struct klist *list;
1736 	struct kqueue *kq = kn->kn_kq;
1737 
1738 	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
1739 		KKASSERT(kn->kn_fp);
1740 		list = &kn->kn_fp->f_klist;
1741 	} else {
1742 		if (kq->kq_knhashmask == 0)
1743 			kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1744 						 &kq->kq_knhashmask);
1745 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1746 	}
1747 	lwkt_getpooltoken(list);
1748 	SLIST_INSERT_HEAD(list, kn, kn_link);
1749 	lwkt_relpooltoken(list);
1750 	TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
1751 }
1752 
1753 /*
1754  * Low level drop function.
1755  *
1756  * The knote should already be marked for processing.
1757  * Caller must hold the related kq token.
1758  */
1759 static void
1760 knote_drop(struct knote *kn)
1761 {
1762 	struct kqueue *kq;
1763 	struct klist *list;
1764 
1765 	kq = kn->kn_kq;
1766 
1767 	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
1768 		list = &kn->kn_fp->f_klist;
1769 	else
1770 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1771 
1772 	lwkt_getpooltoken(list);
1773 	SLIST_REMOVE(list, kn, knote, kn_link);
1774 	lwkt_relpooltoken(list);
1775 	TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
1776 	if (kn->kn_status & KN_QUEUED)
1777 		knote_dequeue(kn);
1778 	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
1779 		fdrop(kn->kn_fp);
1780 		kn->kn_fp = NULL;
1781 	}
1782 	knote_free(kn);
1783 }
1784 
1785 /*
1786  * Low level enqueue function.
1787  *
1788  * The knote should already be marked for processing.
1789  * Caller must be holding the kq token
1790  */
1791 static void
1792 knote_enqueue(struct knote *kn)
1793 {
1794 	struct kqueue *kq = kn->kn_kq;
1795 
1796 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
1797 	TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
1798 	kn->kn_status |= KN_QUEUED;
1799 	++kq->kq_count;
1800 
1801 	/*
1802 	 * Send SIGIO on request (typically set up as a mailbox signal)
1803 	 */
1804 	if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
1805 		pgsigio(kq->kq_sigio, SIGIO, 0);
1806 
1807 	kqueue_wakeup(kq);
1808 }
1809 
1810 /*
1811  * Low level dequeue function.
1812  *
1813  * The knote should already be marked for processing.
1814  * Caller must be holding the kq token
1815  */
1816 static void
1817 knote_dequeue(struct knote *kn)
1818 {
1819 	struct kqueue *kq = kn->kn_kq;
1820 
1821 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
1822 	TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
1823 	kn->kn_status &= ~KN_QUEUED;
1824 	kq->kq_count--;
1825 }
1826 
1827 static struct knote *
1828 knote_alloc(void)
1829 {
1830 	return kmalloc(sizeof(struct knote), M_KQUEUE, M_WAITOK);
1831 }
1832 
1833 static void
1834 knote_free(struct knote *kn)
1835 {
1836 	struct knote_cache_list *cache_list;
1837 
1838 	cache_list = &knote_cache_lists[mycpuid];
1839 	if (cache_list->knote_cache_cnt < KNOTE_CACHE_MAX) {
1840 		crit_enter();
1841 		SLIST_INSERT_HEAD(&cache_list->knote_cache, kn, kn_link);
1842 		cache_list->knote_cache_cnt++;
1843 		crit_exit();
1844 		return;
1845 	}
1846 	kfree(kn, M_KQUEUE);
1847 }
1848