/*	$OpenBSD: kern_event.c,v 1.41 2011/07/02 22:20:08 nicm Exp $	*/

/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_event.c,v 1.22 2001/02/23 20:32:42 jlemon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/pool.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/syscallargs.h>
#include <sys/timeout.h>

int	kqueue_scan(struct file *fp, int maxevents,
		    struct kevent *ulistp, const struct timespec *timeout,
		    struct proc *p, int *retval);

int	kqueue_read(struct file *fp, off_t *poff, struct uio *uio,
		    struct ucred *cred);
int	kqueue_write(struct file *fp, off_t *poff, struct uio *uio,
		    struct ucred *cred);
int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
		    struct proc *p);
int	kqueue_poll(struct file *fp, int events, struct proc *p);
int	kqueue_kqfilter(struct file *fp, struct knote *kn);
int	kqueue_stat(struct file *fp, struct stat *st, struct proc *p);
int	kqueue_close(struct file *fp, struct proc *p);
void	kqueue_wakeup(struct kqueue *kq);

struct fileops kqueueops = {
	kqueue_read,
	kqueue_write,
	kqueue_ioctl,
	kqueue_poll,
	kqueue_kqfilter,
	kqueue_stat,
	kqueue_close
};

void	knote_attach(struct knote *kn, struct filedesc *fdp);
void	knote_drop(struct knote *kn, struct proc *p, struct filedesc *fdp);
void	knote_enqueue(struct knote *kn);
void	knote_dequeue(struct knote *kn);
#define knote_alloc() ((struct knote *)pool_get(&knote_pool, PR_WAITOK))
#define knote_free(kn) pool_put(&knote_pool, (kn))

void	filt_kqdetach(struct knote *kn);
int	filt_kqueue(struct knote *kn, long hint);
int	filt_procattach(struct knote *kn);
void	filt_procdetach(struct knote *kn);
int	filt_proc(struct knote *kn, long hint);
int	filt_fileattach(struct knote *kn);
void	filt_timerexpire(void *knx);
int	filt_timerattach(struct knote *kn);
void	filt_timerdetach(struct knote *kn);
int	filt_timer(struct knote *kn, long hint);
void	filt_seltruedetach(struct knote *kn);

struct filterops kqread_filtops =
	{ 1, NULL, filt_kqdetach, filt_kqueue };
struct filterops proc_filtops =
	{ 0, filt_procattach, filt_procdetach, filt_proc };
struct filterops file_filtops =
	{ 1, filt_fileattach, NULL, NULL };
struct filterops timer_filtops =
	{ 0, filt_timerattach, filt_timerdetach, filt_timer };

struct	pool knote_pool;
struct	pool kqueue_pool;
int kq_ntimeouts = 0;
int kq_timeoutmax = (4 * 1024);

#define KNOTE_ACTIVATE(kn) do {						\
	kn->kn_status |= KN_ACTIVE;					\
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
		knote_enqueue(kn);					\
} while(0)

#define	KN_HASHSIZE		64		/* XXX should be tunable */
#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern struct filterops sig_filtops;
#ifdef notyet
extern struct filterops aio_filtops;
#endif

/*
 * Table for all system-defined filters.
 */
struct filterops *sysfilt_ops[] = {
	&file_filtops,			/* EVFILT_READ */
	&file_filtops,			/* EVFILT_WRITE */
	NULL, /*&aio_filtops,*/		/* EVFILT_AIO */
	&file_filtops,			/* EVFILT_VNODE */
	&proc_filtops,			/* EVFILT_PROC */
	&sig_filtops,			/* EVFILT_SIGNAL */
	&timer_filtops,			/* EVFILT_TIMER */
};

void kqueue_init(void);

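/*
 * Initialize the pools used to allocate kqueue and knote structures.
 */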
void
kqueue_init(void)
{

	pool_init(&kqueue_pool, sizeof(struct kqueue), 0, 0, 0, "kqueuepl",
	    &pool_allocator_nointr);
	pool_init(&knote_pool, sizeof(struct knote), 0, 0, 0, "knotepl",
	    &pool_allocator_nointr);
}

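/*
 * Attach a knote to a descriptor by handing it to the backing file's
 * own kqfilter routine.
 */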
int
filt_fileattach(struct knote *kn)
{
	struct file *fp = kn->kn_fp;

	return ((*fp->f_ops->fo_kqfilter)(fp, kn));
}

int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);

	kn->kn_fop = &kqread_filtops;
	SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
	return (0);
}

void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

int
filt_procattach(struct knote *kn)
{
	struct proc *p;

	p = pfind(kn->kn_id);
	if (p == NULL)
		return (ESRCH);

	/* threads and exiting processes can't be specified */
	if (p->p_flag & (P_THREAD|P_WEXIT))
		return (ESRCH);

	/*
	 * Fail if it's not owned by you, or the last exec gave us
	 * setuid/setgid privs (unless you're root).
	 */
	if (p->p_p != curproc->p_p &&
	    (p->p_cred->p_ruid != curproc->p_cred->p_ruid ||
	    (p->p_p->ps_flags & PS_SUGID)) && suser(curproc, 0) != 0)
		return (EACCES);

	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;		/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;		/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	/* XXX lock the proc here while adding to the list? */
	SLIST_INSERT_HEAD(&p->p_p->ps_klist, kn, kn_selnext);

	return (0);
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process does not exist any more.
 */
void
filt_procdetach(struct knote *kn)
{
	struct proc *p = kn->kn_ptr.p_proc;

	if (kn->kn_status & KN_DETACHED)
		return;

	/* XXX locking?  this might modify another process. */
	SLIST_REMOVE(&p->p_p->ps_klist, kn, knote, kn_selnext);
}

int
filt_proc(struct knote *kn, long hint)
{
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * process is gone, so flag the event as finished and remove it
	 * from the process's klist
	 */
	if (event == NOTE_EXIT) {
		struct process *pr = kn->kn_ptr.p_proc->p_p;

		kn->kn_status |= KN_DETACHED;
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		SLIST_REMOVE(&pr->ps_klist, kn, knote, kn_selnext);
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		error = kqueue_register(kn->kn_kq, &kev, NULL);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}

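/*
 * A timer filter's timeout has fired: count the expiration, activate the
 * knote and, unless this is a one-shot event, reschedule the timeout.
 */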
void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	struct timeval tv;
	int tticks;

	kn->kn_data++;
	KNOTE_ACTIVATE(kn);

	if ((kn->kn_flags & EV_ONESHOT) == 0) {
		tv.tv_sec = kn->kn_sdata / 1000;
		tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
		tticks = tvtohz(&tv);
		timeout_add((struct timeout *)kn->kn_hook, tticks);
	}
}


/*
 * data contains amount of time to sleep, in milliseconds
 */
int
filt_timerattach(struct knote *kn)
{
	struct timeout *to;
	struct timeval tv;
	int tticks;

	if (kq_ntimeouts > kq_timeoutmax)
		return (ENOMEM);
	kq_ntimeouts++;

	tv.tv_sec = kn->kn_sdata / 1000;
	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
	tticks = tvtohz(&tv);

	kn->kn_flags |= EV_CLEAR;	/* automatically set */
	to = malloc(sizeof(*to), M_KEVENT, M_WAITOK);
	timeout_set(to, filt_timerexpire, kn);
	timeout_add(to, tticks);
	kn->kn_hook = to;

	return (0);
}

void
filt_timerdetach(struct knote *kn)
{
	struct timeout *to;

	to = (struct timeout *)kn->kn_hook;
	timeout_del(to);
	free(to, M_KEVENT);
	kq_ntimeouts--;
}

int
filt_timer(struct knote *kn, long hint)
{
	return (kn->kn_data != 0);
}


372  * filt_seltrue:
373  *
374  *	This filter "event" routine simulates seltrue().
375  */
376 int
377 filt_seltrue(struct knote *kn, long hint)
378 {
379 
380 	/*
381 	 * We don't know how much data can be read/written,
382 	 * but we know that it *can* be.  This is about as
383 	 * good as select/poll does as well.
384 	 */
385 	kn->kn_data = 0;
386 	return (1);
387 }
388 
389 /*
390  * This provides full kqfilter entry for device switch tables, which
391  * has same effect as filter using filt_seltrue() as filter method.
392  */
void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops seltrue_filtops =
	{ 1, NULL, filt_seltruedetach, filt_seltrue };

int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (EINVAL);
	}

	/* Nothing more to do */
	return (0);
}

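/*
 * kqueue(2): allocate a new kqueue, wrap it in a file and return the
 * descriptor to the caller.
 */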
int
sys_kqueue(struct proc *p, void *v, register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	error = falloc(p, &fp, &fd);
	if (error)
		return (error);
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	kq = pool_get(&kqueue_pool, PR_WAITOK|PR_ZERO);
	TAILQ_INIT(&kq->kq_head);
	fp->f_data = (caddr_t)kq;
	*retval = fd;
	if (fdp->fd_knlistsize < 0)
		fdp->fd_knlistsize = 0;		/* this process has a kq */
	kq->kq_fdp = fdp;
	FILE_SET_MATURE(fp);
	return (0);
}

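/*
 * kevent(2): apply the changelist entries with kqueue_register(), then
 * gather pending events with kqueue_scan().
 */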
int
sys_kevent(struct proc *p, void *v, register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct sys_kevent_args /* {
		syscallarg(int)	fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(int)	nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(int)	nevents;
		syscallarg(const struct timespec *) timeout;
	} */ *uap = v;
	struct kevent *kevp;
	struct kqueue *kq;
	struct file *fp;
	struct timespec ts;
	int i, n, nerrors, error;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL ||
	    (fp->f_type != DTYPE_KQUEUE))
		return (EBADF);

	FREF(fp);

	if (SCARG(uap, timeout) != NULL) {
		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
		if (error)
			goto done;
		SCARG(uap, timeout) = &ts;
	}

	kq = (struct kqueue *)fp->f_data;
	nerrors = 0;

	while (SCARG(uap, nchanges) > 0) {
		n = SCARG(uap, nchanges) > KQ_NEVENTS
			? KQ_NEVENTS : SCARG(uap, nchanges);
		error = copyin(SCARG(uap, changelist), kq->kq_kev,
		    n * sizeof(struct kevent));
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &kq->kq_kev[i];
			kevp->flags &= ~EV_SYSFLAGS;
			error = kqueue_register(kq, kevp, p);
			if (error) {
				if (SCARG(uap, nevents) != 0) {
					kevp->flags = EV_ERROR;
					kevp->data = error;
					(void) copyout((caddr_t)kevp,
					    (caddr_t)SCARG(uap, eventlist),
					    sizeof(*kevp));
					SCARG(uap, eventlist)++;
					SCARG(uap, nevents)--;
					nerrors++;
				} else {
					goto done;
				}
			}
		}
		SCARG(uap, nchanges) -= n;
		SCARG(uap, changelist) += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	error = kqueue_scan(fp, SCARG(uap, nevents), SCARG(uap, eventlist),
			    SCARG(uap, timeout), p, &n);
	*retval = n;
 done:
	FRELE(fp);
	return (error);
}

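/*
 * Apply a single changelist entry to a kqueue: look up an existing knote
 * for the (identifier, filter) pair, allocate and attach a new one for
 * EV_ADD, and process EV_DELETE, EV_DISABLE and EV_ENABLE requests.
 */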
int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p)
{
	struct filedesc *fdp = kq->kq_fdp;
	struct filterops *fops = NULL;
	struct file *fp = NULL;
	struct knote *kn = NULL;
	int s, error = 0;

	if (kev->filter < 0) {
		if (kev->filter + EVFILT_SYSCOUNT < 0)
			return (EINVAL);
		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
	}

	if (fops == NULL) {
		/*
		 * XXX
		 * filter attach routine is responsible for ensuring that
		 * the identifier can be attached to it.
		 */
		return (EINVAL);
	}

	if (fops->f_isfd) {
		/* validate descriptor */
		if ((fp = fd_getfile(fdp, kev->ident)) == NULL)
			return (EBADF);
		FREF(fp);
		fp->f_count++;

		if (kev->ident < fdp->fd_knlistsize) {
			SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
		}
	} else {
		if (fdp->fd_knhashmask != 0) {
			struct klist *list;

			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link)
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
		}
	}

	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		error = ENOENT;
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {

		if (kn == NULL) {
			kn = knote_alloc();
			if (kn == NULL) {
				error = ENOMEM;
				goto done;
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			if (fp != NULL)
				FRELE(fp);
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			knote_attach(kn, fdp);
			if ((error = fops->f_attach(kn)) != 0) {
				knote_drop(kn, p, fdp);
				goto done;
			}
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}

		s = splhigh();
		if (kn->kn_fop->f_event(kn, 0))
			KNOTE_ACTIVATE(kn);
		splx(s);

	} else if (kev->flags & EV_DELETE) {
		kn->kn_fop->f_detach(kn);
		knote_drop(kn, p, p->p_fd);
		goto done;
	}

	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		s = splhigh();
		kn->kn_status |= KN_DISABLED;
		splx(s);
	}

	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		s = splhigh();
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0))
			knote_enqueue(kn);
		splx(s);
	}

done:
	if (fp != NULL)
		closef(fp, p);
	return (error);
}

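/*
 * Collect up to maxevents triggered events from a kqueue, sleeping until
 * the timeout expires if none are pending, and copy them out to ulistp.
 */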
int
kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp,
	const struct timespec *tsp, struct proc *p, int *retval)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;
	struct kevent *kevp;
	struct timeval atv, rtv, ttv;
	struct knote *kn, marker;
	int s, count, timeout, nkev = 0, error = 0;

	count = maxevents;
	if (count == 0)
		goto done;

	if (tsp != NULL) {
		TIMESPEC_TO_TIMEVAL(&atv, tsp);
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
			/* No timeout, just poll */
			timeout = -1;
			goto start;
		}
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}

		timeout = atv.tv_sec > 24 * 60 * 60 ?
			24 * 60 * 60 * hz : tvtohz(&atv);

		getmicrouptime(&rtv);
		timeradd(&atv, &rtv, &atv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
		timeout = 0;
	}
	goto start;

retry:
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timercmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timersub(&ttv, &rtv, &ttv);
		timeout = ttv.tv_sec > 24 * 60 * 60 ?
			24 * 60 * 60 * hz : tvtohz(&ttv);
	}

start:
	kevp = kq->kq_kev;
	s = splhigh();
	if (kq->kq_count == 0) {
		if (timeout < 0) {
			error = EWOULDBLOCK;
		} else {
			kq->kq_state |= KQ_SLEEP;
			error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
		}
		splx(s);
		if (error == 0)
			goto retry;
		/* don't restart after signals... */
		if (error == ERESTART)
			error = EINTR;
		else if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);
	while (count) {
		kn = TAILQ_FIRST(&kq->kq_head);
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if (kn == &marker) {
			splx(s);
			if (count == maxevents)
				goto retry;
			goto done;
		}
		if (kn->kn_status & KN_DISABLED) {
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			continue;
		}
		if ((kn->kn_flags & EV_ONESHOT) == 0 &&
		    kn->kn_fop->f_event(kn, 0) == 0) {
			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
			kq->kq_count--;
			continue;
		}
		*kevp = kn->kn_kevent;
		kevp++;
		nkev++;
		if (kn->kn_flags & EV_ONESHOT) {
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			splx(s);
			kn->kn_fop->f_detach(kn);
			knote_drop(kn, p, p->p_fd);
			s = splhigh();
		} else if (kn->kn_flags & EV_CLEAR) {
			kn->kn_data = 0;
			kn->kn_fflags = 0;
			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
			kq->kq_count--;
		} else {
			TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		}
		count--;
		if (nkev == KQ_NEVENTS) {
			splx(s);
			error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp,
			    sizeof(struct kevent) * nkev);
			ulistp += nkev;
			nkev = 0;
			kevp = kq->kq_kev;
			s = splhigh();
			if (error)
				break;
		}
	}
	TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
	splx(s);
done:
	if (nkev != 0)
		error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp,
		    sizeof(struct kevent) * nkev);
	*retval = maxevents - count;
	return (error);
}

/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
int
kqueue_read(struct file *fp, off_t *poff, struct uio *uio, struct ucred *cred)
{
	return (ENXIO);
}

/*ARGSUSED*/
int
kqueue_write(struct file *fp, off_t *poff, struct uio *uio, struct ucred *cred)
{
	return (ENXIO);
}

/*ARGSUSED*/
int
kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
{
	return (ENOTTY);
}

/*ARGSUSED*/
int
kqueue_poll(struct file *fp, int events, struct proc *p)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;
	int revents = 0;
	int s = splhigh();

	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(p, &kq->kq_sel);
			kq->kq_state |= KQ_SEL;
		}
	}
	splx(s);
	return (revents);
}

/*ARGSUSED*/
int
kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;

	bzero((void *)st, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;
	return (0);
}

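/*
 * Close a kqueue descriptor: detach and free every knote registered on
 * this kqueue, then release the kqueue itself.
 */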
/*ARGSUSED*/
int
kqueue_close(struct file *fp, struct proc *p)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;
	struct filedesc *fdp = p->p_fd;
	struct knote **knp, *kn, *kn0;
	int i;

	for (i = 0; i < fdp->fd_knlistsize; i++) {
		knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
		kn = *knp;
		while (kn != NULL) {
			kn0 = SLIST_NEXT(kn, kn_link);
			if (kq == kn->kn_kq) {
				FREF(kn->kn_fp);
				kn->kn_fop->f_detach(kn);
				closef(kn->kn_fp, p);
				knote_free(kn);
				*knp = kn0;
			} else {
				knp = &SLIST_NEXT(kn, kn_link);
			}
			kn = kn0;
		}
	}
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
			knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
			kn = *knp;
			while (kn != NULL) {
				kn0 = SLIST_NEXT(kn, kn_link);
				if (kq == kn->kn_kq) {
					kn->kn_fop->f_detach(kn);
		/* XXX non-fd release of kn->kn_ptr */
					knote_free(kn);
					*knp = kn0;
				} else {
					knp = &SLIST_NEXT(kn, kn_link);
				}
				kn = kn0;
			}
		}
	}
	pool_put(&kqueue_pool, kq);
	fp->f_data = NULL;

	return (0);
}

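/*
 * Wake up any thread sleeping on the kqueue and notify select/poll
 * waiters, or any knotes attached to the kqueue itself.
 */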
void
kqueue_wakeup(struct kqueue *kq)
{

	if (kq->kq_state & KQ_SLEEP) {
		kq->kq_state &= ~KQ_SLEEP;
		wakeup(kq);
	}
	if (kq->kq_state & KQ_SEL) {
		kq->kq_state &= ~KQ_SEL;
		selwakeup(&kq->kq_sel);
	} else
		KNOTE(&kq->kq_sel.si_note, 0);
}

/*
 * walk down a list of knotes, activating them if their event has triggered.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext)
		if (kn->kn_fop->f_event(kn, hint))
			KNOTE_ACTIVATE(kn);
}

/*
 * remove all knotes from a specified klist
 */
void
knote_remove(struct proc *p, struct klist *list)
{
	struct knote *kn;

	while ((kn = SLIST_FIRST(list)) != NULL) {
		kn->kn_fop->f_detach(kn);
		knote_drop(kn, p, p->p_fd);
	}
}

/*
 * remove all knotes referencing a specified fd
 */
void
knote_fdclose(struct proc *p, int fd)
{
	struct filedesc *fdp = p->p_fd;
	struct klist *list = &fdp->fd_knlist[fd];

	knote_remove(p, list);
}

/*
 * handle a process exiting, including the triggering of NOTE_EXIT notes
 * XXX this could be more efficient, doing a single pass down the klist
 */
void
knote_processexit(struct process *pr)
{
	KNOTE(&pr->ps_klist, NOTE_EXIT);

	/* remove other knotes hanging off the process */
	knote_remove(pr->ps_mainproc, &pr->ps_klist);
}

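/*
 * Link a knote into its filedesc: non-fd knotes go into the identifier
 * hash (created on first use), fd knotes go into the per-descriptor
 * knlist, which is grown in KQEXTENT increments as needed.
 */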
void
knote_attach(struct knote *kn, struct filedesc *fdp)
{
	struct klist *list;
	int size;

	if (! kn->kn_fop->f_isfd) {
		if (fdp->fd_knhashmask == 0)
			fdp->fd_knhash = hashinit(KN_HASHSIZE, M_TEMP,
			    M_WAITOK, &fdp->fd_knhashmask);
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
		goto done;
	}

	if (fdp->fd_knlistsize <= kn->kn_id) {
		size = fdp->fd_knlistsize;
		while (size <= kn->kn_id)
			size += KQEXTENT;
		list = malloc(size * sizeof(struct klist *), M_TEMP, M_WAITOK);
		bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
		    fdp->fd_knlistsize * sizeof(struct klist *));
		bzero((caddr_t)list +
		    fdp->fd_knlistsize * sizeof(struct klist *),
		    (size - fdp->fd_knlistsize) * sizeof(struct klist *));
		if (fdp->fd_knlist != NULL)
			free(fdp->fd_knlist, M_TEMP);
		fdp->fd_knlistsize = size;
		fdp->fd_knlist = list;
	}
	list = &fdp->fd_knlist[kn->kn_id];
done:
	SLIST_INSERT_HEAD(list, kn, kn_link);
	kn->kn_status = 0;
}

/*
 * should be called at spl == 0, since we don't want to hold spl
 * while calling closef and free.
 */
void
knote_drop(struct knote *kn, struct proc *p, struct filedesc *fdp)
{
	struct klist *list;

	if (kn->kn_fop->f_isfd)
		list = &fdp->fd_knlist[kn->kn_id];
	else
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	if (kn->kn_fop->f_isfd) {
		FREF(kn->kn_fp);
		closef(kn->kn_fp, p);
	}
	knote_free(kn);
}


void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	int s = splhigh();

	KASSERT((kn->kn_status & KN_QUEUED) == 0);

	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	splx(s);
	kqueue_wakeup(kq);
}

void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	int s = splhigh();

	KASSERT(kn->kn_status & KN_QUEUED);

	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
	splx(s);
}

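/*
 * Mark every knote on the list as detached and flag it EOF/one-shot so
 * it is reported once and then deleted on the next scan.
 */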
void
klist_invalidate(struct klist *list)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext) {
		kn->kn_status |= KN_DETACHED;
		kn->kn_flags |= EV_EOF | EV_ONESHOT;
	}
}