xref: /freebsd/sys/compat/linux/linux_event.c (revision 1d386b48)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2007 Roman Divacky
5  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/callout.h>
32 #include <sys/capsicum.h>
33 #include <sys/errno.h>
34 #include <sys/event.h>
35 #include <sys/eventfd.h>
36 #include <sys/file.h>
37 #include <sys/filedesc.h>
38 #include <sys/filio.h>
39 #include <sys/limits.h>
40 #include <sys/lock.h>
41 #include <sys/mutex.h>
42 #include <sys/poll.h>
43 #include <sys/proc.h>
44 #include <sys/selinfo.h>
45 #include <sys/specialfd.h>
46 #include <sys/sx.h>
47 #include <sys/syscallsubr.h>
48 #include <sys/timespec.h>
49 #include <sys/user.h>
50 
51 #ifdef COMPAT_LINUX32
52 #include <machine/../linux32/linux.h>
53 #include <machine/../linux32/linux32_proto.h>
54 #else
55 #include <machine/../linux/linux.h>
56 #include <machine/../linux/linux_proto.h>
57 #endif
58 
59 #include <compat/linux/linux_emul.h>
60 #include <compat/linux/linux_event.h>
61 #include <compat/linux/linux_file.h>
62 #include <compat/linux/linux_signal.h>
63 #include <compat/linux/linux_time.h>
64 #include <compat/linux/linux_util.h>
65 
typedef uint64_t	epoll_udata_t;

/*
 * Linux ABI epoll_event.  On amd64 Linux declares this structure
 * packed, so the layout must match exactly for copyin()/copyout()
 * of user event arrays to stay ABI-compatible.
 */
struct epoll_event {
	uint32_t	events;		/* LINUX_EPOLL* event mask */
	epoll_udata_t	data;		/* opaque user cookie, returned verbatim */
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;
76 
77 #define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
78 
79 static int	epoll_to_kevent(struct thread *td, int fd,
80 		    struct epoll_event *l_event, struct kevent *kevent,
81 		    int *nkevents);
82 static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
83 static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
84 static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
85 static int	epoll_register_kevent(struct thread *td, struct file *epfp,
86 		    int fd, int filter, unsigned int flags);
87 static int	epoll_fd_registered(struct thread *td, struct file *epfp,
88 		    int fd);
89 static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
90 		    int fd);
91 
/* State threaded through the kevent_copyops copyin callback. */
struct epoll_copyin_args {
	struct kevent	*changelist;	/* next change to hand to the kqueue */
};

/* State threaded through the kevent_copyops copyout callback. */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* user buffer cursor */
	struct proc		*p;		/* calling process */
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout error, if any */
};
102 
/* timerfd */
typedef uint64_t	timerfd_t;	/* expiration count, the unit read(2) returns */

static fo_rdwr_t	timerfd_read;
static fo_ioctl_t	timerfd_ioctl;
static fo_poll_t	timerfd_poll;
static fo_kqfilter_t	timerfd_kqfilter;
static fo_stat_t	timerfd_stat;
static fo_close_t	timerfd_close;
static fo_fill_kinfo_t	timerfd_fill_kinfo;

/* File operations backing a Linux timerfd descriptor. */
static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = timerfd_ioctl,
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_timerfddetach(struct knote *kn);
static int	filt_timerfdread(struct knote *kn, long hint);

/* EVFILT_READ filter: a timerfd is readable when expirations are pending. */
static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};

/* Per-descriptor timerfd state; tfd_lock protects the mutable fields. */
struct timerfd {
	clockid_t	tfd_clockid;	/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* absolute next expiry + interval */
	struct callout	tfd_callout;	/* expiration timer, runs locked */
	timerfd_t	tfd_count;	/* expirations since last read */
	bool		tfd_canceled;	/* timer disarmed; reads fail ECANCELED */
	struct selinfo	tfd_sel;	/* poll/select/kevent notification */
	struct mtx	tfd_lock;	/* protects this structure */
};

static void	linux_timerfd_expire(void *);
static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
151 
/*
 * epoll is emulated on top of kqueue: creating an epoll instance is
 * just creating a kqueue descriptor with the given open flags.
 */
static int
epoll_create_common(struct thread *td, int flags)
{

	return (kern_kqueue(td, flags, NULL));
}
158 
159 #ifdef LINUX_LEGACY_SYSCALLS
160 int
161 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
162 {
163 
164 	/*
165 	 * args->size is unused. Linux just tests it
166 	 * and then forgets it as well.
167 	 */
168 	if (args->size <= 0)
169 		return (EINVAL);
170 
171 	return (epoll_create_common(td, 0));
172 }
173 #endif
174 
175 int
176 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
177 {
178 	int flags;
179 
180 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
181 		return (EINVAL);
182 
183 	flags = 0;
184 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
185 		flags |= O_CLOEXEC;
186 
187 	return (epoll_create_common(td, flags));
188 }
189 
/*
 * Convert a Linux epoll_event into one or two kevents (one per
 * EVFILT_READ/EVFILT_WRITE filter requested).  *nkevents is
 * incremented by the number of kevents produced.  Returns EINVAL
 * and logs once per process if unsupported epoll flags are seen.
 */
static int
epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
    struct kevent *kevent, int *nkevents)
{
	uint32_t levents = l_event->events;
	struct linux_pemuldata *pem;
	struct proc *p;
	unsigned short kev_flags = EV_ADD | EV_ENABLE;

	/* flags related to how event is registered */
	if ((levents & LINUX_EPOLLONESHOT) != 0)
		kev_flags |= EV_DISPATCH;
	if ((levents & LINUX_EPOLLET) != 0)
		kev_flags |= EV_CLEAR;
	if ((levents & LINUX_EPOLLERR) != 0)
		kev_flags |= EV_ERROR;
	if ((levents & LINUX_EPOLLRDHUP) != 0)
		kev_flags |= EV_EOF;

	/* flags related to what event is registered */
	if ((levents & LINUX_EPOLL_EVRD) != 0) {
		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
		/* Carry the user cookie in ext[0]; kevent_to_epoll restores it. */
		kevent->ext[0] = l_event->data;
		++kevent;
		++(*nkevents);
	}
	if ((levents & LINUX_EPOLL_EVWR) != 0) {
		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
		kevent->ext[0] = l_event->data;
		++kevent;
		++(*nkevents);
	}
	/* zero event mask is legal */
	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
		++(*nkevents);
	}

	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
		p = td->td_proc;

		pem = pem_find(p);
		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));

		/* Warn only once per process about unsupported flags. */
		LINUX_PEM_XLOCK(pem);
		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
			pem->flags |= LINUX_XUNSUP_EPOLL;
			LINUX_PEM_XUNLOCK(pem);
			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
			    levents);
		} else
			LINUX_PEM_XUNLOCK(pem);
		return (EINVAL);
	}

	return (0);
}
248 
249 /*
250  * Structure converting function from kevent to epoll. In a case
251  * this is called on error in registration we store the error in
252  * event->data and pick it up later in linux_epoll_ctl().
253  */
254 static void
255 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
256 {
257 
258 	l_event->data = kevent->ext[0];
259 
260 	if ((kevent->flags & EV_ERROR) != 0) {
261 		l_event->events = LINUX_EPOLLERR;
262 		return;
263 	}
264 
265 	/* XXX EPOLLPRI, EPOLLHUP */
266 	switch (kevent->filter) {
267 	case EVFILT_READ:
268 		l_event->events = LINUX_EPOLLIN;
269 		if ((kevent->flags & EV_EOF) != 0)
270 			l_event->events |= LINUX_EPOLLRDHUP;
271 	break;
272 	case EVFILT_WRITE:
273 		l_event->events = LINUX_EPOLLOUT;
274 	break;
275 	}
276 }
277 
/*
 * Copyout callback used by kevent.  This converts kevent
 * events to epoll events and copies them back to the
 * userspace.  This is also called on error on registering
 * of the filter.  The first copyout failure is latched in
 * args->error for the caller to report.
 */
static int
epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
{
	struct epoll_copyout_args *args;
	struct epoll_event *eep;
	int error, i;

	args = (struct epoll_copyout_args*) arg;
	/*
	 * count is bounded by maxevents <= LINUX_MAX_EVENTS, so the
	 * multiplication below cannot overflow.
	 */
	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);

	for (i = 0; i < count; i++)
		kevent_to_epoll(&kevp[i], &eep[i]);

	error = copyout(eep, args->leventlist, count * sizeof(*eep));
	if (error == 0) {
		args->leventlist += count;
		args->count += count;
	} else if (args->error == 0)
		args->error = error;

	free(eep, M_EPOLL);
	return (error);
}
307 
308 /*
309  * Copyin callback used by kevent. This copies already
310  * converted filters from kernel memory to the kevent
311  * internal kernel memory. Hence the memcpy instead of
312  * copyin.
313  */
314 static int
315 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
316 {
317 	struct epoll_copyin_args *args;
318 
319 	args = (struct epoll_copyin_args*) arg;
320 
321 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
322 	args->changelist += count;
323 
324 	return (0);
325 }
326 
/*
 * Load epoll filter, convert it to kevent filter
 * and load it into kevent subsystem.
 *
 * Returns 0 on success, EINVAL for a bad epfd/fd/op, EEXIST when
 * CTL_ADD targets an already-registered fd, or an error from the
 * underlying kqueue machinery.
 */
int
linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
{
	struct file *epfp, *fp;
	struct epoll_copyin_args ciargs;
	struct kevent kev[2];
	struct kevent_copyops k_ops = { &ciargs,
					NULL,
					epoll_kev_copyin};
	struct epoll_event le;
	cap_rights_t rights;
	int nchanges = 0;
	int error;

	/* The event argument is only read for ADD/MOD, never for DEL. */
	if (args->op != LINUX_EPOLL_CTL_DEL) {
		error = copyin(args->event, &le, sizeof(le));
		if (error != 0)
			return (error);
	}

	error = fget(td, args->epfd,
	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
	if (error != 0)
		return (error);
	if (epfp->f_type != DTYPE_KQUEUE) {
		error = EINVAL;
		goto leave1;
	}

	 /* Protect user data vector from incorrectly supplied fd. */
	error = fget(td, args->fd,
		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
	if (error != 0)
		goto leave1;

	/* Linux disallows an epoll instance monitoring itself. */
	if (epfp == fp) {
		error = EINVAL;
		goto leave0;
	}

	ciargs.changelist = kev;

	if (args->op != LINUX_EPOLL_CTL_DEL) {
		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
		if (error != 0)
			goto leave0;
	}

	switch (args->op) {
	case LINUX_EPOLL_CTL_MOD:
		/* MOD is implemented as delete-then-re-add. */
		error = epoll_delete_all_events(td, epfp, args->fd);
		if (error != 0)
			goto leave0;
		break;

	case LINUX_EPOLL_CTL_ADD:
		if (epoll_fd_registered(td, epfp, args->fd)) {
			error = EEXIST;
			goto leave0;
		}
		break;

	case LINUX_EPOLL_CTL_DEL:
		/* CTL_DEL means unregister this fd with this epoll */
		error = epoll_delete_all_events(td, epfp, args->fd);
		goto leave0;

	default:
		error = EINVAL;
		goto leave0;
	}

	/* Push the converted changelist into the kqueue. */
	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);

leave0:
	fdrop(fp, td);

leave1:
	fdrop(epfp, td);
	return (error);
}
413 
414 /*
415  * Wait for a filter to be triggered on the epoll file descriptor.
416  */
417 
418 static int
419 linux_epoll_wait_ts(struct thread *td, int epfd, struct epoll_event *events,
420     int maxevents, struct timespec *tsp, sigset_t *uset)
421 {
422 	struct epoll_copyout_args coargs;
423 	struct kevent_copyops k_ops = { &coargs,
424 					epoll_kev_copyout,
425 					NULL};
426 	cap_rights_t rights;
427 	struct file *epfp;
428 	sigset_t omask;
429 	int error;
430 
431 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
432 		return (EINVAL);
433 
434 	error = fget(td, epfd,
435 	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
436 	if (error != 0)
437 		return (error);
438 	if (epfp->f_type != DTYPE_KQUEUE) {
439 		error = EINVAL;
440 		goto leave;
441 	}
442 	if (uset != NULL) {
443 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
444 		    &omask, 0);
445 		if (error != 0)
446 			goto leave;
447 		td->td_pflags |= TDP_OLDMASK;
448 		/*
449 		 * Make sure that ast() is called on return to
450 		 * usermode and TDP_OLDMASK is cleared, restoring old
451 		 * sigmask.
452 		 */
453 		ast_sched(td, TDA_SIGSUSPEND);
454 	}
455 
456 	coargs.leventlist = events;
457 	coargs.p = td->td_proc;
458 	coargs.count = 0;
459 	coargs.error = 0;
460 
461 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
462 	if (error == 0 && coargs.error != 0)
463 		error = coargs.error;
464 
465 	/*
466 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
467 	 * Maybe we should translate that but I don't think it matters at all.
468 	 */
469 	if (error == 0)
470 		td->td_retval[0] = coargs.count;
471 
472 	if (uset != NULL)
473 		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
474 		    NULL, 0);
475 leave:
476 	fdrop(epfp, td);
477 	return (error);
478 }
479 
480 static int
481 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
482     int maxevents, int timeout, sigset_t *uset)
483 {
484 	struct timespec ts, *tsp;
485 
486 	/*
487 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
488 	 * to block indefinitely. Real implementation does it if any negative
489 	 * timeout value is passed.
490 	 */
491 	if (timeout >= 0) {
492 		/* Convert from milliseconds to timespec. */
493 		ts.tv_sec = timeout / 1000;
494 		ts.tv_nsec = (timeout % 1000) * 1000000;
495 		tsp = &ts;
496 	} else {
497 		tsp = NULL;
498 	}
499 	return (linux_epoll_wait_ts(td, epfd, events, maxevents, tsp, uset));
500 
501 }
502 
503 #ifdef LINUX_LEGACY_SYSCALLS
/* epoll_wait(2): wait without altering the signal mask. */
int
linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
{

	return (linux_epoll_wait_common(td, args->epfd, args->events,
	    args->maxevents, args->timeout, NULL));
}
511 #endif
512 
/*
 * epoll_pwait(2): like epoll_wait(2) but atomically installs the
 * supplied signal mask for the duration of the wait.  pmask comes
 * back NULL when the user passed a NULL mask pointer.
 */
int
linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
{
	sigset_t mask, *pmask;
	int error;

	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
	    &mask, &pmask);
	if (error != 0)
		return (error);

	return (linux_epoll_wait_common(td, args->epfd, args->events,
	    args->maxevents, args->timeout, pmask));
}
527 
528 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * epoll_pwait2(2) for 32-bit ABIs with 64-bit time: takes the timeout
 * as a 64-bit timespec instead of milliseconds.  A NULL timeout blocks
 * indefinitely.
 */
int
linux_epoll_pwait2_64(struct thread *td, struct linux_epoll_pwait2_64_args *args)
{
	struct timespec ts, *tsa;
	sigset_t mask, *pmask;
	int error;

	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
	    &mask, &pmask);
	if (error != 0)
		return (error);

	if (args->timeout) {
		error = linux_get_timespec64(&ts, args->timeout);
		if (error != 0)
			return (error);
		tsa = &ts;
	} else
		tsa = NULL;

	return (linux_epoll_wait_ts(td, args->epfd, args->events,
	    args->maxevents, tsa, pmask));
}
552 #else
/*
 * epoll_pwait2(2): timeout is a native-width timespec rather than
 * milliseconds.  A NULL timeout blocks indefinitely.
 */
int
linux_epoll_pwait2(struct thread *td, struct linux_epoll_pwait2_args *args)
{
	struct timespec ts, *tsa;
	sigset_t mask, *pmask;
	int error;

	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
	    &mask, &pmask);
	if (error != 0)
		return (error);

	if (args->timeout) {
		error = linux_get_timespec(&ts, args->timeout);
		if (error != 0)
			return (error);
		tsa = &ts;
	} else
		tsa = NULL;

	return (linux_epoll_wait_ts(td, args->epfd, args->events,
	    args->maxevents, tsa, pmask));
}
576 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
577 
578 static int
579 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
580     unsigned int flags)
581 {
582 	struct epoll_copyin_args ciargs;
583 	struct kevent kev;
584 	struct kevent_copyops k_ops = { &ciargs,
585 					NULL,
586 					epoll_kev_copyin};
587 
588 	ciargs.changelist = &kev;
589 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
590 
591 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
592 }
593 
/* Return non-zero if fd already has READ or WRITE events on this epoll. */
static int
epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
{
	/*
	 * Set empty filter flags to avoid accidental modification of already
	 * registered events.  In the case of event re-registration:
	 * 1. If the event does not exist, kevent() does nothing and returns
	 *    ENOENT.
	 * 2. If the event does exist, its enabled/disabled state is preserved
	 *    but fflags, data and udata fields are overwritten.  So we cannot
	 *    set socket lowats or store the user's context pointer in udata.
	 */
	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
		return (1);

	return (0);
}
611 
612 static int
613 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
614 {
615 	int error1, error2;
616 
617 	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
618 	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
619 
620 	/* return 0 if at least one result positive */
621 	return (error1 == 0 ? 0 : error2);
622 }
623 
624 #ifdef LINUX_LEGACY_SYSCALLS
625 int
626 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
627 {
628 	struct specialfd_eventfd ae;
629 
630 	bzero(&ae, sizeof(ae));
631 	ae.initval = args->initval;
632 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
633 }
634 #endif
635 
636 int
637 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
638 {
639 	struct specialfd_eventfd ae;
640 	int flags;
641 
642 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
643 	    LINUX_EFD_SEMAPHORE)) != 0)
644 		return (EINVAL);
645 	flags = 0;
646 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
647 		flags |= EFD_CLOEXEC;
648 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
649 		flags |= EFD_NONBLOCK;
650 	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
651 		flags |= EFD_SEMAPHORE;
652 
653 	bzero(&ae, sizeof(ae));
654 	ae.flags = flags;
655 	ae.initval = args->initval;
656 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
657 }
658 
/*
 * timerfd_create(2): create a timer that delivers expirations through
 * a file descriptor.  Only CLOCK_REALTIME and CLOCK_MONOTONIC are
 * supported.  The new fd number is returned via td_retval[0].
 */
int
linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
{
	struct timerfd *tfd;
	struct file *fp;
	clockid_t clockid;
	int fflags, fd, error;

	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
		return (EINVAL);

	error = linux_to_native_clockid(&clockid, args->clockid);
	if (error != 0)
		return (error);
	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
		return (EINVAL);

	/* fflags is first used for the fd-table flags (close-on-exec)... */
	fflags = 0;
	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
		fflags |= O_CLOEXEC;

	error = falloc(td, &fp, &fd, fflags);
	if (error != 0)
		return (error);

	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
	tfd->tfd_clockid = clockid;
	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);

	/* Callout and knote list both run under tfd_lock. */
	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);

	/* ...then reused for the file's own open flags. */
	fflags = FREAD;
	if ((args->flags & LINUX_O_NONBLOCK) != 0)
		fflags |= FNONBLOCK;

	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
	/* Drop falloc's extra reference; the fd table keeps its own. */
	fdrop(fp, td);

	td->td_retval[0] = fd;
	return (error);
}
701 
/*
 * Release a timerfd: stop the callout, tear down select/kqueue state
 * and free the per-descriptor structure.
 */
static int
timerfd_close(struct file *fp, struct thread *td)
{
	struct timerfd *tfd;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	/* Disarm before draining so the callout does not rearm itself. */
	timespecclear(&tfd->tfd_time.it_value);
	timespecclear(&tfd->tfd_time.it_interval);

	callout_drain(&tfd->tfd_callout);

	seldrain(&tfd->tfd_sel);
	knlist_destroy(&tfd->tfd_sel.si_note);

	/* Poison the file ops before freeing the backing state. */
	fp->f_ops = &badfileops;
	mtx_destroy(&tfd->tfd_lock);
	free(tfd, M_EPOLL);

	return (0);
}
725 
/*
 * read(2) on a timerfd: return the number of expirations since the
 * last read as a host-endian uint64_t and reset the counter.  Blocks
 * until an expiration unless FNONBLOCK is set; returns ECANCELED if
 * the timer was disarmed while a reader waited.
 */
static int
timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct timerfd *tfd;
	timerfd_t count;
	int error;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	/* The buffer must hold a full 8-byte counter. */
	if (uio->uio_resid < sizeof(timerfd_t))
		return (EINVAL);

	error = 0;
	mtx_lock(&tfd->tfd_lock);
retry:
	if (tfd->tfd_canceled) {
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		return (ECANCELED);
	}
	if (tfd->tfd_count == 0) {
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&tfd->tfd_lock);
			return (EAGAIN);
		}
		/* Sleep until linux_timerfd_expire() wakes tfd_count. */
		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		/* Snapshot and reset under the lock, copy out unlocked. */
		count = tfd->tfd_count;
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		error = uiomove(&count, sizeof(timerfd_t), uio);
	} else
		mtx_unlock(&tfd->tfd_lock);

	return (error);
}
768 
/*
 * poll(2)/select(2) on a timerfd: readable when expirations are
 * pending; otherwise register for wakeup via selrecord().
 */
static int
timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct timerfd *tfd;
	int revents = 0;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (POLLERR);

	mtx_lock(&tfd->tfd_lock);
	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
		revents |= events & (POLLIN|POLLRDNORM);
	if (revents == 0)
		selrecord(td, &tfd->tfd_sel);
	mtx_unlock(&tfd->tfd_lock);

	return (revents);
}
789 
790 static int
791 timerfd_kqfilter(struct file *fp, struct knote *kn)
792 {
793 	struct timerfd *tfd;
794 
795 	tfd = fp->f_data;
796 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
797 		return (EINVAL);
798 
799 	if (kn->kn_filter == EVFILT_READ)
800 		kn->kn_fop = &timerfd_rfiltops;
801 	else
802 		return (EINVAL);
803 
804 	kn->kn_hook = tfd;
805 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
806 
807 	return (0);
808 }
809 
/* Detach a knote from the timerfd's note list, under tfd_lock. */
static void
filt_timerfddetach(struct knote *kn)
{
	struct timerfd *tfd = kn->kn_hook;

	mtx_lock(&tfd->tfd_lock);
	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
	mtx_unlock(&tfd->tfd_lock);
}
819 
/*
 * EVFILT_READ event test: the timerfd is readable when expirations
 * are pending.  The knlist is backed by tfd_lock (knlist_init_mtx),
 * so tfd_count is read under that lock here.
 */
static int
filt_timerfdread(struct knote *kn, long hint)
{
	struct timerfd *tfd = kn->kn_hook;

	return (tfd->tfd_count > 0);
}
827 
828 static int
829 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
830     struct ucred *active_cred, struct thread *td)
831 {
832 
833 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
834 		return (EINVAL);
835 
836 	switch (cmd) {
837 	case FIONBIO:
838 	case FIOASYNC:
839 		return (0);
840 	}
841 
842 	return (ENOTTY);
843 }
844 
/* fstat(2) is not supported on timerfd descriptors. */
static int
timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
{

	return (ENXIO);
}
851 
/* Report the descriptor as UNKNOWN in kern.proc.filedesc listings. */
static int
timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	kif->kf_type = KF_TYPE_UNKNOWN;
	return (0);
}
859 
860 static void
861 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
862 {
863 
864 	if (tfd->tfd_clockid == CLOCK_REALTIME)
865 		getnanotime(ts);
866 	else	/* CLOCK_MONOTONIC */
867 		getnanouptime(ts);
868 }
869 
/*
 * Compute the timer's current value (time remaining until the next
 * expiration) for timerfd_gettime().  Called with tfd_lock held.
 */
static void
linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
{
	struct timespec cts;

	linux_timerfd_clocktime(tfd, &cts);
	*ots = tfd->tfd_time;
	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
		/* Convert the stored absolute expiry to a remaining delta. */
		timespecsub(&ots->it_value, &cts, &ots->it_value);
		/*
		 * An armed timer whose expiry already passed is reported
		 * as 1ns remaining rather than as disarmed (0/0).
		 */
		if (ots->it_value.tv_sec < 0 ||
		    (ots->it_value.tv_sec == 0 &&
		     ots->it_value.tv_nsec == 0)) {
			ots->it_value.tv_sec  = 0;
			ots->it_value.tv_nsec = 1;
		}
	}
}
887 
/*
 * Shared body of timerfd_gettime(2) and its time64 variant: look up
 * the descriptor, validate it and read the current timer value.
 */
static int
linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec *ots)
{
	struct timerfd *tfd;
	struct file *fp;
	int error;

	error = fget(td, fd, &cap_read_rights, &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	linux_timerfd_curval(tfd, ots);
	mtx_unlock(&tfd->tfd_lock);

out:
	fdrop(fp, td);
	return (error);
}
912 
/* timerfd_gettime(2): copy the current timer value out as l_itimerspec. */
int
linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
{
	struct l_itimerspec lots;
	struct itimerspec ots;
	int error;

	error = linux_timerfd_gettime_common(td, args->fd, &ots);
	if (error != 0)
		return (error);
	error = native_to_linux_itimerspec(&lots, &ots);
	if (error == 0)
		error = copyout(&lots, args->old_value, sizeof(lots));
	return (error);
}
928 
929 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/* timerfd_gettime64(2): 64-bit-time variant for 32-bit ABIs. */
int
linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
{
	struct l_itimerspec64 lots;
	struct itimerspec ots;
	int error;

	error = linux_timerfd_gettime_common(td, args->fd, &ots);
	if (error != 0)
		return (error);
	error = native_to_linux_itimerspec64(&lots, &ots);
	if (error == 0)
		error = copyout(&lots, args->old_value, sizeof(lots));
	return (error);
}
945 #endif
946 
/*
 * Shared body of timerfd_settime(2) and its time64 variant.  Arms the
 * timer with *nts (zero it_value disarms it) and, when oval != NULL,
 * returns the previous timer value there.  All timer state changes
 * happen under tfd_lock, which also serializes against the callout.
 */
static int
linux_timerfd_settime_common(struct thread *td, int fd, int flags,
    struct itimerspec *nts, struct itimerspec *oval)
{
	struct timespec cts, ts;
	struct timerfd *tfd;
	struct timeval tv;
	struct file *fp;
	int error;

	if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
		return (EINVAL);

	error = fget(td, fd, &cap_write_rights, &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	/* A disarmed timer has no meaningful interval either. */
	if (!timespecisset(&nts->it_value))
		timespecclear(&nts->it_interval);
	if (oval != NULL)
		linux_timerfd_curval(tfd, oval);

	bcopy(nts, &tfd->tfd_time, sizeof(*nts));
	tfd->tfd_count = 0;
	if (timespecisset(&nts->it_value)) {
		linux_timerfd_clocktime(tfd, &cts);
		ts = nts->it_value;
		if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
			/*
			 * Relative arm: store the expiry as an absolute
			 * time; ts already holds the relative delta for
			 * the callout below.
			 */
			timespecadd(&tfd->tfd_time.it_value, &cts,
				&tfd->tfd_time.it_value);
		} else {
			/*
			 * Absolute arm: derive the relative delta for
			 * the callout.  NOTE(review): if the absolute
			 * time is already in the past this delta goes
			 * negative — presumably tvtohz() clamps it to
			 * the minimum tick; confirm.
			 */
			timespecsub(&ts, &cts, &ts);
		}
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
			linux_timerfd_expire, tfd);
		tfd->tfd_canceled = false;
	} else {
		/* Disarm: mark canceled so blocked readers get ECANCELED. */
		tfd->tfd_canceled = true;
		callout_stop(&tfd->tfd_callout);
	}
	mtx_unlock(&tfd->tfd_lock);

out:
	fdrop(fp, td);
	return (error);
}
1000 
/*
 * timerfd_settime(2): convert the Linux itimerspec, arm the timer and
 * optionally copy the previous value back to old_value.
 */
int
linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
{
	struct l_itimerspec lots;
	struct itimerspec nts, ots, *pots;
	int error;

	error = copyin(args->new_value, &lots, sizeof(lots));
	if (error != 0)
		return (error);
	error = linux_to_native_itimerspec(&nts, &lots);
	if (error != 0)
		return (error);
	pots = (args->old_value != NULL ? &ots : NULL);
	error = linux_timerfd_settime_common(td, args->fd, args->flags,
	    &nts, pots);
	if (error == 0 && args->old_value != NULL) {
		error = native_to_linux_itimerspec(&lots, &ots);
		if (error == 0)
			error = copyout(&lots, args->old_value, sizeof(lots));
	}
	return (error);
}
1024 
1025 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/* timerfd_settime64(2): 64-bit-time variant for 32-bit ABIs. */
int
linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
{
	struct l_itimerspec64 lots;
	struct itimerspec nts, ots, *pots;
	int error;

	error = copyin(args->new_value, &lots, sizeof(lots));
	if (error != 0)
		return (error);
	error = linux_to_native_itimerspec64(&nts, &lots);
	if (error != 0)
		return (error);
	pots = (args->old_value != NULL ? &ots : NULL);
	error = linux_timerfd_settime_common(td, args->fd, args->flags,
	    &nts, pots);
	if (error == 0 && args->old_value != NULL) {
		error = native_to_linux_itimerspec64(&lots, &ots);
		if (error == 0)
			error = copyout(&lots, args->old_value, sizeof(lots));
	}
	return (error);
}
1049 #endif
1050 
/*
 * Callout handler for timerfd expiration.  Runs with tfd_lock held
 * (the callout was initialized with callout_init_mtx()).  Counts the
 * expiration, rearms periodic timers and wakes readers, pollers and
 * kevent waiters.
 */
static void
linux_timerfd_expire(void *arg)
{
	struct timespec cts, ts;
	struct timeval tv;
	struct timerfd *tfd;

	tfd = (struct timerfd *)arg;

	linux_timerfd_clocktime(tfd, &cts);
	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
		/* The expiry time has been reached. */
		if (timespecisset(&tfd->tfd_time.it_interval))
			/* Periodic timer: advance to the next expiry. */
			timespecadd(&tfd->tfd_time.it_value,
				    &tfd->tfd_time.it_interval,
				    &tfd->tfd_time.it_value);
		else
			/* single shot timer */
			timespecclear(&tfd->tfd_time.it_value);
		if (timespecisset(&tfd->tfd_time.it_value)) {
			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
			TIMESPEC_TO_TIMEVAL(&tv, &ts);
			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
				linux_timerfd_expire, tfd);
		}
		tfd->tfd_count++;
		/* Notify kevent, poll/select and blocked readers. */
		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
		selwakeup(&tfd->tfd_sel);
		wakeup(&tfd->tfd_count);
	} else if (timespecisset(&tfd->tfd_time.it_value)) {
		/*
		 * Fired before the expiry time (e.g. timer granularity);
		 * reschedule for the remaining interval.
		 */
		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
		    linux_timerfd_expire, tfd);
	}
}
1086