xref: /freebsd/sys/compat/linux/linux_event.c (revision 2a58b312)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007 Roman Divacky
5  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/callout.h>
34 #include <sys/capsicum.h>
35 #include <sys/errno.h>
36 #include <sys/event.h>
37 #include <sys/eventfd.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/filio.h>
41 #include <sys/limits.h>
42 #include <sys/lock.h>
43 #include <sys/mutex.h>
44 #include <sys/poll.h>
45 #include <sys/proc.h>
46 #include <sys/selinfo.h>
47 #include <sys/specialfd.h>
48 #include <sys/sx.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/timespec.h>
51 #include <sys/user.h>
52 
53 #ifdef COMPAT_LINUX32
54 #include <machine/../linux32/linux.h>
55 #include <machine/../linux32/linux32_proto.h>
56 #else
57 #include <machine/../linux/linux.h>
58 #include <machine/../linux/linux_proto.h>
59 #endif
60 
61 #include <compat/linux/linux_emul.h>
62 #include <compat/linux/linux_event.h>
63 #include <compat/linux/linux_file.h>
64 #include <compat/linux/linux_signal.h>
65 #include <compat/linux/linux_time.h>
66 #include <compat/linux/linux_util.h>
67 
/* Opaque per-registration value; this file stores and returns it untouched. */
typedef uint64_t	epoll_udata_t;

/*
 * Event record copied in from and out to the Linux process.
 * The structure is packed on amd64 only (presumably to match the
 * layout the Linux ABI uses there -- confirm against the Linux headers).
 */
struct epoll_event {
	uint32_t	events;		/* LINUX_EPOLL* bit mask */
	epoll_udata_t	data;		/* caller-supplied cookie */
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/* Largest maxevents accepted: keeps the event array size within INT_MAX. */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
80 
/*
 * Forward declarations of the file-local epoll helpers: conversion between
 * epoll_event and kevent, the kevent copy callbacks, and the helpers used
 * to probe, register and delete per-descriptor filters.
 */
static int	epoll_to_kevent(struct thread *td, int fd,
		    struct epoll_event *l_event, struct kevent *kevent,
		    int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent,
		    struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_register_kevent(struct thread *td, struct file *epfp,
		    int fd, int filter, unsigned int flags);
static int	epoll_fd_registered(struct thread *td, struct file *epfp,
		    int fd);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);
93 
/*
 * Cursor handed to epoll_kev_copyin(): points at the next not yet consumed
 * kevent in a kernel-resident change list.
 */
struct epoll_copyin_args {
	struct kevent	*changelist;	/* next change to hand to kevent */
};
97 
/*
 * State shared between linux_epoll_wait_ts() and epoll_kev_copyout()
 * while triggered events are delivered to the caller.
 */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* next user slot to fill */
	struct proc		*p;		/* set by the waiter */
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout failure, or 0 */
};
104 
105 /* timerfd */
106 typedef uint64_t	timerfd_t;
107 
108 static fo_rdwr_t	timerfd_read;
109 static fo_ioctl_t	timerfd_ioctl;
110 static fo_poll_t	timerfd_poll;
111 static fo_kqfilter_t	timerfd_kqfilter;
112 static fo_stat_t	timerfd_stat;
113 static fo_close_t	timerfd_close;
114 static fo_fill_kinfo_t	timerfd_fill_kinfo;
115 
116 static struct fileops timerfdops = {
117 	.fo_read = timerfd_read,
118 	.fo_write = invfo_rdwr,
119 	.fo_truncate = invfo_truncate,
120 	.fo_ioctl = timerfd_ioctl,
121 	.fo_poll = timerfd_poll,
122 	.fo_kqfilter = timerfd_kqfilter,
123 	.fo_stat = timerfd_stat,
124 	.fo_close = timerfd_close,
125 	.fo_chmod = invfo_chmod,
126 	.fo_chown = invfo_chown,
127 	.fo_sendfile = invfo_sendfile,
128 	.fo_fill_kinfo = timerfd_fill_kinfo,
129 	.fo_flags = DFLAG_PASSABLE
130 };
131 
132 static void	filt_timerfddetach(struct knote *kn);
133 static int	filt_timerfdread(struct knote *kn, long hint);
134 
135 static struct filterops timerfd_rfiltops = {
136 	.f_isfd = 1,
137 	.f_detach = filt_timerfddetach,
138 	.f_event = filt_timerfdread
139 };
140 
141 struct timerfd {
142 	clockid_t	tfd_clockid;
143 	struct itimerspec tfd_time;
144 	struct callout	tfd_callout;
145 	timerfd_t	tfd_count;
146 	bool		tfd_canceled;
147 	struct selinfo	tfd_sel;
148 	struct mtx	tfd_lock;
149 };
150 
/* Callout handler and "time remaining" helper, defined at the end. */
static void	linux_timerfd_expire(void *arg);
static void	linux_timerfd_curval(struct timerfd *tfd,
		    struct itimerspec *ots);
153 
154 static int
155 epoll_create_common(struct thread *td, int flags)
156 {
157 
158 	return (kern_kqueue(td, flags, NULL));
159 }
160 
161 #ifdef LINUX_LEGACY_SYSCALLS
162 int
163 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
164 {
165 
166 	/*
167 	 * args->size is unused. Linux just tests it
168 	 * and then forgets it as well.
169 	 */
170 	if (args->size <= 0)
171 		return (EINVAL);
172 
173 	return (epoll_create_common(td, 0));
174 }
175 #endif
176 
177 int
178 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
179 {
180 	int flags;
181 
182 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
183 		return (EINVAL);
184 
185 	flags = 0;
186 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
187 		flags |= O_CLOEXEC;
188 
189 	return (epoll_create_common(td, flags));
190 }
191 
192 /* Structure converting function from epoll to kevent. */
193 static int
194 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
195     struct kevent *kevent, int *nkevents)
196 {
197 	uint32_t levents = l_event->events;
198 	struct linux_pemuldata *pem;
199 	struct proc *p;
200 	unsigned short kev_flags = EV_ADD | EV_ENABLE;
201 
202 	/* flags related to how event is registered */
203 	if ((levents & LINUX_EPOLLONESHOT) != 0)
204 		kev_flags |= EV_DISPATCH;
205 	if ((levents & LINUX_EPOLLET) != 0)
206 		kev_flags |= EV_CLEAR;
207 	if ((levents & LINUX_EPOLLERR) != 0)
208 		kev_flags |= EV_ERROR;
209 	if ((levents & LINUX_EPOLLRDHUP) != 0)
210 		kev_flags |= EV_EOF;
211 
212 	/* flags related to what event is registered */
213 	if ((levents & LINUX_EPOLL_EVRD) != 0) {
214 		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
215 		kevent->ext[0] = l_event->data;
216 		++kevent;
217 		++(*nkevents);
218 	}
219 	if ((levents & LINUX_EPOLL_EVWR) != 0) {
220 		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
221 		kevent->ext[0] = l_event->data;
222 		++kevent;
223 		++(*nkevents);
224 	}
225 	/* zero event mask is legal */
226 	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
227 		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
228 		++(*nkevents);
229 	}
230 
231 	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
232 		p = td->td_proc;
233 
234 		pem = pem_find(p);
235 		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
236 
237 		LINUX_PEM_XLOCK(pem);
238 		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
239 			pem->flags |= LINUX_XUNSUP_EPOLL;
240 			LINUX_PEM_XUNLOCK(pem);
241 			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
242 			    levents);
243 		} else
244 			LINUX_PEM_XUNLOCK(pem);
245 		return (EINVAL);
246 	}
247 
248 	return (0);
249 }
250 
251 /*
252  * Structure converting function from kevent to epoll. In a case
253  * this is called on error in registration we store the error in
254  * event->data and pick it up later in linux_epoll_ctl().
255  */
256 static void
257 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
258 {
259 
260 	l_event->data = kevent->ext[0];
261 
262 	if ((kevent->flags & EV_ERROR) != 0) {
263 		l_event->events = LINUX_EPOLLERR;
264 		return;
265 	}
266 
267 	/* XXX EPOLLPRI, EPOLLHUP */
268 	switch (kevent->filter) {
269 	case EVFILT_READ:
270 		l_event->events = LINUX_EPOLLIN;
271 		if ((kevent->flags & EV_EOF) != 0)
272 			l_event->events |= LINUX_EPOLLRDHUP;
273 	break;
274 	case EVFILT_WRITE:
275 		l_event->events = LINUX_EPOLLOUT;
276 	break;
277 	}
278 }
279 
280 /*
281  * Copyout callback used by kevent. This converts kevent
282  * events to epoll events and copies them back to the
283  * userspace. This is also called on error on registering
284  * of the filter.
285  */
286 static int
287 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
288 {
289 	struct epoll_copyout_args *args;
290 	struct epoll_event *eep;
291 	int error, i;
292 
293 	args = (struct epoll_copyout_args*) arg;
294 	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
295 
296 	for (i = 0; i < count; i++)
297 		kevent_to_epoll(&kevp[i], &eep[i]);
298 
299 	error = copyout(eep, args->leventlist, count * sizeof(*eep));
300 	if (error == 0) {
301 		args->leventlist += count;
302 		args->count += count;
303 	} else if (args->error == 0)
304 		args->error = error;
305 
306 	free(eep, M_EPOLL);
307 	return (error);
308 }
309 
310 /*
311  * Copyin callback used by kevent. This copies already
312  * converted filters from kernel memory to the kevent
313  * internal kernel memory. Hence the memcpy instead of
314  * copyin.
315  */
316 static int
317 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
318 {
319 	struct epoll_copyin_args *args;
320 
321 	args = (struct epoll_copyin_args*) arg;
322 
323 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
324 	args->changelist += count;
325 
326 	return (0);
327 }
328 
329 /*
330  * Load epoll filter, convert it to kevent filter
331  * and load it into kevent subsystem.
332  */
333 int
334 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
335 {
336 	struct file *epfp, *fp;
337 	struct epoll_copyin_args ciargs;
338 	struct kevent kev[2];
339 	struct kevent_copyops k_ops = { &ciargs,
340 					NULL,
341 					epoll_kev_copyin};
342 	struct epoll_event le;
343 	cap_rights_t rights;
344 	int nchanges = 0;
345 	int error;
346 
347 	if (args->op != LINUX_EPOLL_CTL_DEL) {
348 		error = copyin(args->event, &le, sizeof(le));
349 		if (error != 0)
350 			return (error);
351 	}
352 
353 	error = fget(td, args->epfd,
354 	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
355 	if (error != 0)
356 		return (error);
357 	if (epfp->f_type != DTYPE_KQUEUE) {
358 		error = EINVAL;
359 		goto leave1;
360 	}
361 
362 	 /* Protect user data vector from incorrectly supplied fd. */
363 	error = fget(td, args->fd,
364 		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
365 	if (error != 0)
366 		goto leave1;
367 
368 	/* Linux disallows spying on himself */
369 	if (epfp == fp) {
370 		error = EINVAL;
371 		goto leave0;
372 	}
373 
374 	ciargs.changelist = kev;
375 
376 	if (args->op != LINUX_EPOLL_CTL_DEL) {
377 		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
378 		if (error != 0)
379 			goto leave0;
380 	}
381 
382 	switch (args->op) {
383 	case LINUX_EPOLL_CTL_MOD:
384 		error = epoll_delete_all_events(td, epfp, args->fd);
385 		if (error != 0)
386 			goto leave0;
387 		break;
388 
389 	case LINUX_EPOLL_CTL_ADD:
390 		if (epoll_fd_registered(td, epfp, args->fd)) {
391 			error = EEXIST;
392 			goto leave0;
393 		}
394 		break;
395 
396 	case LINUX_EPOLL_CTL_DEL:
397 		/* CTL_DEL means unregister this fd with this epoll */
398 		error = epoll_delete_all_events(td, epfp, args->fd);
399 		goto leave0;
400 
401 	default:
402 		error = EINVAL;
403 		goto leave0;
404 	}
405 
406 	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
407 
408 leave0:
409 	fdrop(fp, td);
410 
411 leave1:
412 	fdrop(epfp, td);
413 	return (error);
414 }
415 
416 /*
417  * Wait for a filter to be triggered on the epoll file descriptor.
418  */
419 
420 static int
421 linux_epoll_wait_ts(struct thread *td, int epfd, struct epoll_event *events,
422     int maxevents, struct timespec *tsp, sigset_t *uset)
423 {
424 	struct epoll_copyout_args coargs;
425 	struct kevent_copyops k_ops = { &coargs,
426 					epoll_kev_copyout,
427 					NULL};
428 	cap_rights_t rights;
429 	struct file *epfp;
430 	sigset_t omask;
431 	int error;
432 
433 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
434 		return (EINVAL);
435 
436 	error = fget(td, epfd,
437 	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
438 	if (error != 0)
439 		return (error);
440 	if (epfp->f_type != DTYPE_KQUEUE) {
441 		error = EINVAL;
442 		goto leave;
443 	}
444 	if (uset != NULL) {
445 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
446 		    &omask, 0);
447 		if (error != 0)
448 			goto leave;
449 		td->td_pflags |= TDP_OLDMASK;
450 		/*
451 		 * Make sure that ast() is called on return to
452 		 * usermode and TDP_OLDMASK is cleared, restoring old
453 		 * sigmask.
454 		 */
455 		ast_sched(td, TDA_SIGSUSPEND);
456 	}
457 
458 	coargs.leventlist = events;
459 	coargs.p = td->td_proc;
460 	coargs.count = 0;
461 	coargs.error = 0;
462 
463 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
464 	if (error == 0 && coargs.error != 0)
465 		error = coargs.error;
466 
467 	/*
468 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
469 	 * Maybe we should translate that but I don't think it matters at all.
470 	 */
471 	if (error == 0)
472 		td->td_retval[0] = coargs.count;
473 
474 	if (uset != NULL)
475 		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
476 		    NULL, 0);
477 leave:
478 	fdrop(epfp, td);
479 	return (error);
480 }
481 
482 static int
483 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
484     int maxevents, int timeout, sigset_t *uset)
485 {
486 	struct timespec ts, *tsp;
487 
488 	/*
489 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
490 	 * to block indefinitely. Real implementation does it if any negative
491 	 * timeout value is passed.
492 	 */
493 	if (timeout >= 0) {
494 		/* Convert from milliseconds to timespec. */
495 		ts.tv_sec = timeout / 1000;
496 		ts.tv_nsec = (timeout % 1000) * 1000000;
497 		tsp = &ts;
498 	} else {
499 		tsp = NULL;
500 	}
501 	return (linux_epoll_wait_ts(td, epfd, events, maxevents, tsp, uset));
502 
503 }
504 
505 #ifdef LINUX_LEGACY_SYSCALLS
506 int
507 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
508 {
509 
510 	return (linux_epoll_wait_common(td, args->epfd, args->events,
511 	    args->maxevents, args->timeout, NULL));
512 }
513 #endif
514 
515 int
516 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
517 {
518 	sigset_t mask, *pmask;
519 	int error;
520 
521 	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
522 	    &mask, &pmask);
523 	if (error != 0)
524 		return (error);
525 
526 	return (linux_epoll_wait_common(td, args->epfd, args->events,
527 	    args->maxevents, args->timeout, pmask));
528 }
529 
530 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
531 int
532 linux_epoll_pwait2_64(struct thread *td, struct linux_epoll_pwait2_64_args *args)
533 {
534 	struct timespec ts, *tsa;
535 	sigset_t mask, *pmask;
536 	int error;
537 
538 	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
539 	    &mask, &pmask);
540 	if (error != 0)
541 		return (error);
542 
543 	if (args->timeout) {
544 		error = linux_get_timespec64(&ts, args->timeout);
545 		if (error != 0)
546 			return (error);
547 		tsa = &ts;
548 	} else
549 		tsa = NULL;
550 
551 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
552 	    args->maxevents, tsa, pmask));
553 }
554 #else
555 int
556 linux_epoll_pwait2(struct thread *td, struct linux_epoll_pwait2_args *args)
557 {
558 	struct timespec ts, *tsa;
559 	sigset_t mask, *pmask;
560 	int error;
561 
562 	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
563 	    &mask, &pmask);
564 	if (error != 0)
565 		return (error);
566 
567 	if (args->timeout) {
568 		error = linux_get_timespec(&ts, args->timeout);
569 		if (error != 0)
570 			return (error);
571 		tsa = &ts;
572 	} else
573 		tsa = NULL;
574 
575 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
576 	    args->maxevents, tsa, pmask));
577 }
578 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
579 
580 static int
581 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
582     unsigned int flags)
583 {
584 	struct epoll_copyin_args ciargs;
585 	struct kevent kev;
586 	struct kevent_copyops k_ops = { &ciargs,
587 					NULL,
588 					epoll_kev_copyin};
589 
590 	ciargs.changelist = &kev;
591 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
592 
593 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
594 }
595 
596 static int
597 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
598 {
599 	/*
600 	 * Set empty filter flags to avoid accidental modification of already
601 	 * registered events. In the case of event re-registration:
602 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
603 	 * 2. If event does exists, it's enabled/disabled state is preserved
604 	 *    but fflags, data and udata fields are overwritten. So we can not
605 	 *    set socket lowats and store user's context pointer in udata.
606 	 */
607 	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
608 	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
609 		return (1);
610 
611 	return (0);
612 }
613 
614 static int
615 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
616 {
617 	int error1, error2;
618 
619 	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
620 	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
621 
622 	/* return 0 if at least one result positive */
623 	return (error1 == 0 ? 0 : error2);
624 }
625 
626 #ifdef LINUX_LEGACY_SYSCALLS
627 int
628 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
629 {
630 	struct specialfd_eventfd ae;
631 
632 	bzero(&ae, sizeof(ae));
633 	ae.initval = args->initval;
634 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
635 }
636 #endif
637 
638 int
639 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
640 {
641 	struct specialfd_eventfd ae;
642 	int flags;
643 
644 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
645 	    LINUX_EFD_SEMAPHORE)) != 0)
646 		return (EINVAL);
647 	flags = 0;
648 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
649 		flags |= EFD_CLOEXEC;
650 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
651 		flags |= EFD_NONBLOCK;
652 	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
653 		flags |= EFD_SEMAPHORE;
654 
655 	bzero(&ae, sizeof(ae));
656 	ae.flags = flags;
657 	ae.initval = args->initval;
658 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
659 }
660 
661 int
662 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
663 {
664 	struct timerfd *tfd;
665 	struct file *fp;
666 	clockid_t clockid;
667 	int fflags, fd, error;
668 
669 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
670 		return (EINVAL);
671 
672 	error = linux_to_native_clockid(&clockid, args->clockid);
673 	if (error != 0)
674 		return (error);
675 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
676 		return (EINVAL);
677 
678 	fflags = 0;
679 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
680 		fflags |= O_CLOEXEC;
681 
682 	error = falloc(td, &fp, &fd, fflags);
683 	if (error != 0)
684 		return (error);
685 
686 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
687 	tfd->tfd_clockid = clockid;
688 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
689 
690 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
691 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
692 
693 	fflags = FREAD;
694 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
695 		fflags |= FNONBLOCK;
696 
697 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
698 	fdrop(fp, td);
699 
700 	td->td_retval[0] = fd;
701 	return (error);
702 }
703 
704 static int
705 timerfd_close(struct file *fp, struct thread *td)
706 {
707 	struct timerfd *tfd;
708 
709 	tfd = fp->f_data;
710 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
711 		return (EINVAL);
712 
713 	timespecclear(&tfd->tfd_time.it_value);
714 	timespecclear(&tfd->tfd_time.it_interval);
715 
716 	callout_drain(&tfd->tfd_callout);
717 
718 	seldrain(&tfd->tfd_sel);
719 	knlist_destroy(&tfd->tfd_sel.si_note);
720 
721 	fp->f_ops = &badfileops;
722 	mtx_destroy(&tfd->tfd_lock);
723 	free(tfd, M_EPOLL);
724 
725 	return (0);
726 }
727 
728 static int
729 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
730     int flags, struct thread *td)
731 {
732 	struct timerfd *tfd;
733 	timerfd_t count;
734 	int error;
735 
736 	tfd = fp->f_data;
737 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
738 		return (EINVAL);
739 
740 	if (uio->uio_resid < sizeof(timerfd_t))
741 		return (EINVAL);
742 
743 	error = 0;
744 	mtx_lock(&tfd->tfd_lock);
745 retry:
746 	if (tfd->tfd_canceled) {
747 		tfd->tfd_count = 0;
748 		mtx_unlock(&tfd->tfd_lock);
749 		return (ECANCELED);
750 	}
751 	if (tfd->tfd_count == 0) {
752 		if ((fp->f_flag & FNONBLOCK) != 0) {
753 			mtx_unlock(&tfd->tfd_lock);
754 			return (EAGAIN);
755 		}
756 		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
757 		if (error == 0)
758 			goto retry;
759 	}
760 	if (error == 0) {
761 		count = tfd->tfd_count;
762 		tfd->tfd_count = 0;
763 		mtx_unlock(&tfd->tfd_lock);
764 		error = uiomove(&count, sizeof(timerfd_t), uio);
765 	} else
766 		mtx_unlock(&tfd->tfd_lock);
767 
768 	return (error);
769 }
770 
771 static int
772 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
773     struct thread *td)
774 {
775 	struct timerfd *tfd;
776 	int revents = 0;
777 
778 	tfd = fp->f_data;
779 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
780 		return (POLLERR);
781 
782 	mtx_lock(&tfd->tfd_lock);
783 	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
784 		revents |= events & (POLLIN|POLLRDNORM);
785 	if (revents == 0)
786 		selrecord(td, &tfd->tfd_sel);
787 	mtx_unlock(&tfd->tfd_lock);
788 
789 	return (revents);
790 }
791 
792 static int
793 timerfd_kqfilter(struct file *fp, struct knote *kn)
794 {
795 	struct timerfd *tfd;
796 
797 	tfd = fp->f_data;
798 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
799 		return (EINVAL);
800 
801 	if (kn->kn_filter == EVFILT_READ)
802 		kn->kn_fop = &timerfd_rfiltops;
803 	else
804 		return (EINVAL);
805 
806 	kn->kn_hook = tfd;
807 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
808 
809 	return (0);
810 }
811 
812 static void
813 filt_timerfddetach(struct knote *kn)
814 {
815 	struct timerfd *tfd = kn->kn_hook;
816 
817 	mtx_lock(&tfd->tfd_lock);
818 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
819 	mtx_unlock(&tfd->tfd_lock);
820 }
821 
822 static int
823 filt_timerfdread(struct knote *kn, long hint)
824 {
825 	struct timerfd *tfd = kn->kn_hook;
826 
827 	return (tfd->tfd_count > 0);
828 }
829 
830 static int
831 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
832     struct ucred *active_cred, struct thread *td)
833 {
834 
835 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
836 		return (EINVAL);
837 
838 	switch (cmd) {
839 	case FIONBIO:
840 	case FIOASYNC:
841 		return (0);
842 	}
843 
844 	return (ENOTTY);
845 }
846 
847 static int
848 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
849 {
850 
851 	return (ENXIO);
852 }
853 
854 static int
855 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
856 {
857 
858 	kif->kf_type = KF_TYPE_UNKNOWN;
859 	return (0);
860 }
861 
862 static void
863 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
864 {
865 
866 	if (tfd->tfd_clockid == CLOCK_REALTIME)
867 		getnanotime(ts);
868 	else	/* CLOCK_MONOTONIC */
869 		getnanouptime(ts);
870 }
871 
872 static void
873 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
874 {
875 	struct timespec cts;
876 
877 	linux_timerfd_clocktime(tfd, &cts);
878 	*ots = tfd->tfd_time;
879 	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
880 		timespecsub(&ots->it_value, &cts, &ots->it_value);
881 		if (ots->it_value.tv_sec < 0 ||
882 		    (ots->it_value.tv_sec == 0 &&
883 		     ots->it_value.tv_nsec == 0)) {
884 			ots->it_value.tv_sec  = 0;
885 			ots->it_value.tv_nsec = 1;
886 		}
887 	}
888 }
889 
890 static int
891 linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec *ots)
892 {
893 	struct timerfd *tfd;
894 	struct file *fp;
895 	int error;
896 
897 	error = fget(td, fd, &cap_read_rights, &fp);
898 	if (error != 0)
899 		return (error);
900 	tfd = fp->f_data;
901 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
902 		error = EINVAL;
903 		goto out;
904 	}
905 
906 	mtx_lock(&tfd->tfd_lock);
907 	linux_timerfd_curval(tfd, ots);
908 	mtx_unlock(&tfd->tfd_lock);
909 
910 out:
911 	fdrop(fp, td);
912 	return (error);
913 }
914 
915 int
916 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
917 {
918 	struct l_itimerspec lots;
919 	struct itimerspec ots;
920 	int error;
921 
922 	error = linux_timerfd_gettime_common(td, args->fd, &ots);
923 	if (error != 0)
924 		return (error);
925 	error = native_to_linux_itimerspec(&lots, &ots);
926 	if (error == 0)
927 		error = copyout(&lots, args->old_value, sizeof(lots));
928 	return (error);
929 }
930 
931 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
932 int
933 linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
934 {
935 	struct l_itimerspec64 lots;
936 	struct itimerspec ots;
937 	int error;
938 
939 	error = linux_timerfd_gettime_common(td, args->fd, &ots);
940 	if (error != 0)
941 		return (error);
942 	error = native_to_linux_itimerspec64(&lots, &ots);
943 	if (error == 0)
944 		error = copyout(&lots, args->old_value, sizeof(lots));
945 	return (error);
946 }
947 #endif
948 
949 static int
950 linux_timerfd_settime_common(struct thread *td, int fd, int flags,
951     struct itimerspec *nts, struct itimerspec *oval)
952 {
953 	struct timespec cts, ts;
954 	struct timerfd *tfd;
955 	struct timeval tv;
956 	struct file *fp;
957 	int error;
958 
959 	if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
960 		return (EINVAL);
961 
962 	error = fget(td, fd, &cap_write_rights, &fp);
963 	if (error != 0)
964 		return (error);
965 	tfd = fp->f_data;
966 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
967 		error = EINVAL;
968 		goto out;
969 	}
970 
971 	mtx_lock(&tfd->tfd_lock);
972 	if (!timespecisset(&nts->it_value))
973 		timespecclear(&nts->it_interval);
974 	if (oval != NULL)
975 		linux_timerfd_curval(tfd, oval);
976 
977 	bcopy(nts, &tfd->tfd_time, sizeof(*nts));
978 	tfd->tfd_count = 0;
979 	if (timespecisset(&nts->it_value)) {
980 		linux_timerfd_clocktime(tfd, &cts);
981 		ts = nts->it_value;
982 		if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
983 			timespecadd(&tfd->tfd_time.it_value, &cts,
984 				&tfd->tfd_time.it_value);
985 		} else {
986 			timespecsub(&ts, &cts, &ts);
987 		}
988 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
989 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
990 			linux_timerfd_expire, tfd);
991 		tfd->tfd_canceled = false;
992 	} else {
993 		tfd->tfd_canceled = true;
994 		callout_stop(&tfd->tfd_callout);
995 	}
996 	mtx_unlock(&tfd->tfd_lock);
997 
998 out:
999 	fdrop(fp, td);
1000 	return (error);
1001 }
1002 
1003 int
1004 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
1005 {
1006 	struct l_itimerspec lots;
1007 	struct itimerspec nts, ots, *pots;
1008 	int error;
1009 
1010 	error = copyin(args->new_value, &lots, sizeof(lots));
1011 	if (error != 0)
1012 		return (error);
1013 	error = linux_to_native_itimerspec(&nts, &lots);
1014 	if (error != 0)
1015 		return (error);
1016 	pots = (args->old_value != NULL ? &ots : NULL);
1017 	error = linux_timerfd_settime_common(td, args->fd, args->flags,
1018 	    &nts, pots);
1019 	if (error == 0 && args->old_value != NULL) {
1020 		error = native_to_linux_itimerspec(&lots, &ots);
1021 		if (error == 0)
1022 			error = copyout(&lots, args->old_value, sizeof(lots));
1023 	}
1024 	return (error);
1025 }
1026 
1027 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1028 int
1029 linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
1030 {
1031 	struct l_itimerspec64 lots;
1032 	struct itimerspec nts, ots, *pots;
1033 	int error;
1034 
1035 	error = copyin(args->new_value, &lots, sizeof(lots));
1036 	if (error != 0)
1037 		return (error);
1038 	error = linux_to_native_itimerspec64(&nts, &lots);
1039 	if (error != 0)
1040 		return (error);
1041 	pots = (args->old_value != NULL ? &ots : NULL);
1042 	error = linux_timerfd_settime_common(td, args->fd, args->flags,
1043 	    &nts, pots);
1044 	if (error == 0 && args->old_value != NULL) {
1045 		error = native_to_linux_itimerspec64(&lots, &ots);
1046 		if (error == 0)
1047 			error = copyout(&lots, args->old_value, sizeof(lots));
1048 	}
1049 	return (error);
1050 }
1051 #endif
1052 
1053 static void
1054 linux_timerfd_expire(void *arg)
1055 {
1056 	struct timespec cts, ts;
1057 	struct timeval tv;
1058 	struct timerfd *tfd;
1059 
1060 	tfd = (struct timerfd *)arg;
1061 
1062 	linux_timerfd_clocktime(tfd, &cts);
1063 	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
1064 		if (timespecisset(&tfd->tfd_time.it_interval))
1065 			timespecadd(&tfd->tfd_time.it_value,
1066 				    &tfd->tfd_time.it_interval,
1067 				    &tfd->tfd_time.it_value);
1068 		else
1069 			/* single shot timer */
1070 			timespecclear(&tfd->tfd_time.it_value);
1071 		if (timespecisset(&tfd->tfd_time.it_value)) {
1072 			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1073 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
1074 			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1075 				linux_timerfd_expire, tfd);
1076 		}
1077 		tfd->tfd_count++;
1078 		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
1079 		selwakeup(&tfd->tfd_sel);
1080 		wakeup(&tfd->tfd_count);
1081 	} else if (timespecisset(&tfd->tfd_time.it_value)) {
1082 		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1083 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
1084 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1085 		    linux_timerfd_expire, tfd);
1086 	}
1087 }
1088