xref: /freebsd/sys/compat/linux/linux_event.c (revision 9768746b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007 Roman Divacky
5  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/imgact.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/mutex.h>
39 #include <sys/callout.h>
40 #include <sys/capsicum.h>
41 #include <sys/types.h>
42 #include <sys/user.h>
43 #include <sys/file.h>
44 #include <sys/filedesc.h>
45 #include <sys/filio.h>
46 #include <sys/errno.h>
47 #include <sys/event.h>
48 #include <sys/poll.h>
49 #include <sys/proc.h>
50 #include <sys/selinfo.h>
51 #include <sys/specialfd.h>
52 #include <sys/sx.h>
53 #include <sys/syscallsubr.h>
54 #include <sys/timespec.h>
55 #include <sys/eventfd.h>
56 
57 #ifdef COMPAT_LINUX32
58 #include <machine/../linux32/linux.h>
59 #include <machine/../linux32/linux32_proto.h>
60 #else
61 #include <machine/../linux/linux.h>
62 #include <machine/../linux/linux_proto.h>
63 #endif
64 
65 #include <compat/linux/linux_emul.h>
66 #include <compat/linux/linux_event.h>
67 #include <compat/linux/linux_file.h>
68 #include <compat/linux/linux_signal.h>
69 #include <compat/linux/linux_timer.h>
70 #include <compat/linux/linux_util.h>
71 
/* 64-bit user data value attached to an epoll registration. */
typedef uint64_t	epoll_udata_t;

/*
 * Event record as exchanged with the Linux process.  On amd64 the
 * structure is packed, so no padding separates the two members.
 */
struct epoll_event {
	uint32_t	events;		/* LINUX_EPOLL* bit mask */
	epoll_udata_t	data;		/* user data, kept in kevent ext[0] */
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/* Largest maxevents value accepted by the epoll wait entry points. */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
84 
/* Conversion between Linux epoll events and native kevents. */
static int	epoll_to_kevent(struct thread *td, int fd,
		    struct epoll_event *l_event, struct kevent *kevent,
		    int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent,
		    struct epoll_event *l_event);

/* Copy callbacks handed to kern_kevent_fp(). */
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);

/* Helpers operating on a single descriptor registered with an epoll. */
static int	epoll_register_kevent(struct thread *td, struct file *epfp,
		    int fd, int filter, unsigned int flags);
static int	epoll_fd_registered(struct thread *td, struct file *epfp,
		    int fd);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);
97 
/* State for epoll_kev_copyin(): cursor into the kernel-side change list. */
struct epoll_copyin_args {
	struct kevent	*changelist;	/* next kevent to hand out */
};

/* State for epoll_kev_copyout(): progress of the copy to userspace. */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* next free user slot */
	struct proc		*p;		/* process doing the wait */
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout error, or 0 */
};
108 
109 /* timerfd */
110 typedef uint64_t	timerfd_t;
111 
112 static fo_rdwr_t	timerfd_read;
113 static fo_ioctl_t	timerfd_ioctl;
114 static fo_poll_t	timerfd_poll;
115 static fo_kqfilter_t	timerfd_kqfilter;
116 static fo_stat_t	timerfd_stat;
117 static fo_close_t	timerfd_close;
118 static fo_fill_kinfo_t	timerfd_fill_kinfo;
119 
120 static struct fileops timerfdops = {
121 	.fo_read = timerfd_read,
122 	.fo_write = invfo_rdwr,
123 	.fo_truncate = invfo_truncate,
124 	.fo_ioctl = timerfd_ioctl,
125 	.fo_poll = timerfd_poll,
126 	.fo_kqfilter = timerfd_kqfilter,
127 	.fo_stat = timerfd_stat,
128 	.fo_close = timerfd_close,
129 	.fo_chmod = invfo_chmod,
130 	.fo_chown = invfo_chown,
131 	.fo_sendfile = invfo_sendfile,
132 	.fo_fill_kinfo = timerfd_fill_kinfo,
133 	.fo_flags = DFLAG_PASSABLE
134 };
135 
136 static void	filt_timerfddetach(struct knote *kn);
137 static int	filt_timerfdread(struct knote *kn, long hint);
138 
139 static struct filterops timerfd_rfiltops = {
140 	.f_isfd = 1,
141 	.f_detach = filt_timerfddetach,
142 	.f_event = filt_timerfdread
143 };
144 
145 struct timerfd {
146 	clockid_t	tfd_clockid;
147 	struct itimerspec tfd_time;
148 	struct callout	tfd_callout;
149 	timerfd_t	tfd_count;
150 	bool		tfd_canceled;
151 	struct selinfo	tfd_sel;
152 	struct mtx	tfd_lock;
153 };
154 
155 static void	linux_timerfd_expire(void *);
156 static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
157 
158 static int
159 epoll_create_common(struct thread *td, int flags)
160 {
161 
162 	return (kern_kqueue(td, flags, NULL));
163 }
164 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * Legacy epoll_create: rejects a non-positive size with EINVAL and
 * otherwise creates the descriptor without any flags.
 */
int
linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
{

	/*
	 * The size argument is only validated, never used; Linux
	 * likewise tests it and then forgets it.
	 */
	if (args->size > 0)
		return (epoll_create_common(td, 0));

	return (EINVAL);
}
#endif
180 
181 int
182 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
183 {
184 	int flags;
185 
186 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
187 		return (EINVAL);
188 
189 	flags = 0;
190 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
191 		flags |= O_CLOEXEC;
192 
193 	return (epoll_create_common(td, flags));
194 }
195 
196 /* Structure converting function from epoll to kevent. */
197 static int
198 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
199     struct kevent *kevent, int *nkevents)
200 {
201 	uint32_t levents = l_event->events;
202 	struct linux_pemuldata *pem;
203 	struct proc *p;
204 	unsigned short kev_flags = EV_ADD | EV_ENABLE;
205 
206 	/* flags related to how event is registered */
207 	if ((levents & LINUX_EPOLLONESHOT) != 0)
208 		kev_flags |= EV_DISPATCH;
209 	if ((levents & LINUX_EPOLLET) != 0)
210 		kev_flags |= EV_CLEAR;
211 	if ((levents & LINUX_EPOLLERR) != 0)
212 		kev_flags |= EV_ERROR;
213 	if ((levents & LINUX_EPOLLRDHUP) != 0)
214 		kev_flags |= EV_EOF;
215 
216 	/* flags related to what event is registered */
217 	if ((levents & LINUX_EPOLL_EVRD) != 0) {
218 		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
219 		kevent->ext[0] = l_event->data;
220 		++kevent;
221 		++(*nkevents);
222 	}
223 	if ((levents & LINUX_EPOLL_EVWR) != 0) {
224 		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
225 		kevent->ext[0] = l_event->data;
226 		++kevent;
227 		++(*nkevents);
228 	}
229 	/* zero event mask is legal */
230 	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
231 		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
232 		++(*nkevents);
233 	}
234 
235 	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
236 		p = td->td_proc;
237 
238 		pem = pem_find(p);
239 		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
240 
241 		LINUX_PEM_XLOCK(pem);
242 		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
243 			pem->flags |= LINUX_XUNSUP_EPOLL;
244 			LINUX_PEM_XUNLOCK(pem);
245 			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
246 			    levents);
247 		} else
248 			LINUX_PEM_XUNLOCK(pem);
249 		return (EINVAL);
250 	}
251 
252 	return (0);
253 }
254 
255 /*
256  * Structure converting function from kevent to epoll. In a case
257  * this is called on error in registration we store the error in
258  * event->data and pick it up later in linux_epoll_ctl().
259  */
260 static void
261 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
262 {
263 
264 	l_event->data = kevent->ext[0];
265 
266 	if ((kevent->flags & EV_ERROR) != 0) {
267 		l_event->events = LINUX_EPOLLERR;
268 		return;
269 	}
270 
271 	/* XXX EPOLLPRI, EPOLLHUP */
272 	switch (kevent->filter) {
273 	case EVFILT_READ:
274 		l_event->events = LINUX_EPOLLIN;
275 		if ((kevent->flags & EV_EOF) != 0)
276 			l_event->events |= LINUX_EPOLLRDHUP;
277 	break;
278 	case EVFILT_WRITE:
279 		l_event->events = LINUX_EPOLLOUT;
280 	break;
281 	}
282 }
283 
284 /*
285  * Copyout callback used by kevent. This converts kevent
286  * events to epoll events and copies them back to the
287  * userspace. This is also called on error on registering
288  * of the filter.
289  */
290 static int
291 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
292 {
293 	struct epoll_copyout_args *args;
294 	struct epoll_event *eep;
295 	int error, i;
296 
297 	args = (struct epoll_copyout_args*) arg;
298 	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
299 
300 	for (i = 0; i < count; i++)
301 		kevent_to_epoll(&kevp[i], &eep[i]);
302 
303 	error = copyout(eep, args->leventlist, count * sizeof(*eep));
304 	if (error == 0) {
305 		args->leventlist += count;
306 		args->count += count;
307 	} else if (args->error == 0)
308 		args->error = error;
309 
310 	free(eep, M_EPOLL);
311 	return (error);
312 }
313 
314 /*
315  * Copyin callback used by kevent. This copies already
316  * converted filters from kernel memory to the kevent
317  * internal kernel memory. Hence the memcpy instead of
318  * copyin.
319  */
320 static int
321 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
322 {
323 	struct epoll_copyin_args *args;
324 
325 	args = (struct epoll_copyin_args*) arg;
326 
327 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
328 	args->changelist += count;
329 
330 	return (0);
331 }
332 
333 /*
334  * Load epoll filter, convert it to kevent filter
335  * and load it into kevent subsystem.
336  */
337 int
338 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
339 {
340 	struct file *epfp, *fp;
341 	struct epoll_copyin_args ciargs;
342 	struct kevent kev[2];
343 	struct kevent_copyops k_ops = { &ciargs,
344 					NULL,
345 					epoll_kev_copyin};
346 	struct epoll_event le;
347 	cap_rights_t rights;
348 	int nchanges = 0;
349 	int error;
350 
351 	if (args->op != LINUX_EPOLL_CTL_DEL) {
352 		error = copyin(args->event, &le, sizeof(le));
353 		if (error != 0)
354 			return (error);
355 	}
356 
357 	error = fget(td, args->epfd,
358 	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
359 	if (error != 0)
360 		return (error);
361 	if (epfp->f_type != DTYPE_KQUEUE) {
362 		error = EINVAL;
363 		goto leave1;
364 	}
365 
366 	 /* Protect user data vector from incorrectly supplied fd. */
367 	error = fget(td, args->fd,
368 		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
369 	if (error != 0)
370 		goto leave1;
371 
372 	/* Linux disallows spying on himself */
373 	if (epfp == fp) {
374 		error = EINVAL;
375 		goto leave0;
376 	}
377 
378 	ciargs.changelist = kev;
379 
380 	if (args->op != LINUX_EPOLL_CTL_DEL) {
381 		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
382 		if (error != 0)
383 			goto leave0;
384 	}
385 
386 	switch (args->op) {
387 	case LINUX_EPOLL_CTL_MOD:
388 		error = epoll_delete_all_events(td, epfp, args->fd);
389 		if (error != 0)
390 			goto leave0;
391 		break;
392 
393 	case LINUX_EPOLL_CTL_ADD:
394 		if (epoll_fd_registered(td, epfp, args->fd)) {
395 			error = EEXIST;
396 			goto leave0;
397 		}
398 		break;
399 
400 	case LINUX_EPOLL_CTL_DEL:
401 		/* CTL_DEL means unregister this fd with this epoll */
402 		error = epoll_delete_all_events(td, epfp, args->fd);
403 		goto leave0;
404 
405 	default:
406 		error = EINVAL;
407 		goto leave0;
408 	}
409 
410 	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
411 
412 leave0:
413 	fdrop(fp, td);
414 
415 leave1:
416 	fdrop(epfp, td);
417 	return (error);
418 }
419 
420 /*
421  * Wait for a filter to be triggered on the epoll file descriptor.
422  */
423 
424 static int
425 linux_epoll_wait_ts(struct thread *td, int epfd, struct epoll_event *events,
426     int maxevents, struct timespec *tsp, sigset_t *uset)
427 {
428 	struct epoll_copyout_args coargs;
429 	struct kevent_copyops k_ops = { &coargs,
430 					epoll_kev_copyout,
431 					NULL};
432 	cap_rights_t rights;
433 	struct file *epfp;
434 	sigset_t omask;
435 	int error;
436 
437 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
438 		return (EINVAL);
439 
440 	error = fget(td, epfd,
441 	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
442 	if (error != 0)
443 		return (error);
444 	if (epfp->f_type != DTYPE_KQUEUE) {
445 		error = EINVAL;
446 		goto leave;
447 	}
448 	if (uset != NULL) {
449 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
450 		    &omask, 0);
451 		if (error != 0)
452 			goto leave;
453 		td->td_pflags |= TDP_OLDMASK;
454 		/*
455 		 * Make sure that ast() is called on return to
456 		 * usermode and TDP_OLDMASK is cleared, restoring old
457 		 * sigmask.
458 		 */
459 		ast_sched(td, TDA_SIGSUSPEND);
460 	}
461 
462 	coargs.leventlist = events;
463 	coargs.p = td->td_proc;
464 	coargs.count = 0;
465 	coargs.error = 0;
466 
467 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
468 	if (error == 0 && coargs.error != 0)
469 		error = coargs.error;
470 
471 	/*
472 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
473 	 * Maybe we should translate that but I don't think it matters at all.
474 	 */
475 	if (error == 0)
476 		td->td_retval[0] = coargs.count;
477 
478 	if (uset != NULL)
479 		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
480 		    NULL, 0);
481 leave:
482 	fdrop(epfp, td);
483 	return (error);
484 }
485 
486 static int
487 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
488     int maxevents, int timeout, sigset_t *uset)
489 {
490 	struct timespec ts, *tsp;
491 
492 	/*
493 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
494 	 * to block indefinitely. Real implementation does it if any negative
495 	 * timeout value is passed.
496 	 */
497 	if (timeout >= 0) {
498 		/* Convert from milliseconds to timespec. */
499 		ts.tv_sec = timeout / 1000;
500 		ts.tv_nsec = (timeout % 1000) * 1000000;
501 		tsp = &ts;
502 	} else {
503 		tsp = NULL;
504 	}
505 	return (linux_epoll_wait_ts(td, epfd, events, maxevents, tsp, uset));
506 
507 }
508 
#ifdef LINUX_LEGACY_SYSCALLS
/* epoll_wait: wait with a millisecond timeout and no signal mask change. */
int
linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
{
	int error;

	error = linux_epoll_wait_common(td, args->epfd, args->events,
	    args->maxevents, args->timeout, NULL);
	return (error);
}
#endif
518 
519 int
520 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
521 {
522 	sigset_t mask, *pmask;
523 	int error;
524 
525 	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
526 	    &mask, &pmask);
527 	if (error != 0)
528 		return (error);
529 
530 	return (linux_epoll_wait_common(td, args->epfd, args->events,
531 	    args->maxevents, args->timeout, pmask));
532 }
533 
534 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
535 int
536 linux_epoll_pwait2_64(struct thread *td, struct linux_epoll_pwait2_64_args *args)
537 {
538 	struct timespec ts, *tsa;
539 	sigset_t mask, *pmask;
540 	int error;
541 
542 	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
543 	    &mask, &pmask);
544 	if (error != 0)
545 		return (error);
546 
547 	if (args->timeout) {
548 		error = linux_get_timespec64(&ts, args->timeout);
549 		if (error != 0)
550 			return (error);
551 		tsa = &ts;
552 	} else
553 		tsa = NULL;
554 
555 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
556 	    args->maxevents, tsa, pmask));
557 }
558 #else
559 int
560 linux_epoll_pwait2(struct thread *td, struct linux_epoll_pwait2_args *args)
561 {
562 	struct timespec ts, *tsa;
563 	sigset_t mask, *pmask;
564 	int error;
565 
566 	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
567 	    &mask, &pmask);
568 	if (error != 0)
569 		return (error);
570 
571 	if (args->timeout) {
572 		error = linux_get_timespec(&ts, args->timeout);
573 		if (error != 0)
574 			return (error);
575 		tsa = &ts;
576 	} else
577 		tsa = NULL;
578 
579 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
580 	    args->maxevents, tsa, pmask));
581 }
582 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
583 
584 static int
585 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
586     unsigned int flags)
587 {
588 	struct epoll_copyin_args ciargs;
589 	struct kevent kev;
590 	struct kevent_copyops k_ops = { &ciargs,
591 					NULL,
592 					epoll_kev_copyin};
593 
594 	ciargs.changelist = &kev;
595 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
596 
597 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
598 }
599 
600 static int
601 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
602 {
603 	/*
604 	 * Set empty filter flags to avoid accidental modification of already
605 	 * registered events. In the case of event re-registration:
606 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
607 	 * 2. If event does exists, it's enabled/disabled state is preserved
608 	 *    but fflags, data and udata fields are overwritten. So we can not
609 	 *    set socket lowats and store user's context pointer in udata.
610 	 */
611 	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
612 	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
613 		return (1);
614 
615 	return (0);
616 }
617 
618 static int
619 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
620 {
621 	int error1, error2;
622 
623 	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
624 	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
625 
626 	/* return 0 if at least one result positive */
627 	return (error1 == 0 ? 0 : error2);
628 }
629 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * Legacy eventfd: create an eventfd through kern_specialfd() with the
 * given initial value and no flags.
 */
int
linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
{
	struct specialfd_eventfd ae;
	int error;

	bzero(&ae, sizeof(ae));
	ae.initval = args->initval;

	error = kern_specialfd(td, SPECIALFD_EVENTFD, &ae);
	return (error);
}
#endif
641 
642 int
643 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
644 {
645 	struct specialfd_eventfd ae;
646 	int flags;
647 
648 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
649 	    LINUX_EFD_SEMAPHORE)) != 0)
650 		return (EINVAL);
651 	flags = 0;
652 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
653 		flags |= EFD_CLOEXEC;
654 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
655 		flags |= EFD_NONBLOCK;
656 	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
657 		flags |= EFD_SEMAPHORE;
658 
659 	bzero(&ae, sizeof(ae));
660 	ae.flags = flags;
661 	ae.initval = args->initval;
662 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
663 }
664 
665 int
666 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
667 {
668 	struct timerfd *tfd;
669 	struct file *fp;
670 	clockid_t clockid;
671 	int fflags, fd, error;
672 
673 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
674 		return (EINVAL);
675 
676 	error = linux_to_native_clockid(&clockid, args->clockid);
677 	if (error != 0)
678 		return (error);
679 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
680 		return (EINVAL);
681 
682 	fflags = 0;
683 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
684 		fflags |= O_CLOEXEC;
685 
686 	error = falloc(td, &fp, &fd, fflags);
687 	if (error != 0)
688 		return (error);
689 
690 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
691 	tfd->tfd_clockid = clockid;
692 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
693 
694 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
695 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
696 
697 	fflags = FREAD;
698 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
699 		fflags |= FNONBLOCK;
700 
701 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
702 	fdrop(fp, td);
703 
704 	td->td_retval[0] = fd;
705 	return (error);
706 }
707 
708 static int
709 timerfd_close(struct file *fp, struct thread *td)
710 {
711 	struct timerfd *tfd;
712 
713 	tfd = fp->f_data;
714 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
715 		return (EINVAL);
716 
717 	timespecclear(&tfd->tfd_time.it_value);
718 	timespecclear(&tfd->tfd_time.it_interval);
719 
720 	callout_drain(&tfd->tfd_callout);
721 
722 	seldrain(&tfd->tfd_sel);
723 	knlist_destroy(&tfd->tfd_sel.si_note);
724 
725 	fp->f_ops = &badfileops;
726 	mtx_destroy(&tfd->tfd_lock);
727 	free(tfd, M_EPOLL);
728 
729 	return (0);
730 }
731 
732 static int
733 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
734     int flags, struct thread *td)
735 {
736 	struct timerfd *tfd;
737 	timerfd_t count;
738 	int error;
739 
740 	tfd = fp->f_data;
741 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
742 		return (EINVAL);
743 
744 	if (uio->uio_resid < sizeof(timerfd_t))
745 		return (EINVAL);
746 
747 	error = 0;
748 	mtx_lock(&tfd->tfd_lock);
749 retry:
750 	if (tfd->tfd_canceled) {
751 		tfd->tfd_count = 0;
752 		mtx_unlock(&tfd->tfd_lock);
753 		return (ECANCELED);
754 	}
755 	if (tfd->tfd_count == 0) {
756 		if ((fp->f_flag & FNONBLOCK) != 0) {
757 			mtx_unlock(&tfd->tfd_lock);
758 			return (EAGAIN);
759 		}
760 		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
761 		if (error == 0)
762 			goto retry;
763 	}
764 	if (error == 0) {
765 		count = tfd->tfd_count;
766 		tfd->tfd_count = 0;
767 		mtx_unlock(&tfd->tfd_lock);
768 		error = uiomove(&count, sizeof(timerfd_t), uio);
769 	} else
770 		mtx_unlock(&tfd->tfd_lock);
771 
772 	return (error);
773 }
774 
775 static int
776 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
777     struct thread *td)
778 {
779 	struct timerfd *tfd;
780 	int revents = 0;
781 
782 	tfd = fp->f_data;
783 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
784 		return (POLLERR);
785 
786 	mtx_lock(&tfd->tfd_lock);
787 	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
788 		revents |= events & (POLLIN|POLLRDNORM);
789 	if (revents == 0)
790 		selrecord(td, &tfd->tfd_sel);
791 	mtx_unlock(&tfd->tfd_lock);
792 
793 	return (revents);
794 }
795 
796 static int
797 timerfd_kqfilter(struct file *fp, struct knote *kn)
798 {
799 	struct timerfd *tfd;
800 
801 	tfd = fp->f_data;
802 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
803 		return (EINVAL);
804 
805 	if (kn->kn_filter == EVFILT_READ)
806 		kn->kn_fop = &timerfd_rfiltops;
807 	else
808 		return (EINVAL);
809 
810 	kn->kn_hook = tfd;
811 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
812 
813 	return (0);
814 }
815 
816 static void
817 filt_timerfddetach(struct knote *kn)
818 {
819 	struct timerfd *tfd = kn->kn_hook;
820 
821 	mtx_lock(&tfd->tfd_lock);
822 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
823 	mtx_unlock(&tfd->tfd_lock);
824 }
825 
826 static int
827 filt_timerfdread(struct knote *kn, long hint)
828 {
829 	struct timerfd *tfd = kn->kn_hook;
830 
831 	return (tfd->tfd_count > 0);
832 }
833 
834 static int
835 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
836     struct ucred *active_cred, struct thread *td)
837 {
838 
839 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
840 		return (EINVAL);
841 
842 	switch (cmd) {
843 	case FIONBIO:
844 	case FIOASYNC:
845 		return (0);
846 	}
847 
848 	return (ENOTTY);
849 }
850 
851 static int
852 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
853 {
854 
855 	return (ENXIO);
856 }
857 
858 static int
859 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
860 {
861 
862 	kif->kf_type = KF_TYPE_UNKNOWN;
863 	return (0);
864 }
865 
866 static void
867 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
868 {
869 
870 	if (tfd->tfd_clockid == CLOCK_REALTIME)
871 		getnanotime(ts);
872 	else	/* CLOCK_MONOTONIC */
873 		getnanouptime(ts);
874 }
875 
876 static void
877 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
878 {
879 	struct timespec cts;
880 
881 	linux_timerfd_clocktime(tfd, &cts);
882 	*ots = tfd->tfd_time;
883 	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
884 		timespecsub(&ots->it_value, &cts, &ots->it_value);
885 		if (ots->it_value.tv_sec < 0 ||
886 		    (ots->it_value.tv_sec == 0 &&
887 		     ots->it_value.tv_nsec == 0)) {
888 			ots->it_value.tv_sec  = 0;
889 			ots->it_value.tv_nsec = 1;
890 		}
891 	}
892 }
893 
894 static int
895 linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec *ots)
896 {
897 	struct timerfd *tfd;
898 	struct file *fp;
899 	int error;
900 
901 	error = fget(td, fd, &cap_read_rights, &fp);
902 	if (error != 0)
903 		return (error);
904 	tfd = fp->f_data;
905 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
906 		error = EINVAL;
907 		goto out;
908 	}
909 
910 	mtx_lock(&tfd->tfd_lock);
911 	linux_timerfd_curval(tfd, ots);
912 	mtx_unlock(&tfd->tfd_lock);
913 
914 out:
915 	fdrop(fp, td);
916 	return (error);
917 }
918 
919 int
920 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
921 {
922 	struct l_itimerspec lots;
923 	struct itimerspec ots;
924 	int error;
925 
926 	error = linux_timerfd_gettime_common(td, args->fd, &ots);
927 	if (error != 0)
928 		return (error);
929 	error = native_to_linux_itimerspec(&lots, &ots);
930 	if (error == 0)
931 		error = copyout(&lots, args->old_value, sizeof(lots));
932 	return (error);
933 }
934 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * timerfd_gettime64: same as linux_timerfd_gettime() but the result is
 * converted to the l_itimerspec64 layout before being copied out.
 */
int
linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
{
	struct l_itimerspec64 lots;
	struct itimerspec ots;
	int error;

	error = linux_timerfd_gettime_common(td, args->fd, &ots);
	if (error != 0)
		return (error);

	error = native_to_linux_itimerspec64(&lots, &ots);
	if (error != 0)
		return (error);

	return (copyout(&lots, args->old_value, sizeof(lots)));
}
#endif
952 
953 static int
954 linux_timerfd_settime_common(struct thread *td, int fd, int flags,
955     struct itimerspec *nts, struct itimerspec *oval)
956 {
957 	struct timespec cts, ts;
958 	struct timerfd *tfd;
959 	struct timeval tv;
960 	struct file *fp;
961 	int error;
962 
963 	if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
964 		return (EINVAL);
965 
966 	error = fget(td, fd, &cap_write_rights, &fp);
967 	if (error != 0)
968 		return (error);
969 	tfd = fp->f_data;
970 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
971 		error = EINVAL;
972 		goto out;
973 	}
974 
975 	mtx_lock(&tfd->tfd_lock);
976 	if (!timespecisset(&nts->it_value))
977 		timespecclear(&nts->it_interval);
978 	if (oval != NULL)
979 		linux_timerfd_curval(tfd, oval);
980 
981 	bcopy(nts, &tfd->tfd_time, sizeof(*nts));
982 	tfd->tfd_count = 0;
983 	if (timespecisset(&nts->it_value)) {
984 		linux_timerfd_clocktime(tfd, &cts);
985 		ts = nts->it_value;
986 		if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
987 			timespecadd(&tfd->tfd_time.it_value, &cts,
988 				&tfd->tfd_time.it_value);
989 		} else {
990 			timespecsub(&ts, &cts, &ts);
991 		}
992 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
993 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
994 			linux_timerfd_expire, tfd);
995 		tfd->tfd_canceled = false;
996 	} else {
997 		tfd->tfd_canceled = true;
998 		callout_stop(&tfd->tfd_callout);
999 	}
1000 	mtx_unlock(&tfd->tfd_lock);
1001 
1002 out:
1003 	fdrop(fp, td);
1004 	return (error);
1005 }
1006 
1007 int
1008 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
1009 {
1010 	struct l_itimerspec lots;
1011 	struct itimerspec nts, ots, *pots;
1012 	int error;
1013 
1014 	error = copyin(args->new_value, &lots, sizeof(lots));
1015 	if (error != 0)
1016 		return (error);
1017 	error = linux_to_native_itimerspec(&nts, &lots);
1018 	if (error != 0)
1019 		return (error);
1020 	pots = (args->old_value != NULL ? &ots : NULL);
1021 	error = linux_timerfd_settime_common(td, args->fd, args->flags,
1022 	    &nts, pots);
1023 	if (error == 0 && args->old_value != NULL) {
1024 		error = native_to_linux_itimerspec(&lots, &ots);
1025 		if (error == 0)
1026 			error = copyout(&lots, args->old_value, sizeof(lots));
1027 	}
1028 	return (error);
1029 }
1030 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * timerfd_settime64: same as linux_timerfd_settime() but both settings
 * use the l_itimerspec64 layout.
 */
int
linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
{
	struct l_itimerspec64 lots;
	struct itimerspec nts, ots, *pots;
	int error;

	error = copyin(args->new_value, &lots, sizeof(lots));
	if (error != 0)
		return (error);
	error = linux_to_native_itimerspec64(&nts, &lots);
	if (error != 0)
		return (error);

	/* Only ask for the old setting when the caller wants it. */
	pots = NULL;
	if (args->old_value != NULL)
		pots = &ots;

	error = linux_timerfd_settime_common(td, args->fd, args->flags,
	    &nts, pots);
	if (error != 0 || pots == NULL)
		return (error);

	error = native_to_linux_itimerspec64(&lots, &ots);
	if (error != 0)
		return (error);

	return (copyout(&lots, args->old_value, sizeof(lots)));
}
#endif
1056 
1057 static void
1058 linux_timerfd_expire(void *arg)
1059 {
1060 	struct timespec cts, ts;
1061 	struct timeval tv;
1062 	struct timerfd *tfd;
1063 
1064 	tfd = (struct timerfd *)arg;
1065 
1066 	linux_timerfd_clocktime(tfd, &cts);
1067 	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
1068 		if (timespecisset(&tfd->tfd_time.it_interval))
1069 			timespecadd(&tfd->tfd_time.it_value,
1070 				    &tfd->tfd_time.it_interval,
1071 				    &tfd->tfd_time.it_value);
1072 		else
1073 			/* single shot timer */
1074 			timespecclear(&tfd->tfd_time.it_value);
1075 		if (timespecisset(&tfd->tfd_time.it_value)) {
1076 			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1077 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
1078 			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1079 				linux_timerfd_expire, tfd);
1080 		}
1081 		tfd->tfd_count++;
1082 		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
1083 		selwakeup(&tfd->tfd_sel);
1084 		wakeup(&tfd->tfd_count);
1085 	} else if (timespecisset(&tfd->tfd_time.it_value)) {
1086 		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1087 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
1088 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1089 		    linux_timerfd_expire, tfd);
1090 	}
1091 }
1092