/*
 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
 * Copyright 2007-2012 Niels Provos, Nick Mathewson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
272b15cb3dSCy Schubert #include "event2/event-config.h"
282b15cb3dSCy Schubert #include "evconfig-private.h"
292b15cb3dSCy Schubert 
302b15cb3dSCy Schubert #ifdef EVENT__HAVE_EPOLL
312b15cb3dSCy Schubert 
322b15cb3dSCy Schubert #include <stdint.h>
332b15cb3dSCy Schubert #include <sys/types.h>
342b15cb3dSCy Schubert #include <sys/resource.h>
352b15cb3dSCy Schubert #ifdef EVENT__HAVE_SYS_TIME_H
362b15cb3dSCy Schubert #include <sys/time.h>
372b15cb3dSCy Schubert #endif
382b15cb3dSCy Schubert #include <sys/queue.h>
392b15cb3dSCy Schubert #include <sys/epoll.h>
402b15cb3dSCy Schubert #include <signal.h>
412b15cb3dSCy Schubert #include <limits.h>
422b15cb3dSCy Schubert #include <stdio.h>
432b15cb3dSCy Schubert #include <stdlib.h>
442b15cb3dSCy Schubert #include <string.h>
452b15cb3dSCy Schubert #include <unistd.h>
462b15cb3dSCy Schubert #include <errno.h>
472b15cb3dSCy Schubert #ifdef EVENT__HAVE_FCNTL_H
482b15cb3dSCy Schubert #include <fcntl.h>
492b15cb3dSCy Schubert #endif
502b15cb3dSCy Schubert #ifdef EVENT__HAVE_SYS_TIMERFD_H
512b15cb3dSCy Schubert #include <sys/timerfd.h>
522b15cb3dSCy Schubert #endif
532b15cb3dSCy Schubert 
542b15cb3dSCy Schubert #include "event-internal.h"
552b15cb3dSCy Schubert #include "evsignal-internal.h"
562b15cb3dSCy Schubert #include "event2/thread.h"
572b15cb3dSCy Schubert #include "evthread-internal.h"
582b15cb3dSCy Schubert #include "log-internal.h"
592b15cb3dSCy Schubert #include "evmap-internal.h"
602b15cb3dSCy Schubert #include "changelist-internal.h"
612b15cb3dSCy Schubert #include "time-internal.h"
622b15cb3dSCy Schubert 
632b15cb3dSCy Schubert /* Since Linux 2.6.17, epoll is able to report about peer half-closed connection
642b15cb3dSCy Schubert    using special EPOLLRDHUP flag on a read event.
652b15cb3dSCy Schubert */
662b15cb3dSCy Schubert #if !defined(EPOLLRDHUP)
672b15cb3dSCy Schubert #define EPOLLRDHUP 0
682b15cb3dSCy Schubert #define EARLY_CLOSE_IF_HAVE_RDHUP 0
692b15cb3dSCy Schubert #else
702b15cb3dSCy Schubert #define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE
712b15cb3dSCy Schubert #endif
722b15cb3dSCy Schubert 
732b15cb3dSCy Schubert #include "epolltable-internal.h"
742b15cb3dSCy Schubert 
#if defined(EVENT__HAVE_SYS_TIMERFD_H) &&			  \
	defined(EVENT__HAVE_TIMERFD_CREATE) &&			  \
	defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \
	defined(TFD_CLOEXEC)
/* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available
   and working.  This means that we can't support it on 2.6.25 (where timerfd
   was introduced) or 2.6.26, since 2.6.27 introduced those flags.

   USING_TIMERFD gates every timerfd-related field and code path below.
 */
#define USING_TIMERFD
#endif

/* Per-event_base state of the epoll backend. */
struct epollop {
	/* Buffer handed to epoll_wait() to receive ready events. */
	struct epoll_event *events;
	/* Number of entries that 'events' can hold. */
	int nevents;
	/* The epoll instance's file descriptor. */
	int epfd;
#ifdef USING_TIMERFD
	/* timerfd registered with epfd for precise timeouts, or -1 when
	 * timerfd is not in use. */
	int timerfd;
#endif
};

952b15cb3dSCy Schubert static void *epoll_init(struct event_base *);
962b15cb3dSCy Schubert static int epoll_dispatch(struct event_base *, struct timeval *);
972b15cb3dSCy Schubert static void epoll_dealloc(struct event_base *);
982b15cb3dSCy Schubert 
992b15cb3dSCy Schubert static const struct eventop epollops_changelist = {
1002b15cb3dSCy Schubert 	"epoll (with changelist)",
1012b15cb3dSCy Schubert 	epoll_init,
1022b15cb3dSCy Schubert 	event_changelist_add_,
1032b15cb3dSCy Schubert 	event_changelist_del_,
1042b15cb3dSCy Schubert 	epoll_dispatch,
1052b15cb3dSCy Schubert 	epoll_dealloc,
1062b15cb3dSCy Schubert 	1, /* need reinit */
1072b15cb3dSCy Schubert 	EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP,
1082b15cb3dSCy Schubert 	EVENT_CHANGELIST_FDINFO_SIZE
1092b15cb3dSCy Schubert };
1102b15cb3dSCy Schubert 
1112b15cb3dSCy Schubert 
1122b15cb3dSCy Schubert static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
1132b15cb3dSCy Schubert     short old, short events, void *p);
1142b15cb3dSCy Schubert static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
1152b15cb3dSCy Schubert     short old, short events, void *p);
1162b15cb3dSCy Schubert 
1172b15cb3dSCy Schubert const struct eventop epollops = {
1182b15cb3dSCy Schubert 	"epoll",
1192b15cb3dSCy Schubert 	epoll_init,
1202b15cb3dSCy Schubert 	epoll_nochangelist_add,
1212b15cb3dSCy Schubert 	epoll_nochangelist_del,
1222b15cb3dSCy Schubert 	epoll_dispatch,
1232b15cb3dSCy Schubert 	epoll_dealloc,
1242b15cb3dSCy Schubert 	1, /* need reinit */
1252b15cb3dSCy Schubert 	EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE,
1262b15cb3dSCy Schubert 	0
1272b15cb3dSCy Schubert };
1282b15cb3dSCy Schubert 
/* Initial capacity of the epoll_wait() result buffer, and the capacity at
 * which epoll_dispatch() stops doubling it. */
#define INITIAL_NEVENT 32
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds, to 35 minutes.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)

1402b15cb3dSCy Schubert static void *
epoll_init(struct event_base * base)1412b15cb3dSCy Schubert epoll_init(struct event_base *base)
1422b15cb3dSCy Schubert {
1432b15cb3dSCy Schubert 	int epfd = -1;
1442b15cb3dSCy Schubert 	struct epollop *epollop;
1452b15cb3dSCy Schubert 
1462b15cb3dSCy Schubert #ifdef EVENT__HAVE_EPOLL_CREATE1
1472b15cb3dSCy Schubert 	/* First, try the shiny new epoll_create1 interface, if we have it. */
1482b15cb3dSCy Schubert 	epfd = epoll_create1(EPOLL_CLOEXEC);
1492b15cb3dSCy Schubert #endif
1502b15cb3dSCy Schubert 	if (epfd == -1) {
1512b15cb3dSCy Schubert 		/* Initialize the kernel queue using the old interface.  (The
1522b15cb3dSCy Schubert 		size field is ignored   since 2.6.8.) */
1532b15cb3dSCy Schubert 		if ((epfd = epoll_create(32000)) == -1) {
1542b15cb3dSCy Schubert 			if (errno != ENOSYS)
1552b15cb3dSCy Schubert 				event_warn("epoll_create");
1562b15cb3dSCy Schubert 			return (NULL);
1572b15cb3dSCy Schubert 		}
1582b15cb3dSCy Schubert 		evutil_make_socket_closeonexec(epfd);
1592b15cb3dSCy Schubert 	}
1602b15cb3dSCy Schubert 
1612b15cb3dSCy Schubert 	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
1622b15cb3dSCy Schubert 		close(epfd);
1632b15cb3dSCy Schubert 		return (NULL);
1642b15cb3dSCy Schubert 	}
1652b15cb3dSCy Schubert 
1662b15cb3dSCy Schubert 	epollop->epfd = epfd;
1672b15cb3dSCy Schubert 
1682b15cb3dSCy Schubert 	/* Initialize fields */
1692b15cb3dSCy Schubert 	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
1702b15cb3dSCy Schubert 	if (epollop->events == NULL) {
1712b15cb3dSCy Schubert 		mm_free(epollop);
1722b15cb3dSCy Schubert 		close(epfd);
1732b15cb3dSCy Schubert 		return (NULL);
1742b15cb3dSCy Schubert 	}
1752b15cb3dSCy Schubert 	epollop->nevents = INITIAL_NEVENT;
1762b15cb3dSCy Schubert 
1772b15cb3dSCy Schubert 	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
1782b15cb3dSCy Schubert 	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
1792b15cb3dSCy Schubert 		evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {
1802b15cb3dSCy Schubert 
1812b15cb3dSCy Schubert 		base->evsel = &epollops_changelist;
1822b15cb3dSCy Schubert 	}
1832b15cb3dSCy Schubert 
1842b15cb3dSCy Schubert #ifdef USING_TIMERFD
1852b15cb3dSCy Schubert 	/*
1862b15cb3dSCy Schubert 	  The epoll interface ordinarily gives us one-millisecond precision,
1872b15cb3dSCy Schubert 	  so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
1882b15cb3dSCy Schubert 	  timer.  But when the user has set the new PRECISE_TIMER flag for an
1892b15cb3dSCy Schubert 	  event_base, we can try to use timerfd to give them finer granularity.
1902b15cb3dSCy Schubert 	*/
1912b15cb3dSCy Schubert 	if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
1922b15cb3dSCy Schubert 	    base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
1932b15cb3dSCy Schubert 		int fd;
1942b15cb3dSCy Schubert 		fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
1952b15cb3dSCy Schubert 		if (epollop->timerfd >= 0) {
1962b15cb3dSCy Schubert 			struct epoll_event epev;
1972b15cb3dSCy Schubert 			memset(&epev, 0, sizeof(epev));
1982b15cb3dSCy Schubert 			epev.data.fd = epollop->timerfd;
1992b15cb3dSCy Schubert 			epev.events = EPOLLIN;
2002b15cb3dSCy Schubert 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
2012b15cb3dSCy Schubert 				event_warn("epoll_ctl(timerfd)");
2022b15cb3dSCy Schubert 				close(fd);
2032b15cb3dSCy Schubert 				epollop->timerfd = -1;
2042b15cb3dSCy Schubert 			}
2052b15cb3dSCy Schubert 		} else {
2062b15cb3dSCy Schubert 			if (errno != EINVAL && errno != ENOSYS) {
2072b15cb3dSCy Schubert 				/* These errors probably mean that we were
2082b15cb3dSCy Schubert 				 * compiled with timerfd/TFD_* support, but
2092b15cb3dSCy Schubert 				 * we're running on a kernel that lacks those.
2102b15cb3dSCy Schubert 				 */
2112b15cb3dSCy Schubert 				event_warn("timerfd_create");
2122b15cb3dSCy Schubert 			}
2132b15cb3dSCy Schubert 			epollop->timerfd = -1;
2142b15cb3dSCy Schubert 		}
2152b15cb3dSCy Schubert 	} else {
2162b15cb3dSCy Schubert 		epollop->timerfd = -1;
2172b15cb3dSCy Schubert 	}
2182b15cb3dSCy Schubert #endif
2192b15cb3dSCy Schubert 
2202b15cb3dSCy Schubert 	evsig_init_(base);
2212b15cb3dSCy Schubert 
2222b15cb3dSCy Schubert 	return (epollop);
2232b15cb3dSCy Schubert }
2242b15cb3dSCy Schubert 
2252b15cb3dSCy Schubert static const char *
change_to_string(int change)2262b15cb3dSCy Schubert change_to_string(int change)
2272b15cb3dSCy Schubert {
2282b15cb3dSCy Schubert 	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
2292b15cb3dSCy Schubert 	if (change == EV_CHANGE_ADD) {
2302b15cb3dSCy Schubert 		return "add";
2312b15cb3dSCy Schubert 	} else if (change == EV_CHANGE_DEL) {
2322b15cb3dSCy Schubert 		return "del";
2332b15cb3dSCy Schubert 	} else if (change == 0) {
2342b15cb3dSCy Schubert 		return "none";
2352b15cb3dSCy Schubert 	} else {
2362b15cb3dSCy Schubert 		return "???";
2372b15cb3dSCy Schubert 	}
2382b15cb3dSCy Schubert }
2392b15cb3dSCy Schubert 
/*
 * Return a static, human-readable name for an epoll_ctl() operation code,
 * or "???" if 'op' is not one of EPOLL_CTL_ADD/DEL/MOD.
 */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}

/*
 * Expand to a printf-style format string followed by its arguments,
 * describing one epoll_ctl() attempt and the event_change that caused it.
 *
 *   op      epoll_ctl() operation code
 *   events  epoll event mask that was passed to the kernel
 *   ch      pointer to the struct event_change being applied
 *   status  string literal spliced into the format ("okay", "failed")
 *
 * The expansion is a comma-separated argument list, so it is only usable
 * directly inside the argument list of a printf-like logging call.
 * 'events' is cast to int to match its %d conversion, as the other log
 * statements in this file do.
 */
#define PRINT_CHANGES(op, events, ch, status)  \
	"Epoll %s(%d) on fd %d " status ". "       \
	"Old events were %d; "                     \
	"read change was %d (%s); "                \
	"write change was %d (%s); "               \
	"close change was %d (%s)",                \
	epoll_op_to_string(op),                    \
	(int)(events),                             \
	(ch)->fd,                                  \
	(ch)->old_events,                          \
	(ch)->read_change,                         \
	change_to_string((ch)->read_change),       \
	(ch)->write_change,                        \
	change_to_string((ch)->write_change),      \
	(ch)->close_change,                        \
	change_to_string((ch)->close_change)

2662b15cb3dSCy Schubert static int
epoll_apply_one_change(struct event_base * base,struct epollop * epollop,const struct event_change * ch)2672b15cb3dSCy Schubert epoll_apply_one_change(struct event_base *base,
2682b15cb3dSCy Schubert     struct epollop *epollop,
2692b15cb3dSCy Schubert     const struct event_change *ch)
2702b15cb3dSCy Schubert {
2712b15cb3dSCy Schubert 	struct epoll_event epev;
2722b15cb3dSCy Schubert 	int op, events = 0;
2732b15cb3dSCy Schubert 	int idx;
2742b15cb3dSCy Schubert 
2752b15cb3dSCy Schubert 	idx = EPOLL_OP_TABLE_INDEX(ch);
2762b15cb3dSCy Schubert 	op = epoll_op_table[idx].op;
2772b15cb3dSCy Schubert 	events = epoll_op_table[idx].events;
2782b15cb3dSCy Schubert 
2792b15cb3dSCy Schubert 	if (!events) {
2802b15cb3dSCy Schubert 		EVUTIL_ASSERT(op == 0);
2812b15cb3dSCy Schubert 		return 0;
2822b15cb3dSCy Schubert 	}
2832b15cb3dSCy Schubert 
284*a466cc55SCy Schubert 	if ((ch->read_change|ch->write_change|ch->close_change) & EV_CHANGE_ET)
2852b15cb3dSCy Schubert 		events |= EPOLLET;
2862b15cb3dSCy Schubert 
2872b15cb3dSCy Schubert 	memset(&epev, 0, sizeof(epev));
2882b15cb3dSCy Schubert 	epev.data.fd = ch->fd;
2892b15cb3dSCy Schubert 	epev.events = events;
2902b15cb3dSCy Schubert 	if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {
291*a466cc55SCy Schubert 		event_debug((PRINT_CHANGES(op, epev.events, ch, "okay")));
2922b15cb3dSCy Schubert 		return 0;
2932b15cb3dSCy Schubert 	}
2942b15cb3dSCy Schubert 
2952b15cb3dSCy Schubert 	switch (op) {
2962b15cb3dSCy Schubert 	case EPOLL_CTL_MOD:
2972b15cb3dSCy Schubert 		if (errno == ENOENT) {
2982b15cb3dSCy Schubert 			/* If a MOD operation fails with ENOENT, the
2992b15cb3dSCy Schubert 			 * fd was probably closed and re-opened.  We
3002b15cb3dSCy Schubert 			 * should retry the operation as an ADD.
3012b15cb3dSCy Schubert 			 */
3022b15cb3dSCy Schubert 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
3032b15cb3dSCy Schubert 				event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
3042b15cb3dSCy Schubert 				    (int)epev.events, ch->fd);
3052b15cb3dSCy Schubert 				return -1;
3062b15cb3dSCy Schubert 			} else {
3072b15cb3dSCy Schubert 				event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
3082b15cb3dSCy Schubert 					(int)epev.events,
3092b15cb3dSCy Schubert 					ch->fd));
3102b15cb3dSCy Schubert 				return 0;
3112b15cb3dSCy Schubert 			}
3122b15cb3dSCy Schubert 		}
3132b15cb3dSCy Schubert 		break;
3142b15cb3dSCy Schubert 	case EPOLL_CTL_ADD:
3152b15cb3dSCy Schubert 		if (errno == EEXIST) {
3162b15cb3dSCy Schubert 			/* If an ADD operation fails with EEXIST,
3172b15cb3dSCy Schubert 			 * either the operation was redundant (as with a
3182b15cb3dSCy Schubert 			 * precautionary add), or we ran into a fun
3192b15cb3dSCy Schubert 			 * kernel bug where using dup*() to duplicate the
3202b15cb3dSCy Schubert 			 * same file into the same fd gives you the same epitem
3212b15cb3dSCy Schubert 			 * rather than a fresh one.  For the second case,
3222b15cb3dSCy Schubert 			 * we must retry with MOD. */
3232b15cb3dSCy Schubert 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
3242b15cb3dSCy Schubert 				event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
3252b15cb3dSCy Schubert 				    (int)epev.events, ch->fd);
3262b15cb3dSCy Schubert 				return -1;
3272b15cb3dSCy Schubert 			} else {
3282b15cb3dSCy Schubert 				event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
3292b15cb3dSCy Schubert 					(int)epev.events,
3302b15cb3dSCy Schubert 					ch->fd));
3312b15cb3dSCy Schubert 				return 0;
3322b15cb3dSCy Schubert 			}
3332b15cb3dSCy Schubert 		}
3342b15cb3dSCy Schubert 		break;
3352b15cb3dSCy Schubert 	case EPOLL_CTL_DEL:
3362b15cb3dSCy Schubert 		if (errno == ENOENT || errno == EBADF || errno == EPERM) {
3372b15cb3dSCy Schubert 			/* If a delete fails with one of these errors,
3382b15cb3dSCy Schubert 			 * that's fine too: we closed the fd before we
3392b15cb3dSCy Schubert 			 * got around to calling epoll_dispatch. */
3402b15cb3dSCy Schubert 			event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
3412b15cb3dSCy Schubert 				(int)epev.events,
3422b15cb3dSCy Schubert 				ch->fd,
3432b15cb3dSCy Schubert 				strerror(errno)));
3442b15cb3dSCy Schubert 			return 0;
3452b15cb3dSCy Schubert 		}
3462b15cb3dSCy Schubert 		break;
3472b15cb3dSCy Schubert 	default:
3482b15cb3dSCy Schubert 		break;
3492b15cb3dSCy Schubert 	}
3502b15cb3dSCy Schubert 
351*a466cc55SCy Schubert 	event_warn(PRINT_CHANGES(op, epev.events, ch, "failed"));
3522b15cb3dSCy Schubert 	return -1;
3532b15cb3dSCy Schubert }
3542b15cb3dSCy Schubert 
3552b15cb3dSCy Schubert static int
epoll_apply_changes(struct event_base * base)3562b15cb3dSCy Schubert epoll_apply_changes(struct event_base *base)
3572b15cb3dSCy Schubert {
3582b15cb3dSCy Schubert 	struct event_changelist *changelist = &base->changelist;
3592b15cb3dSCy Schubert 	struct epollop *epollop = base->evbase;
3602b15cb3dSCy Schubert 	struct event_change *ch;
3612b15cb3dSCy Schubert 
3622b15cb3dSCy Schubert 	int r = 0;
3632b15cb3dSCy Schubert 	int i;
3642b15cb3dSCy Schubert 
3652b15cb3dSCy Schubert 	for (i = 0; i < changelist->n_changes; ++i) {
3662b15cb3dSCy Schubert 		ch = &changelist->changes[i];
3672b15cb3dSCy Schubert 		if (epoll_apply_one_change(base, epollop, ch) < 0)
3682b15cb3dSCy Schubert 			r = -1;
3692b15cb3dSCy Schubert 	}
3702b15cb3dSCy Schubert 
3712b15cb3dSCy Schubert 	return (r);
3722b15cb3dSCy Schubert }
3732b15cb3dSCy Schubert 
3742b15cb3dSCy Schubert static int
epoll_nochangelist_add(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)3752b15cb3dSCy Schubert epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
3762b15cb3dSCy Schubert     short old, short events, void *p)
3772b15cb3dSCy Schubert {
3782b15cb3dSCy Schubert 	struct event_change ch;
3792b15cb3dSCy Schubert 	ch.fd = fd;
3802b15cb3dSCy Schubert 	ch.old_events = old;
3812b15cb3dSCy Schubert 	ch.read_change = ch.write_change = ch.close_change = 0;
3822b15cb3dSCy Schubert 	if (events & EV_WRITE)
3832b15cb3dSCy Schubert 		ch.write_change = EV_CHANGE_ADD |
3842b15cb3dSCy Schubert 		    (events & EV_ET);
3852b15cb3dSCy Schubert 	if (events & EV_READ)
3862b15cb3dSCy Schubert 		ch.read_change = EV_CHANGE_ADD |
3872b15cb3dSCy Schubert 		    (events & EV_ET);
3882b15cb3dSCy Schubert 	if (events & EV_CLOSED)
3892b15cb3dSCy Schubert 		ch.close_change = EV_CHANGE_ADD |
3902b15cb3dSCy Schubert 		    (events & EV_ET);
3912b15cb3dSCy Schubert 
3922b15cb3dSCy Schubert 	return epoll_apply_one_change(base, base->evbase, &ch);
3932b15cb3dSCy Schubert }
3942b15cb3dSCy Schubert 
3952b15cb3dSCy Schubert static int
epoll_nochangelist_del(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)3962b15cb3dSCy Schubert epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
3972b15cb3dSCy Schubert     short old, short events, void *p)
3982b15cb3dSCy Schubert {
3992b15cb3dSCy Schubert 	struct event_change ch;
4002b15cb3dSCy Schubert 	ch.fd = fd;
4012b15cb3dSCy Schubert 	ch.old_events = old;
4022b15cb3dSCy Schubert 	ch.read_change = ch.write_change = ch.close_change = 0;
4032b15cb3dSCy Schubert 	if (events & EV_WRITE)
404*a466cc55SCy Schubert 		ch.write_change = EV_CHANGE_DEL |
405*a466cc55SCy Schubert 		    (events & EV_ET);
4062b15cb3dSCy Schubert 	if (events & EV_READ)
407*a466cc55SCy Schubert 		ch.read_change = EV_CHANGE_DEL |
408*a466cc55SCy Schubert 		    (events & EV_ET);
4092b15cb3dSCy Schubert 	if (events & EV_CLOSED)
410*a466cc55SCy Schubert 		ch.close_change = EV_CHANGE_DEL |
411*a466cc55SCy Schubert 		    (events & EV_ET);
4122b15cb3dSCy Schubert 
4132b15cb3dSCy Schubert 	return epoll_apply_one_change(base, base->evbase, &ch);
4142b15cb3dSCy Schubert }
4152b15cb3dSCy Schubert 
4162b15cb3dSCy Schubert static int
epoll_dispatch(struct event_base * base,struct timeval * tv)4172b15cb3dSCy Schubert epoll_dispatch(struct event_base *base, struct timeval *tv)
4182b15cb3dSCy Schubert {
4192b15cb3dSCy Schubert 	struct epollop *epollop = base->evbase;
4202b15cb3dSCy Schubert 	struct epoll_event *events = epollop->events;
4212b15cb3dSCy Schubert 	int i, res;
4222b15cb3dSCy Schubert 	long timeout = -1;
4232b15cb3dSCy Schubert 
4242b15cb3dSCy Schubert #ifdef USING_TIMERFD
4252b15cb3dSCy Schubert 	if (epollop->timerfd >= 0) {
4262b15cb3dSCy Schubert 		struct itimerspec is;
4272b15cb3dSCy Schubert 		is.it_interval.tv_sec = 0;
4282b15cb3dSCy Schubert 		is.it_interval.tv_nsec = 0;
4292b15cb3dSCy Schubert 		if (tv == NULL) {
4302b15cb3dSCy Schubert 			/* No timeout; disarm the timer. */
4312b15cb3dSCy Schubert 			is.it_value.tv_sec = 0;
4322b15cb3dSCy Schubert 			is.it_value.tv_nsec = 0;
4332b15cb3dSCy Schubert 		} else {
4342b15cb3dSCy Schubert 			if (tv->tv_sec == 0 && tv->tv_usec == 0) {
4352b15cb3dSCy Schubert 				/* we need to exit immediately; timerfd can't
4362b15cb3dSCy Schubert 				 * do that. */
4372b15cb3dSCy Schubert 				timeout = 0;
4382b15cb3dSCy Schubert 			}
4392b15cb3dSCy Schubert 			is.it_value.tv_sec = tv->tv_sec;
4402b15cb3dSCy Schubert 			is.it_value.tv_nsec = tv->tv_usec * 1000;
4412b15cb3dSCy Schubert 		}
4422b15cb3dSCy Schubert 		/* TODO: we could avoid unnecessary syscalls here by only
4432b15cb3dSCy Schubert 		   calling timerfd_settime when the top timeout changes, or
4442b15cb3dSCy Schubert 		   when we're called with a different timeval.
4452b15cb3dSCy Schubert 		*/
4462b15cb3dSCy Schubert 		if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
4472b15cb3dSCy Schubert 			event_warn("timerfd_settime");
4482b15cb3dSCy Schubert 		}
4492b15cb3dSCy Schubert 	} else
4502b15cb3dSCy Schubert #endif
4512b15cb3dSCy Schubert 	if (tv != NULL) {
4522b15cb3dSCy Schubert 		timeout = evutil_tv_to_msec_(tv);
4532b15cb3dSCy Schubert 		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
4542b15cb3dSCy Schubert 			/* Linux kernels can wait forever if the timeout is
4552b15cb3dSCy Schubert 			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
4562b15cb3dSCy Schubert 			timeout = MAX_EPOLL_TIMEOUT_MSEC;
4572b15cb3dSCy Schubert 		}
4582b15cb3dSCy Schubert 	}
4592b15cb3dSCy Schubert 
4602b15cb3dSCy Schubert 	epoll_apply_changes(base);
4612b15cb3dSCy Schubert 	event_changelist_remove_all_(&base->changelist, base);
4622b15cb3dSCy Schubert 
4632b15cb3dSCy Schubert 	EVBASE_RELEASE_LOCK(base, th_base_lock);
4642b15cb3dSCy Schubert 
4652b15cb3dSCy Schubert 	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
4662b15cb3dSCy Schubert 
4672b15cb3dSCy Schubert 	EVBASE_ACQUIRE_LOCK(base, th_base_lock);
4682b15cb3dSCy Schubert 
4692b15cb3dSCy Schubert 	if (res == -1) {
4702b15cb3dSCy Schubert 		if (errno != EINTR) {
4712b15cb3dSCy Schubert 			event_warn("epoll_wait");
4722b15cb3dSCy Schubert 			return (-1);
4732b15cb3dSCy Schubert 		}
4742b15cb3dSCy Schubert 
4752b15cb3dSCy Schubert 		return (0);
4762b15cb3dSCy Schubert 	}
4772b15cb3dSCy Schubert 
4782b15cb3dSCy Schubert 	event_debug(("%s: epoll_wait reports %d", __func__, res));
4792b15cb3dSCy Schubert 	EVUTIL_ASSERT(res <= epollop->nevents);
4802b15cb3dSCy Schubert 
4812b15cb3dSCy Schubert 	for (i = 0; i < res; i++) {
4822b15cb3dSCy Schubert 		int what = events[i].events;
4832b15cb3dSCy Schubert 		short ev = 0;
4842b15cb3dSCy Schubert #ifdef USING_TIMERFD
4852b15cb3dSCy Schubert 		if (events[i].data.fd == epollop->timerfd)
4862b15cb3dSCy Schubert 			continue;
4872b15cb3dSCy Schubert #endif
4882b15cb3dSCy Schubert 
489*a466cc55SCy Schubert 		if (what & EPOLLERR) {
490*a466cc55SCy Schubert 			ev = EV_READ | EV_WRITE;
491*a466cc55SCy Schubert 		} else if ((what & EPOLLHUP) && !(what & EPOLLRDHUP)) {
4922b15cb3dSCy Schubert 			ev = EV_READ | EV_WRITE;
4932b15cb3dSCy Schubert 		} else {
4942b15cb3dSCy Schubert 			if (what & EPOLLIN)
4952b15cb3dSCy Schubert 				ev |= EV_READ;
4962b15cb3dSCy Schubert 			if (what & EPOLLOUT)
4972b15cb3dSCy Schubert 				ev |= EV_WRITE;
4982b15cb3dSCy Schubert 			if (what & EPOLLRDHUP)
4992b15cb3dSCy Schubert 				ev |= EV_CLOSED;
5002b15cb3dSCy Schubert 		}
5012b15cb3dSCy Schubert 
5022b15cb3dSCy Schubert 		if (!ev)
5032b15cb3dSCy Schubert 			continue;
5042b15cb3dSCy Schubert 
5052b15cb3dSCy Schubert 		evmap_io_active_(base, events[i].data.fd, ev | EV_ET);
5062b15cb3dSCy Schubert 	}
5072b15cb3dSCy Schubert 
5082b15cb3dSCy Schubert 	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
5092b15cb3dSCy Schubert 		/* We used all of the event space this time.  We should
5102b15cb3dSCy Schubert 		   be ready for more events next time. */
5112b15cb3dSCy Schubert 		int new_nevents = epollop->nevents * 2;
5122b15cb3dSCy Schubert 		struct epoll_event *new_events;
5132b15cb3dSCy Schubert 
5142b15cb3dSCy Schubert 		new_events = mm_realloc(epollop->events,
5152b15cb3dSCy Schubert 		    new_nevents * sizeof(struct epoll_event));
5162b15cb3dSCy Schubert 		if (new_events) {
5172b15cb3dSCy Schubert 			epollop->events = new_events;
5182b15cb3dSCy Schubert 			epollop->nevents = new_nevents;
5192b15cb3dSCy Schubert 		}
5202b15cb3dSCy Schubert 	}
5212b15cb3dSCy Schubert 
5222b15cb3dSCy Schubert 	return (0);
5232b15cb3dSCy Schubert }
5242b15cb3dSCy Schubert 
5252b15cb3dSCy Schubert 
5262b15cb3dSCy Schubert static void
epoll_dealloc(struct event_base * base)5272b15cb3dSCy Schubert epoll_dealloc(struct event_base *base)
5282b15cb3dSCy Schubert {
5292b15cb3dSCy Schubert 	struct epollop *epollop = base->evbase;
5302b15cb3dSCy Schubert 
5312b15cb3dSCy Schubert 	evsig_dealloc_(base);
5322b15cb3dSCy Schubert 	if (epollop->events)
5332b15cb3dSCy Schubert 		mm_free(epollop->events);
5342b15cb3dSCy Schubert 	if (epollop->epfd >= 0)
5352b15cb3dSCy Schubert 		close(epollop->epfd);
5362b15cb3dSCy Schubert #ifdef USING_TIMERFD
5372b15cb3dSCy Schubert 	if (epollop->timerfd >= 0)
5382b15cb3dSCy Schubert 		close(epollop->timerfd);
5392b15cb3dSCy Schubert #endif
5402b15cb3dSCy Schubert 
5412b15cb3dSCy Schubert 	memset(epollop, 0, sizeof(struct epollop));
5422b15cb3dSCy Schubert 	mm_free(epollop);
5432b15cb3dSCy Schubert }
5442b15cb3dSCy Schubert 
5452b15cb3dSCy Schubert #endif /* EVENT__HAVE_EPOLL */
546