/*
 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
 * Copyright 2007-2012 Niels Provos, Nick Mathewson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
272b15cb3dSCy Schubert #include "event2/event-config.h"
282b15cb3dSCy Schubert #include "evconfig-private.h"
292b15cb3dSCy Schubert
302b15cb3dSCy Schubert #ifdef EVENT__HAVE_EPOLL
312b15cb3dSCy Schubert
322b15cb3dSCy Schubert #include <stdint.h>
332b15cb3dSCy Schubert #include <sys/types.h>
342b15cb3dSCy Schubert #include <sys/resource.h>
352b15cb3dSCy Schubert #ifdef EVENT__HAVE_SYS_TIME_H
362b15cb3dSCy Schubert #include <sys/time.h>
372b15cb3dSCy Schubert #endif
382b15cb3dSCy Schubert #include <sys/queue.h>
392b15cb3dSCy Schubert #include <sys/epoll.h>
402b15cb3dSCy Schubert #include <signal.h>
412b15cb3dSCy Schubert #include <limits.h>
422b15cb3dSCy Schubert #include <stdio.h>
432b15cb3dSCy Schubert #include <stdlib.h>
442b15cb3dSCy Schubert #include <string.h>
452b15cb3dSCy Schubert #include <unistd.h>
462b15cb3dSCy Schubert #include <errno.h>
472b15cb3dSCy Schubert #ifdef EVENT__HAVE_FCNTL_H
482b15cb3dSCy Schubert #include <fcntl.h>
492b15cb3dSCy Schubert #endif
502b15cb3dSCy Schubert #ifdef EVENT__HAVE_SYS_TIMERFD_H
512b15cb3dSCy Schubert #include <sys/timerfd.h>
522b15cb3dSCy Schubert #endif
532b15cb3dSCy Schubert
542b15cb3dSCy Schubert #include "event-internal.h"
552b15cb3dSCy Schubert #include "evsignal-internal.h"
562b15cb3dSCy Schubert #include "event2/thread.h"
572b15cb3dSCy Schubert #include "evthread-internal.h"
582b15cb3dSCy Schubert #include "log-internal.h"
592b15cb3dSCy Schubert #include "evmap-internal.h"
602b15cb3dSCy Schubert #include "changelist-internal.h"
612b15cb3dSCy Schubert #include "time-internal.h"
622b15cb3dSCy Schubert
/* Since Linux 2.6.17, epoll is able to report about peer half-closed connection
   using special EPOLLRDHUP flag on a read event.

   When the system headers do not provide EPOLLRDHUP, define it to 0 so that
   OR-ing it into or testing it against an event mask is a no-op, and make
   EARLY_CLOSE_IF_HAVE_RDHUP expand to 0 instead of EV_FEATURE_EARLY_CLOSE.
*/
#if !defined(EPOLLRDHUP)
#define EPOLLRDHUP 0
#define EARLY_CLOSE_IF_HAVE_RDHUP 0
#else
#define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE
#endif
722b15cb3dSCy Schubert
732b15cb3dSCy Schubert #include "epolltable-internal.h"
742b15cb3dSCy Schubert
#if defined(EVENT__HAVE_SYS_TIMERFD_H) && \
	defined(EVENT__HAVE_TIMERFD_CREATE) && \
	defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \
	defined(TFD_CLOEXEC)
/* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available
   and working.  This means that we can't support it on 2.6.25 (where timerfd
   was introduced) or 2.6.26, since 2.6.27 introduced those flags.
 */
#define USING_TIMERFD
#endif
852b15cb3dSCy Schubert
/* Per-event_base state of the epoll backend. */
struct epollop {
	/* Buffer handed to epoll_wait() to receive ready events. */
	struct epoll_event *events;
	/* Number of entries allocated in 'events'. */
	int nevents;
	/* File descriptor of the epoll instance. */
	int epfd;
#ifdef USING_TIMERFD
	/* timerfd registered with 'epfd' for timeouts, or -1 when unused. */
	int timerfd;
#endif
};
942b15cb3dSCy Schubert
/* Backend entry points shared by both eventop tables below. */
static void *epoll_init(struct event_base *base);
static int epoll_dispatch(struct event_base *base, struct timeval *tv);
static void epoll_dealloc(struct event_base *base);
982b15cb3dSCy Schubert
992b15cb3dSCy Schubert static const struct eventop epollops_changelist = {
1002b15cb3dSCy Schubert "epoll (with changelist)",
1012b15cb3dSCy Schubert epoll_init,
1022b15cb3dSCy Schubert event_changelist_add_,
1032b15cb3dSCy Schubert event_changelist_del_,
1042b15cb3dSCy Schubert epoll_dispatch,
1052b15cb3dSCy Schubert epoll_dealloc,
1062b15cb3dSCy Schubert 1, /* need reinit */
1072b15cb3dSCy Schubert EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP,
1082b15cb3dSCy Schubert EVENT_CHANGELIST_FDINFO_SIZE
1092b15cb3dSCy Schubert };
1102b15cb3dSCy Schubert
1112b15cb3dSCy Schubert
1122b15cb3dSCy Schubert static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
1132b15cb3dSCy Schubert short old, short events, void *p);
1142b15cb3dSCy Schubert static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
1152b15cb3dSCy Schubert short old, short events, void *p);
1162b15cb3dSCy Schubert
1172b15cb3dSCy Schubert const struct eventop epollops = {
1182b15cb3dSCy Schubert "epoll",
1192b15cb3dSCy Schubert epoll_init,
1202b15cb3dSCy Schubert epoll_nochangelist_add,
1212b15cb3dSCy Schubert epoll_nochangelist_del,
1222b15cb3dSCy Schubert epoll_dispatch,
1232b15cb3dSCy Schubert epoll_dealloc,
1242b15cb3dSCy Schubert 1, /* need reinit */
1252b15cb3dSCy Schubert EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE,
1262b15cb3dSCy Schubert 0
1272b15cb3dSCy Schubert };
1282b15cb3dSCy Schubert
/* Initial size of the epoll_wait() result buffer, and the size at which
 * epoll_dispatch() stops doubling it. */
#define INITIAL_NEVENT 32
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
1392b15cb3dSCy Schubert
1402b15cb3dSCy Schubert static void *
epoll_init(struct event_base * base)1412b15cb3dSCy Schubert epoll_init(struct event_base *base)
1422b15cb3dSCy Schubert {
1432b15cb3dSCy Schubert int epfd = -1;
1442b15cb3dSCy Schubert struct epollop *epollop;
1452b15cb3dSCy Schubert
1462b15cb3dSCy Schubert #ifdef EVENT__HAVE_EPOLL_CREATE1
1472b15cb3dSCy Schubert /* First, try the shiny new epoll_create1 interface, if we have it. */
1482b15cb3dSCy Schubert epfd = epoll_create1(EPOLL_CLOEXEC);
1492b15cb3dSCy Schubert #endif
1502b15cb3dSCy Schubert if (epfd == -1) {
1512b15cb3dSCy Schubert /* Initialize the kernel queue using the old interface. (The
1522b15cb3dSCy Schubert size field is ignored since 2.6.8.) */
1532b15cb3dSCy Schubert if ((epfd = epoll_create(32000)) == -1) {
1542b15cb3dSCy Schubert if (errno != ENOSYS)
1552b15cb3dSCy Schubert event_warn("epoll_create");
1562b15cb3dSCy Schubert return (NULL);
1572b15cb3dSCy Schubert }
1582b15cb3dSCy Schubert evutil_make_socket_closeonexec(epfd);
1592b15cb3dSCy Schubert }
1602b15cb3dSCy Schubert
1612b15cb3dSCy Schubert if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
1622b15cb3dSCy Schubert close(epfd);
1632b15cb3dSCy Schubert return (NULL);
1642b15cb3dSCy Schubert }
1652b15cb3dSCy Schubert
1662b15cb3dSCy Schubert epollop->epfd = epfd;
1672b15cb3dSCy Schubert
1682b15cb3dSCy Schubert /* Initialize fields */
1692b15cb3dSCy Schubert epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
1702b15cb3dSCy Schubert if (epollop->events == NULL) {
1712b15cb3dSCy Schubert mm_free(epollop);
1722b15cb3dSCy Schubert close(epfd);
1732b15cb3dSCy Schubert return (NULL);
1742b15cb3dSCy Schubert }
1752b15cb3dSCy Schubert epollop->nevents = INITIAL_NEVENT;
1762b15cb3dSCy Schubert
1772b15cb3dSCy Schubert if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
1782b15cb3dSCy Schubert ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
1792b15cb3dSCy Schubert evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {
1802b15cb3dSCy Schubert
1812b15cb3dSCy Schubert base->evsel = &epollops_changelist;
1822b15cb3dSCy Schubert }
1832b15cb3dSCy Schubert
1842b15cb3dSCy Schubert #ifdef USING_TIMERFD
1852b15cb3dSCy Schubert /*
1862b15cb3dSCy Schubert The epoll interface ordinarily gives us one-millisecond precision,
1872b15cb3dSCy Schubert so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
1882b15cb3dSCy Schubert timer. But when the user has set the new PRECISE_TIMER flag for an
1892b15cb3dSCy Schubert event_base, we can try to use timerfd to give them finer granularity.
1902b15cb3dSCy Schubert */
1912b15cb3dSCy Schubert if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
1922b15cb3dSCy Schubert base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
1932b15cb3dSCy Schubert int fd;
1942b15cb3dSCy Schubert fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
1952b15cb3dSCy Schubert if (epollop->timerfd >= 0) {
1962b15cb3dSCy Schubert struct epoll_event epev;
1972b15cb3dSCy Schubert memset(&epev, 0, sizeof(epev));
1982b15cb3dSCy Schubert epev.data.fd = epollop->timerfd;
1992b15cb3dSCy Schubert epev.events = EPOLLIN;
2002b15cb3dSCy Schubert if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
2012b15cb3dSCy Schubert event_warn("epoll_ctl(timerfd)");
2022b15cb3dSCy Schubert close(fd);
2032b15cb3dSCy Schubert epollop->timerfd = -1;
2042b15cb3dSCy Schubert }
2052b15cb3dSCy Schubert } else {
2062b15cb3dSCy Schubert if (errno != EINVAL && errno != ENOSYS) {
2072b15cb3dSCy Schubert /* These errors probably mean that we were
2082b15cb3dSCy Schubert * compiled with timerfd/TFD_* support, but
2092b15cb3dSCy Schubert * we're running on a kernel that lacks those.
2102b15cb3dSCy Schubert */
2112b15cb3dSCy Schubert event_warn("timerfd_create");
2122b15cb3dSCy Schubert }
2132b15cb3dSCy Schubert epollop->timerfd = -1;
2142b15cb3dSCy Schubert }
2152b15cb3dSCy Schubert } else {
2162b15cb3dSCy Schubert epollop->timerfd = -1;
2172b15cb3dSCy Schubert }
2182b15cb3dSCy Schubert #endif
2192b15cb3dSCy Schubert
2202b15cb3dSCy Schubert evsig_init_(base);
2212b15cb3dSCy Schubert
2222b15cb3dSCy Schubert return (epollop);
2232b15cb3dSCy Schubert }
2242b15cb3dSCy Schubert
2252b15cb3dSCy Schubert static const char *
change_to_string(int change)2262b15cb3dSCy Schubert change_to_string(int change)
2272b15cb3dSCy Schubert {
2282b15cb3dSCy Schubert change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
2292b15cb3dSCy Schubert if (change == EV_CHANGE_ADD) {
2302b15cb3dSCy Schubert return "add";
2312b15cb3dSCy Schubert } else if (change == EV_CHANGE_DEL) {
2322b15cb3dSCy Schubert return "del";
2332b15cb3dSCy Schubert } else if (change == 0) {
2342b15cb3dSCy Schubert return "none";
2352b15cb3dSCy Schubert } else {
2362b15cb3dSCy Schubert return "???";
2372b15cb3dSCy Schubert }
2382b15cb3dSCy Schubert }
2392b15cb3dSCy Schubert
/*
 * Return the name of an epoll_ctl() operation ("ADD", "DEL" or "MOD") for
 * log messages, or "???" for any other value.
 */
static const char *
epoll_op_to_string(int op)
{
	return op == EPOLL_CTL_ADD?"ADD":
	    op == EPOLL_CTL_DEL?"DEL":
	    op == EPOLL_CTL_MOD?"MOD":
	    "???";
}
2482b15cb3dSCy Schubert
/*
 * Expands to a printf-style format string followed by its arguments,
 * describing one epoll_ctl() attempt, for use inside a logging call.
 *
 *   op      epoll_ctl() operation that was attempted
 *   events  event mask that was passed; converted to int for "%d"
 *   ch      pointer to the change being applied
 *   status  string literal spliced into the format (e.g. "okay")
 */
#define PRINT_CHANGES(op, events, ch, status)  \
	"Epoll %s(%d) on fd %d " status ". "       \
	"Old events were %d; "                     \
	"read change was %d (%s); "                \
	"write change was %d (%s); "               \
	"close change was %d (%s)",                \
	epoll_op_to_string(op),                    \
	(int)(events),                             \
	(ch)->fd,                                  \
	(ch)->old_events,                          \
	(ch)->read_change,                         \
	change_to_string((ch)->read_change),       \
	(ch)->write_change,                        \
	change_to_string((ch)->write_change),      \
	(ch)->close_change,                        \
	change_to_string((ch)->close_change)
265*a466cc55SCy Schubert
2662b15cb3dSCy Schubert static int
epoll_apply_one_change(struct event_base * base,struct epollop * epollop,const struct event_change * ch)2672b15cb3dSCy Schubert epoll_apply_one_change(struct event_base *base,
2682b15cb3dSCy Schubert struct epollop *epollop,
2692b15cb3dSCy Schubert const struct event_change *ch)
2702b15cb3dSCy Schubert {
2712b15cb3dSCy Schubert struct epoll_event epev;
2722b15cb3dSCy Schubert int op, events = 0;
2732b15cb3dSCy Schubert int idx;
2742b15cb3dSCy Schubert
2752b15cb3dSCy Schubert idx = EPOLL_OP_TABLE_INDEX(ch);
2762b15cb3dSCy Schubert op = epoll_op_table[idx].op;
2772b15cb3dSCy Schubert events = epoll_op_table[idx].events;
2782b15cb3dSCy Schubert
2792b15cb3dSCy Schubert if (!events) {
2802b15cb3dSCy Schubert EVUTIL_ASSERT(op == 0);
2812b15cb3dSCy Schubert return 0;
2822b15cb3dSCy Schubert }
2832b15cb3dSCy Schubert
284*a466cc55SCy Schubert if ((ch->read_change|ch->write_change|ch->close_change) & EV_CHANGE_ET)
2852b15cb3dSCy Schubert events |= EPOLLET;
2862b15cb3dSCy Schubert
2872b15cb3dSCy Schubert memset(&epev, 0, sizeof(epev));
2882b15cb3dSCy Schubert epev.data.fd = ch->fd;
2892b15cb3dSCy Schubert epev.events = events;
2902b15cb3dSCy Schubert if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {
291*a466cc55SCy Schubert event_debug((PRINT_CHANGES(op, epev.events, ch, "okay")));
2922b15cb3dSCy Schubert return 0;
2932b15cb3dSCy Schubert }
2942b15cb3dSCy Schubert
2952b15cb3dSCy Schubert switch (op) {
2962b15cb3dSCy Schubert case EPOLL_CTL_MOD:
2972b15cb3dSCy Schubert if (errno == ENOENT) {
2982b15cb3dSCy Schubert /* If a MOD operation fails with ENOENT, the
2992b15cb3dSCy Schubert * fd was probably closed and re-opened. We
3002b15cb3dSCy Schubert * should retry the operation as an ADD.
3012b15cb3dSCy Schubert */
3022b15cb3dSCy Schubert if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
3032b15cb3dSCy Schubert event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
3042b15cb3dSCy Schubert (int)epev.events, ch->fd);
3052b15cb3dSCy Schubert return -1;
3062b15cb3dSCy Schubert } else {
3072b15cb3dSCy Schubert event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
3082b15cb3dSCy Schubert (int)epev.events,
3092b15cb3dSCy Schubert ch->fd));
3102b15cb3dSCy Schubert return 0;
3112b15cb3dSCy Schubert }
3122b15cb3dSCy Schubert }
3132b15cb3dSCy Schubert break;
3142b15cb3dSCy Schubert case EPOLL_CTL_ADD:
3152b15cb3dSCy Schubert if (errno == EEXIST) {
3162b15cb3dSCy Schubert /* If an ADD operation fails with EEXIST,
3172b15cb3dSCy Schubert * either the operation was redundant (as with a
3182b15cb3dSCy Schubert * precautionary add), or we ran into a fun
3192b15cb3dSCy Schubert * kernel bug where using dup*() to duplicate the
3202b15cb3dSCy Schubert * same file into the same fd gives you the same epitem
3212b15cb3dSCy Schubert * rather than a fresh one. For the second case,
3222b15cb3dSCy Schubert * we must retry with MOD. */
3232b15cb3dSCy Schubert if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
3242b15cb3dSCy Schubert event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
3252b15cb3dSCy Schubert (int)epev.events, ch->fd);
3262b15cb3dSCy Schubert return -1;
3272b15cb3dSCy Schubert } else {
3282b15cb3dSCy Schubert event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
3292b15cb3dSCy Schubert (int)epev.events,
3302b15cb3dSCy Schubert ch->fd));
3312b15cb3dSCy Schubert return 0;
3322b15cb3dSCy Schubert }
3332b15cb3dSCy Schubert }
3342b15cb3dSCy Schubert break;
3352b15cb3dSCy Schubert case EPOLL_CTL_DEL:
3362b15cb3dSCy Schubert if (errno == ENOENT || errno == EBADF || errno == EPERM) {
3372b15cb3dSCy Schubert /* If a delete fails with one of these errors,
3382b15cb3dSCy Schubert * that's fine too: we closed the fd before we
3392b15cb3dSCy Schubert * got around to calling epoll_dispatch. */
3402b15cb3dSCy Schubert event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
3412b15cb3dSCy Schubert (int)epev.events,
3422b15cb3dSCy Schubert ch->fd,
3432b15cb3dSCy Schubert strerror(errno)));
3442b15cb3dSCy Schubert return 0;
3452b15cb3dSCy Schubert }
3462b15cb3dSCy Schubert break;
3472b15cb3dSCy Schubert default:
3482b15cb3dSCy Schubert break;
3492b15cb3dSCy Schubert }
3502b15cb3dSCy Schubert
351*a466cc55SCy Schubert event_warn(PRINT_CHANGES(op, epev.events, ch, "failed"));
3522b15cb3dSCy Schubert return -1;
3532b15cb3dSCy Schubert }
3542b15cb3dSCy Schubert
3552b15cb3dSCy Schubert static int
epoll_apply_changes(struct event_base * base)3562b15cb3dSCy Schubert epoll_apply_changes(struct event_base *base)
3572b15cb3dSCy Schubert {
3582b15cb3dSCy Schubert struct event_changelist *changelist = &base->changelist;
3592b15cb3dSCy Schubert struct epollop *epollop = base->evbase;
3602b15cb3dSCy Schubert struct event_change *ch;
3612b15cb3dSCy Schubert
3622b15cb3dSCy Schubert int r = 0;
3632b15cb3dSCy Schubert int i;
3642b15cb3dSCy Schubert
3652b15cb3dSCy Schubert for (i = 0; i < changelist->n_changes; ++i) {
3662b15cb3dSCy Schubert ch = &changelist->changes[i];
3672b15cb3dSCy Schubert if (epoll_apply_one_change(base, epollop, ch) < 0)
3682b15cb3dSCy Schubert r = -1;
3692b15cb3dSCy Schubert }
3702b15cb3dSCy Schubert
3712b15cb3dSCy Schubert return (r);
3722b15cb3dSCy Schubert }
3732b15cb3dSCy Schubert
3742b15cb3dSCy Schubert static int
epoll_nochangelist_add(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)3752b15cb3dSCy Schubert epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
3762b15cb3dSCy Schubert short old, short events, void *p)
3772b15cb3dSCy Schubert {
3782b15cb3dSCy Schubert struct event_change ch;
3792b15cb3dSCy Schubert ch.fd = fd;
3802b15cb3dSCy Schubert ch.old_events = old;
3812b15cb3dSCy Schubert ch.read_change = ch.write_change = ch.close_change = 0;
3822b15cb3dSCy Schubert if (events & EV_WRITE)
3832b15cb3dSCy Schubert ch.write_change = EV_CHANGE_ADD |
3842b15cb3dSCy Schubert (events & EV_ET);
3852b15cb3dSCy Schubert if (events & EV_READ)
3862b15cb3dSCy Schubert ch.read_change = EV_CHANGE_ADD |
3872b15cb3dSCy Schubert (events & EV_ET);
3882b15cb3dSCy Schubert if (events & EV_CLOSED)
3892b15cb3dSCy Schubert ch.close_change = EV_CHANGE_ADD |
3902b15cb3dSCy Schubert (events & EV_ET);
3912b15cb3dSCy Schubert
3922b15cb3dSCy Schubert return epoll_apply_one_change(base, base->evbase, &ch);
3932b15cb3dSCy Schubert }
3942b15cb3dSCy Schubert
3952b15cb3dSCy Schubert static int
epoll_nochangelist_del(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)3962b15cb3dSCy Schubert epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
3972b15cb3dSCy Schubert short old, short events, void *p)
3982b15cb3dSCy Schubert {
3992b15cb3dSCy Schubert struct event_change ch;
4002b15cb3dSCy Schubert ch.fd = fd;
4012b15cb3dSCy Schubert ch.old_events = old;
4022b15cb3dSCy Schubert ch.read_change = ch.write_change = ch.close_change = 0;
4032b15cb3dSCy Schubert if (events & EV_WRITE)
404*a466cc55SCy Schubert ch.write_change = EV_CHANGE_DEL |
405*a466cc55SCy Schubert (events & EV_ET);
4062b15cb3dSCy Schubert if (events & EV_READ)
407*a466cc55SCy Schubert ch.read_change = EV_CHANGE_DEL |
408*a466cc55SCy Schubert (events & EV_ET);
4092b15cb3dSCy Schubert if (events & EV_CLOSED)
410*a466cc55SCy Schubert ch.close_change = EV_CHANGE_DEL |
411*a466cc55SCy Schubert (events & EV_ET);
4122b15cb3dSCy Schubert
4132b15cb3dSCy Schubert return epoll_apply_one_change(base, base->evbase, &ch);
4142b15cb3dSCy Schubert }
4152b15cb3dSCy Schubert
4162b15cb3dSCy Schubert static int
epoll_dispatch(struct event_base * base,struct timeval * tv)4172b15cb3dSCy Schubert epoll_dispatch(struct event_base *base, struct timeval *tv)
4182b15cb3dSCy Schubert {
4192b15cb3dSCy Schubert struct epollop *epollop = base->evbase;
4202b15cb3dSCy Schubert struct epoll_event *events = epollop->events;
4212b15cb3dSCy Schubert int i, res;
4222b15cb3dSCy Schubert long timeout = -1;
4232b15cb3dSCy Schubert
4242b15cb3dSCy Schubert #ifdef USING_TIMERFD
4252b15cb3dSCy Schubert if (epollop->timerfd >= 0) {
4262b15cb3dSCy Schubert struct itimerspec is;
4272b15cb3dSCy Schubert is.it_interval.tv_sec = 0;
4282b15cb3dSCy Schubert is.it_interval.tv_nsec = 0;
4292b15cb3dSCy Schubert if (tv == NULL) {
4302b15cb3dSCy Schubert /* No timeout; disarm the timer. */
4312b15cb3dSCy Schubert is.it_value.tv_sec = 0;
4322b15cb3dSCy Schubert is.it_value.tv_nsec = 0;
4332b15cb3dSCy Schubert } else {
4342b15cb3dSCy Schubert if (tv->tv_sec == 0 && tv->tv_usec == 0) {
4352b15cb3dSCy Schubert /* we need to exit immediately; timerfd can't
4362b15cb3dSCy Schubert * do that. */
4372b15cb3dSCy Schubert timeout = 0;
4382b15cb3dSCy Schubert }
4392b15cb3dSCy Schubert is.it_value.tv_sec = tv->tv_sec;
4402b15cb3dSCy Schubert is.it_value.tv_nsec = tv->tv_usec * 1000;
4412b15cb3dSCy Schubert }
4422b15cb3dSCy Schubert /* TODO: we could avoid unnecessary syscalls here by only
4432b15cb3dSCy Schubert calling timerfd_settime when the top timeout changes, or
4442b15cb3dSCy Schubert when we're called with a different timeval.
4452b15cb3dSCy Schubert */
4462b15cb3dSCy Schubert if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
4472b15cb3dSCy Schubert event_warn("timerfd_settime");
4482b15cb3dSCy Schubert }
4492b15cb3dSCy Schubert } else
4502b15cb3dSCy Schubert #endif
4512b15cb3dSCy Schubert if (tv != NULL) {
4522b15cb3dSCy Schubert timeout = evutil_tv_to_msec_(tv);
4532b15cb3dSCy Schubert if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
4542b15cb3dSCy Schubert /* Linux kernels can wait forever if the timeout is
4552b15cb3dSCy Schubert * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
4562b15cb3dSCy Schubert timeout = MAX_EPOLL_TIMEOUT_MSEC;
4572b15cb3dSCy Schubert }
4582b15cb3dSCy Schubert }
4592b15cb3dSCy Schubert
4602b15cb3dSCy Schubert epoll_apply_changes(base);
4612b15cb3dSCy Schubert event_changelist_remove_all_(&base->changelist, base);
4622b15cb3dSCy Schubert
4632b15cb3dSCy Schubert EVBASE_RELEASE_LOCK(base, th_base_lock);
4642b15cb3dSCy Schubert
4652b15cb3dSCy Schubert res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
4662b15cb3dSCy Schubert
4672b15cb3dSCy Schubert EVBASE_ACQUIRE_LOCK(base, th_base_lock);
4682b15cb3dSCy Schubert
4692b15cb3dSCy Schubert if (res == -1) {
4702b15cb3dSCy Schubert if (errno != EINTR) {
4712b15cb3dSCy Schubert event_warn("epoll_wait");
4722b15cb3dSCy Schubert return (-1);
4732b15cb3dSCy Schubert }
4742b15cb3dSCy Schubert
4752b15cb3dSCy Schubert return (0);
4762b15cb3dSCy Schubert }
4772b15cb3dSCy Schubert
4782b15cb3dSCy Schubert event_debug(("%s: epoll_wait reports %d", __func__, res));
4792b15cb3dSCy Schubert EVUTIL_ASSERT(res <= epollop->nevents);
4802b15cb3dSCy Schubert
4812b15cb3dSCy Schubert for (i = 0; i < res; i++) {
4822b15cb3dSCy Schubert int what = events[i].events;
4832b15cb3dSCy Schubert short ev = 0;
4842b15cb3dSCy Schubert #ifdef USING_TIMERFD
4852b15cb3dSCy Schubert if (events[i].data.fd == epollop->timerfd)
4862b15cb3dSCy Schubert continue;
4872b15cb3dSCy Schubert #endif
4882b15cb3dSCy Schubert
489*a466cc55SCy Schubert if (what & EPOLLERR) {
490*a466cc55SCy Schubert ev = EV_READ | EV_WRITE;
491*a466cc55SCy Schubert } else if ((what & EPOLLHUP) && !(what & EPOLLRDHUP)) {
4922b15cb3dSCy Schubert ev = EV_READ | EV_WRITE;
4932b15cb3dSCy Schubert } else {
4942b15cb3dSCy Schubert if (what & EPOLLIN)
4952b15cb3dSCy Schubert ev |= EV_READ;
4962b15cb3dSCy Schubert if (what & EPOLLOUT)
4972b15cb3dSCy Schubert ev |= EV_WRITE;
4982b15cb3dSCy Schubert if (what & EPOLLRDHUP)
4992b15cb3dSCy Schubert ev |= EV_CLOSED;
5002b15cb3dSCy Schubert }
5012b15cb3dSCy Schubert
5022b15cb3dSCy Schubert if (!ev)
5032b15cb3dSCy Schubert continue;
5042b15cb3dSCy Schubert
5052b15cb3dSCy Schubert evmap_io_active_(base, events[i].data.fd, ev | EV_ET);
5062b15cb3dSCy Schubert }
5072b15cb3dSCy Schubert
5082b15cb3dSCy Schubert if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
5092b15cb3dSCy Schubert /* We used all of the event space this time. We should
5102b15cb3dSCy Schubert be ready for more events next time. */
5112b15cb3dSCy Schubert int new_nevents = epollop->nevents * 2;
5122b15cb3dSCy Schubert struct epoll_event *new_events;
5132b15cb3dSCy Schubert
5142b15cb3dSCy Schubert new_events = mm_realloc(epollop->events,
5152b15cb3dSCy Schubert new_nevents * sizeof(struct epoll_event));
5162b15cb3dSCy Schubert if (new_events) {
5172b15cb3dSCy Schubert epollop->events = new_events;
5182b15cb3dSCy Schubert epollop->nevents = new_nevents;
5192b15cb3dSCy Schubert }
5202b15cb3dSCy Schubert }
5212b15cb3dSCy Schubert
5222b15cb3dSCy Schubert return (0);
5232b15cb3dSCy Schubert }
5242b15cb3dSCy Schubert
5252b15cb3dSCy Schubert
5262b15cb3dSCy Schubert static void
epoll_dealloc(struct event_base * base)5272b15cb3dSCy Schubert epoll_dealloc(struct event_base *base)
5282b15cb3dSCy Schubert {
5292b15cb3dSCy Schubert struct epollop *epollop = base->evbase;
5302b15cb3dSCy Schubert
5312b15cb3dSCy Schubert evsig_dealloc_(base);
5322b15cb3dSCy Schubert if (epollop->events)
5332b15cb3dSCy Schubert mm_free(epollop->events);
5342b15cb3dSCy Schubert if (epollop->epfd >= 0)
5352b15cb3dSCy Schubert close(epollop->epfd);
5362b15cb3dSCy Schubert #ifdef USING_TIMERFD
5372b15cb3dSCy Schubert if (epollop->timerfd >= 0)
5382b15cb3dSCy Schubert close(epollop->timerfd);
5392b15cb3dSCy Schubert #endif
5402b15cb3dSCy Schubert
5412b15cb3dSCy Schubert memset(epollop, 0, sizeof(struct epollop));
5422b15cb3dSCy Schubert mm_free(epollop);
5432b15cb3dSCy Schubert }
5442b15cb3dSCy Schubert
5452b15cb3dSCy Schubert #endif /* EVENT__HAVE_EPOLL */
546