1 /* $NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $ */ 2 /* 3 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu> 4 * Copyright 2007-2012 Niels Provos, Nick Mathewson 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. The name of the author may not be used to endorse or promote products 15 * derived from this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 #include "event2/event-config.h" 29 #include <sys/cdefs.h> 30 __RCSID("$NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $"); 31 32 #include <stdint.h> 33 #include <sys/types.h> 34 #include <sys/resource.h> 35 #ifdef _EVENT_HAVE_SYS_TIME_H 36 #include <sys/time.h> 37 #endif 38 #include <sys/queue.h> 39 #include <sys/epoll.h> 40 #include <signal.h> 41 #include <limits.h> 42 #include <stdio.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <unistd.h> 46 #include <errno.h> 47 #ifdef _EVENT_HAVE_FCNTL_H 48 #include <fcntl.h> 49 #endif 50 51 #include "event-internal.h" 52 #include "evsignal-internal.h" 53 #include "event2/thread.h" 54 #include "evthread-internal.h" 55 #include "log-internal.h" 56 #include "evmap-internal.h" 57 #include "changelist-internal.h" 58 59 struct epollop { 60 struct epoll_event *events; 61 int nevents; 62 int epfd; 63 }; 64 65 static void *epoll_init(struct event_base *); 66 static int epoll_dispatch(struct event_base *, struct timeval *); 67 static void epoll_dealloc(struct event_base *); 68 69 static const struct eventop epollops_changelist = { 70 "epoll (with changelist)", 71 epoll_init, 72 event_changelist_add, 73 event_changelist_del, 74 epoll_dispatch, 75 epoll_dealloc, 76 1, /* need reinit */ 77 EV_FEATURE_ET|EV_FEATURE_O1, 78 EVENT_CHANGELIST_FDINFO_SIZE 79 }; 80 81 82 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 83 short old, short events, void *p); 84 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 85 short old, short events, void *p); 86 87 const struct eventop epollops = { 88 "epoll", 89 epoll_init, 90 epoll_nochangelist_add, 91 epoll_nochangelist_del, 92 epoll_dispatch, 93 epoll_dealloc, 94 1, /* need reinit */ 95 EV_FEATURE_ET|EV_FEATURE_O1, 96 0 97 }; 98 99 #define INITIAL_NEVENT 32 100 #define MAX_NEVENT 4096 101 102 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout 103 * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be 104 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the 105 * largest number of msec we can support here is 2147482. Let's 106 * round that down by 47 seconds. 107 */ 108 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000) 109 110 static void * 111 epoll_init(struct event_base *base) 112 { 113 int epfd; 114 struct epollop *epollop; 115 116 /* Initialize the kernel queue. (The size field is ignored since 117 * 2.6.8.) */ 118 if ((epfd = epoll_create(32000)) == -1) { 119 if (errno != ENOSYS) 120 event_warn("epoll_create"); 121 return (NULL); 122 } 123 124 evutil_make_socket_closeonexec(epfd); 125 126 if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) { 127 close(epfd); 128 return (NULL); 129 } 130 131 epollop->epfd = epfd; 132 133 /* Initialize fields */ 134 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event)); 135 if (epollop->events == NULL) { 136 mm_free(epollop); 137 close(epfd); 138 return (NULL); 139 } 140 epollop->nevents = INITIAL_NEVENT; 141 142 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 || 143 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 && 144 evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL)) 145 base->evsel = &epollops_changelist; 146 147 evsig_init(base); 148 149 return (epollop); 150 } 151 152 static const char * 153 change_to_string(int change) 154 { 155 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL); 156 if (change == EV_CHANGE_ADD) { 157 return "add"; 158 } else if (change == EV_CHANGE_DEL) { 159 return "del"; 160 } else if (change == 0) { 161 return "none"; 162 } else { 163 return "???"; 164 } 165 } 166 167 static const char * 168 epoll_op_to_string(int op) 169 { 170 return op == EPOLL_CTL_ADD?"ADD": 171 op == EPOLL_CTL_DEL?"DEL": 172 op == EPOLL_CTL_MOD?"MOD": 173 "???"; 174 } 175 176 static int 177 epoll_apply_one_change(struct event_base *base, 178 struct epollop *epollop, 179 const struct event_change *ch) 180 { 181 struct epoll_event epev; 182 int op, events = 0; 183 184 if (1) { 185 /* The logic here is a little tricky. If we had no events set 186 on the fd before, we need to set op="ADD" and set 187 events=the events we want to add. If we had any events set 188 on the fd before, and we want any events to remain on the 189 fd, we need to say op="MOD" and set events=the events we 190 want to remain. But if we want to delete the last event, 191 we say op="DEL" and set events=the remaining events. What 192 fun! 193 */ 194 195 /* TODO: Turn this into a switch or a table lookup. */ 196 197 if ((ch->read_change & EV_CHANGE_ADD) || 198 (ch->write_change & EV_CHANGE_ADD)) { 199 /* If we are adding anything at all, we'll want to do 200 * either an ADD or a MOD. */ 201 events = 0; 202 op = EPOLL_CTL_ADD; 203 if (ch->read_change & EV_CHANGE_ADD) { 204 events |= EPOLLIN; 205 } else if (ch->read_change & EV_CHANGE_DEL) { 206 ; 207 } else if (ch->old_events & EV_READ) { 208 events |= EPOLLIN; 209 } 210 if (ch->write_change & EV_CHANGE_ADD) { 211 events |= EPOLLOUT; 212 } else if (ch->write_change & EV_CHANGE_DEL) { 213 ; 214 } else if (ch->old_events & EV_WRITE) { 215 events |= EPOLLOUT; 216 } 217 if ((ch->read_change|ch->write_change) & EV_ET) 218 events |= EPOLLET; 219 220 if (ch->old_events) { 221 /* If MOD fails, we retry as an ADD, and if 222 * ADD fails we will retry as a MOD. So the 223 * only hard part here is to guess which one 224 * will work. As a heuristic, we'll try 225 * MOD first if we think there were old 226 * events and ADD if we think there were none. 227 * 228 * We can be wrong about the MOD if the file 229 * has in fact been closed and re-opened. 230 * 231 * We can be wrong about the ADD if the 232 * the fd has been re-created with a dup() 233 * of the same file that it was before. 234 */ 235 op = EPOLL_CTL_MOD; 236 } 237 } else if ((ch->read_change & EV_CHANGE_DEL) || 238 (ch->write_change & EV_CHANGE_DEL)) { 239 /* If we're deleting anything, we'll want to do a MOD 240 * or a DEL. */ 241 op = EPOLL_CTL_DEL; 242 243 if (ch->read_change & EV_CHANGE_DEL) { 244 if (ch->write_change & EV_CHANGE_DEL) { 245 events = EPOLLIN|EPOLLOUT; 246 } else if (ch->old_events & EV_WRITE) { 247 events = EPOLLOUT; 248 op = EPOLL_CTL_MOD; 249 } else { 250 events = EPOLLIN; 251 } 252 } else if (ch->write_change & EV_CHANGE_DEL) { 253 if (ch->old_events & EV_READ) { 254 events = EPOLLIN; 255 op = EPOLL_CTL_MOD; 256 } else { 257 events = EPOLLOUT; 258 } 259 } 260 } 261 262 if (!events) 263 return 0; 264 265 memset(&epev, 0, sizeof(epev)); 266 epev.data.fd = ch->fd; 267 epev.events = events; 268 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) { 269 if (op == EPOLL_CTL_MOD && errno == ENOENT) { 270 /* If a MOD operation fails with ENOENT, the 271 * fd was probably closed and re-opened. We 272 * should retry the operation as an ADD. 273 */ 274 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) { 275 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too", 276 (int)epev.events, ch->fd); 277 return -1; 278 } else { 279 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.", 280 (int)epev.events, 281 ch->fd)); 282 } 283 } else if (op == EPOLL_CTL_ADD && errno == EEXIST) { 284 /* If an ADD operation fails with EEXIST, 285 * either the operation was redundant (as with a 286 * precautionary add), or we ran into a fun 287 * kernel bug where using dup*() to duplicate the 288 * same file into the same fd gives you the same epitem 289 * rather than a fresh one. For the second case, 290 * we must retry with MOD. */ 291 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) { 292 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too", 293 (int)epev.events, ch->fd); 294 return -1; 295 } else { 296 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.", 297 (int)epev.events, 298 ch->fd)); 299 } 300 } else if (op == EPOLL_CTL_DEL && 301 (errno == ENOENT || errno == EBADF || 302 errno == EPERM)) { 303 /* If a delete fails with one of these errors, 304 * that's fine too: we closed the fd before we 305 * got around to calling epoll_dispatch. */ 306 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.", 307 (int)epev.events, 308 ch->fd, 309 strerror(errno))); 310 } else { 311 event_warn("Epoll %s(%d) on fd %d failed. Old events were %d; read change was %d (%s); write change was %d (%s)", 312 epoll_op_to_string(op), 313 (int)epev.events, 314 ch->fd, 315 ch->old_events, 316 ch->read_change, 317 change_to_string(ch->read_change), 318 ch->write_change, 319 change_to_string(ch->write_change)); 320 return -1; 321 } 322 } else { 323 event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]", 324 epoll_op_to_string(op), 325 (int)epev.events, 326 (int)ch->fd, 327 ch->old_events, 328 ch->read_change, 329 ch->write_change)); 330 } 331 } 332 return 0; 333 } 334 335 static int 336 epoll_apply_changes(struct event_base *base) 337 { 338 struct event_changelist *changelist = &base->changelist; 339 struct epollop *epollop = base->evbase; 340 struct event_change *ch; 341 342 int r = 0; 343 int i; 344 345 for (i = 0; i < changelist->n_changes; ++i) { 346 ch = &changelist->changes[i]; 347 if (epoll_apply_one_change(base, epollop, ch) < 0) 348 r = -1; 349 } 350 351 return (r); 352 } 353 354 static int 355 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 356 short old, short events, void *p) 357 { 358 struct event_change ch; 359 ch.fd = fd; 360 ch.old_events = old; 361 ch.read_change = ch.write_change = 0; 362 if (events & EV_WRITE) 363 ch.write_change = EV_CHANGE_ADD | 364 (events & EV_ET); 365 if (events & EV_READ) 366 ch.read_change = EV_CHANGE_ADD | 367 (events & EV_ET); 368 369 return epoll_apply_one_change(base, base->evbase, &ch); 370 } 371 372 static int 373 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 374 short old, short events, void *p) 375 { 376 struct event_change ch; 377 ch.fd = fd; 378 ch.old_events = old; 379 ch.read_change = ch.write_change = 0; 380 if (events & EV_WRITE) 381 ch.write_change = EV_CHANGE_DEL; 382 if (events & EV_READ) 383 ch.read_change = EV_CHANGE_DEL; 384 385 return epoll_apply_one_change(base, base->evbase, &ch); 386 } 387 388 static int 389 epoll_dispatch(struct event_base *base, struct timeval *tv) 390 { 391 struct epollop *epollop = base->evbase; 392 struct epoll_event *events = epollop->events; 393 int i, res; 394 long timeout = -1; 395 396 if (tv != NULL) { 397 timeout = evutil_tv_to_msec(tv); 398 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) { 399 /* Linux kernels can wait forever if the timeout is 400 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */ 401 timeout = MAX_EPOLL_TIMEOUT_MSEC; 402 } 403 } 404 405 epoll_apply_changes(base); 406 event_changelist_remove_all(&base->changelist, base); 407 408 EVBASE_RELEASE_LOCK(base, th_base_lock); 409 410 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout); 411 412 EVBASE_ACQUIRE_LOCK(base, th_base_lock); 413 414 if (res == -1) { 415 if (errno != EINTR) { 416 event_warn("epoll_wait"); 417 return (-1); 418 } 419 420 return (0); 421 } 422 423 event_debug(("%s: epoll_wait reports %d", __func__, res)); 424 EVUTIL_ASSERT(res <= epollop->nevents); 425 426 for (i = 0; i < res; i++) { 427 int what = events[i].events; 428 short ev = 0; 429 430 if (what & (EPOLLHUP|EPOLLERR)) { 431 ev = EV_READ | EV_WRITE; 432 } else { 433 if (what & EPOLLIN) 434 ev |= EV_READ; 435 if (what & EPOLLOUT) 436 ev |= EV_WRITE; 437 } 438 439 if (!ev) 440 continue; 441 442 evmap_io_active(base, events[i].data.fd, ev | EV_ET); 443 } 444 445 if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) { 446 /* We used all of the event space this time. We should 447 be ready for more events next time. */ 448 int new_nevents = epollop->nevents * 2; 449 struct epoll_event *new_events; 450 451 new_events = mm_realloc(epollop->events, 452 new_nevents * sizeof(struct epoll_event)); 453 if (new_events) { 454 epollop->events = new_events; 455 epollop->nevents = new_nevents; 456 } 457 } 458 459 return (0); 460 } 461 462 463 static void 464 epoll_dealloc(struct event_base *base) 465 { 466 struct epollop *epollop = base->evbase; 467 468 evsig_dealloc(base); 469 if (epollop->events) 470 mm_free(epollop->events); 471 if (epollop->epfd >= 0) 472 close(epollop->epfd); 473 474 memset(epollop, 0, sizeof(struct epollop)); 475 mm_free(epollop); 476 } 477