xref: /minix/external/bsd/libevent/dist/epoll.c (revision e3b78ef1)
1 /*	$NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $	*/
2 /*
3  * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
4  * Copyright 2007-2012 Niels Provos, Nick Mathewson
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 #include "event2/event-config.h"
29 #include <sys/cdefs.h>
30 __RCSID("$NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $");
31 
32 #include <stdint.h>
33 #include <sys/types.h>
34 #include <sys/resource.h>
35 #ifdef _EVENT_HAVE_SYS_TIME_H
36 #include <sys/time.h>
37 #endif
38 #include <sys/queue.h>
39 #include <sys/epoll.h>
40 #include <signal.h>
41 #include <limits.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include <errno.h>
47 #ifdef _EVENT_HAVE_FCNTL_H
48 #include <fcntl.h>
49 #endif
50 
51 #include "event-internal.h"
52 #include "evsignal-internal.h"
53 #include "event2/thread.h"
54 #include "evthread-internal.h"
55 #include "log-internal.h"
56 #include "evmap-internal.h"
57 #include "changelist-internal.h"
58 
/* Private state kept by the epoll backend; epoll_init() allocates one and
 * returns it, and the other backend functions read it from base->evbase. */
struct epollop {
	/* Result buffer handed to epoll_wait(). */
	struct epoll_event *events;
	/* Number of entries that the events buffer can hold. */
	int nevents;
	/* Descriptor obtained from epoll_create(). */
	int epfd;
};
64 
65 static void *epoll_init(struct event_base *);
66 static int epoll_dispatch(struct event_base *, struct timeval *);
67 static void epoll_dealloc(struct event_base *);
68 
69 static const struct eventop epollops_changelist = {
70 	"epoll (with changelist)",
71 	epoll_init,
72 	event_changelist_add,
73 	event_changelist_del,
74 	epoll_dispatch,
75 	epoll_dealloc,
76 	1, /* need reinit */
77 	EV_FEATURE_ET|EV_FEATURE_O1,
78 	EVENT_CHANGELIST_FDINFO_SIZE
79 };
80 
81 
82 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
83     short old, short events, void *p);
84 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
85     short old, short events, void *p);
86 
87 const struct eventop epollops = {
88 	"epoll",
89 	epoll_init,
90 	epoll_nochangelist_add,
91 	epoll_nochangelist_del,
92 	epoll_dispatch,
93 	epoll_dealloc,
94 	1, /* need reinit */
95 	EV_FEATURE_ET|EV_FEATURE_O1,
96 	0
97 };
98 
/* Number of epoll_event slots that epoll_init() allocates up front. */
#define INITIAL_NEVENT 32
/* epoll_dispatch() stops doubling the event buffer once it holds this many
 * slots. */
#define MAX_NEVENT 4096

/* Longest timeout, in milliseconds, that epoll_dispatch() will pass to
 * epoll_wait().
 *
 * On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be as big
 * as 1000, and LONG_MAX can be as small as (1<<31)-1, so the largest number
 * of msec we can support is 2147482.  That is rounded down by roughly 47
 * seconds to an even 35 minutes.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
109 
110 static void *
111 epoll_init(struct event_base *base)
112 {
113 	int epfd;
114 	struct epollop *epollop;
115 
116 	/* Initialize the kernel queue.  (The size field is ignored since
117 	 * 2.6.8.) */
118 	if ((epfd = epoll_create(32000)) == -1) {
119 		if (errno != ENOSYS)
120 			event_warn("epoll_create");
121 		return (NULL);
122 	}
123 
124 	evutil_make_socket_closeonexec(epfd);
125 
126 	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
127 		close(epfd);
128 		return (NULL);
129 	}
130 
131 	epollop->epfd = epfd;
132 
133 	/* Initialize fields */
134 	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
135 	if (epollop->events == NULL) {
136 		mm_free(epollop);
137 		close(epfd);
138 		return (NULL);
139 	}
140 	epollop->nevents = INITIAL_NEVENT;
141 
142 	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
143 	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
144 		evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
145 		base->evsel = &epollops_changelist;
146 
147 	evsig_init(base);
148 
149 	return (epollop);
150 }
151 
152 static const char *
153 change_to_string(int change)
154 {
155 	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
156 	if (change == EV_CHANGE_ADD) {
157 		return "add";
158 	} else if (change == EV_CHANGE_DEL) {
159 		return "del";
160 	} else if (change == 0) {
161 		return "none";
162 	} else {
163 		return "???";
164 	}
165 }
166 
/*
 * Name an epoll_ctl() operation code for log messages.
 *
 * Returns "ADD", "DEL" or "MOD" for the matching EPOLL_CTL_* value and
 * "???" for anything else.
 */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}
175 
176 static int
177 epoll_apply_one_change(struct event_base *base,
178     struct epollop *epollop,
179     const struct event_change *ch)
180 {
181 	struct epoll_event epev;
182 	int op, events = 0;
183 
184 	if (1) {
185 		/* The logic here is a little tricky.  If we had no events set
186 		   on the fd before, we need to set op="ADD" and set
187 		   events=the events we want to add.  If we had any events set
188 		   on the fd before, and we want any events to remain on the
189 		   fd, we need to say op="MOD" and set events=the events we
190 		   want to remain.  But if we want to delete the last event,
191 		   we say op="DEL" and set events=the remaining events.  What
192 		   fun!
193 		*/
194 
195 		/* TODO: Turn this into a switch or a table lookup. */
196 
197 		if ((ch->read_change & EV_CHANGE_ADD) ||
198 		    (ch->write_change & EV_CHANGE_ADD)) {
199 			/* If we are adding anything at all, we'll want to do
200 			 * either an ADD or a MOD. */
201 			events = 0;
202 			op = EPOLL_CTL_ADD;
203 			if (ch->read_change & EV_CHANGE_ADD) {
204 				events |= EPOLLIN;
205 			} else if (ch->read_change & EV_CHANGE_DEL) {
206 				;
207 			} else if (ch->old_events & EV_READ) {
208 				events |= EPOLLIN;
209 			}
210 			if (ch->write_change & EV_CHANGE_ADD) {
211 				events |= EPOLLOUT;
212 			} else if (ch->write_change & EV_CHANGE_DEL) {
213 				;
214 			} else if (ch->old_events & EV_WRITE) {
215 				events |= EPOLLOUT;
216 			}
217 			if ((ch->read_change|ch->write_change) & EV_ET)
218 				events |= EPOLLET;
219 
220 			if (ch->old_events) {
221 				/* If MOD fails, we retry as an ADD, and if
222 				 * ADD fails we will retry as a MOD.  So the
223 				 * only hard part here is to guess which one
224 				 * will work.  As a heuristic, we'll try
225 				 * MOD first if we think there were old
226 				 * events and ADD if we think there were none.
227 				 *
228 				 * We can be wrong about the MOD if the file
229 				 * has in fact been closed and re-opened.
230 				 *
231 				 * We can be wrong about the ADD if the
232 				 * the fd has been re-created with a dup()
233 				 * of the same file that it was before.
234 				 */
235 				op = EPOLL_CTL_MOD;
236 			}
237 		} else if ((ch->read_change & EV_CHANGE_DEL) ||
238 		    (ch->write_change & EV_CHANGE_DEL)) {
239 			/* If we're deleting anything, we'll want to do a MOD
240 			 * or a DEL. */
241 			op = EPOLL_CTL_DEL;
242 
243 			if (ch->read_change & EV_CHANGE_DEL) {
244 				if (ch->write_change & EV_CHANGE_DEL) {
245 					events = EPOLLIN|EPOLLOUT;
246 				} else if (ch->old_events & EV_WRITE) {
247 					events = EPOLLOUT;
248 					op = EPOLL_CTL_MOD;
249 				} else {
250 					events = EPOLLIN;
251 				}
252 			} else if (ch->write_change & EV_CHANGE_DEL) {
253 				if (ch->old_events & EV_READ) {
254 					events = EPOLLIN;
255 					op = EPOLL_CTL_MOD;
256 				} else {
257 					events = EPOLLOUT;
258 				}
259 			}
260 		}
261 
262 		if (!events)
263 			return 0;
264 
265 		memset(&epev, 0, sizeof(epev));
266 		epev.data.fd = ch->fd;
267 		epev.events = events;
268 		if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
269 			if (op == EPOLL_CTL_MOD && errno == ENOENT) {
270 				/* If a MOD operation fails with ENOENT, the
271 				 * fd was probably closed and re-opened.  We
272 				 * should retry the operation as an ADD.
273 				 */
274 				if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
275 					event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
276 					    (int)epev.events, ch->fd);
277 					return -1;
278 				} else {
279 					event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
280 						(int)epev.events,
281 						ch->fd));
282 				}
283 			} else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
284 				/* If an ADD operation fails with EEXIST,
285 				 * either the operation was redundant (as with a
286 				 * precautionary add), or we ran into a fun
287 				 * kernel bug where using dup*() to duplicate the
288 				 * same file into the same fd gives you the same epitem
289 				 * rather than a fresh one.  For the second case,
290 				 * we must retry with MOD. */
291 				if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
292 					event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
293 					    (int)epev.events, ch->fd);
294 					return -1;
295 				} else {
296 					event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
297 						(int)epev.events,
298 						ch->fd));
299 				}
300 			} else if (op == EPOLL_CTL_DEL &&
301 			    (errno == ENOENT || errno == EBADF ||
302 				errno == EPERM)) {
303 				/* If a delete fails with one of these errors,
304 				 * that's fine too: we closed the fd before we
305 				 * got around to calling epoll_dispatch. */
306 				event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
307 					(int)epev.events,
308 					ch->fd,
309 					strerror(errno)));
310 			} else {
311 				event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
312 				    epoll_op_to_string(op),
313 				    (int)epev.events,
314 				    ch->fd,
315 				    ch->old_events,
316 				    ch->read_change,
317 				    change_to_string(ch->read_change),
318 				    ch->write_change,
319 				    change_to_string(ch->write_change));
320 				return -1;
321 			}
322 		} else {
323 			event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
324 				epoll_op_to_string(op),
325 				(int)epev.events,
326 				(int)ch->fd,
327 				ch->old_events,
328 				ch->read_change,
329 				ch->write_change));
330 		}
331 	}
332 	return 0;
333 }
334 
335 static int
336 epoll_apply_changes(struct event_base *base)
337 {
338 	struct event_changelist *changelist = &base->changelist;
339 	struct epollop *epollop = base->evbase;
340 	struct event_change *ch;
341 
342 	int r = 0;
343 	int i;
344 
345 	for (i = 0; i < changelist->n_changes; ++i) {
346 		ch = &changelist->changes[i];
347 		if (epoll_apply_one_change(base, epollop, ch) < 0)
348 			r = -1;
349 	}
350 
351 	return (r);
352 }
353 
354 static int
355 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
356     short old, short events, void *p)
357 {
358 	struct event_change ch;
359 	ch.fd = fd;
360 	ch.old_events = old;
361 	ch.read_change = ch.write_change = 0;
362 	if (events & EV_WRITE)
363 		ch.write_change = EV_CHANGE_ADD |
364 		    (events & EV_ET);
365 	if (events & EV_READ)
366 		ch.read_change = EV_CHANGE_ADD |
367 		    (events & EV_ET);
368 
369 	return epoll_apply_one_change(base, base->evbase, &ch);
370 }
371 
372 static int
373 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
374     short old, short events, void *p)
375 {
376 	struct event_change ch;
377 	ch.fd = fd;
378 	ch.old_events = old;
379 	ch.read_change = ch.write_change = 0;
380 	if (events & EV_WRITE)
381 		ch.write_change = EV_CHANGE_DEL;
382 	if (events & EV_READ)
383 		ch.read_change = EV_CHANGE_DEL;
384 
385 	return epoll_apply_one_change(base, base->evbase, &ch);
386 }
387 
388 static int
389 epoll_dispatch(struct event_base *base, struct timeval *tv)
390 {
391 	struct epollop *epollop = base->evbase;
392 	struct epoll_event *events = epollop->events;
393 	int i, res;
394 	long timeout = -1;
395 
396 	if (tv != NULL) {
397 		timeout = evutil_tv_to_msec(tv);
398 		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
399 			/* Linux kernels can wait forever if the timeout is
400 			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
401 			timeout = MAX_EPOLL_TIMEOUT_MSEC;
402 		}
403 	}
404 
405 	epoll_apply_changes(base);
406 	event_changelist_remove_all(&base->changelist, base);
407 
408 	EVBASE_RELEASE_LOCK(base, th_base_lock);
409 
410 	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
411 
412 	EVBASE_ACQUIRE_LOCK(base, th_base_lock);
413 
414 	if (res == -1) {
415 		if (errno != EINTR) {
416 			event_warn("epoll_wait");
417 			return (-1);
418 		}
419 
420 		return (0);
421 	}
422 
423 	event_debug(("%s: epoll_wait reports %d", __func__, res));
424 	EVUTIL_ASSERT(res <= epollop->nevents);
425 
426 	for (i = 0; i < res; i++) {
427 		int what = events[i].events;
428 		short ev = 0;
429 
430 		if (what & (EPOLLHUP|EPOLLERR)) {
431 			ev = EV_READ | EV_WRITE;
432 		} else {
433 			if (what & EPOLLIN)
434 				ev |= EV_READ;
435 			if (what & EPOLLOUT)
436 				ev |= EV_WRITE;
437 		}
438 
439 		if (!ev)
440 			continue;
441 
442 		evmap_io_active(base, events[i].data.fd, ev | EV_ET);
443 	}
444 
445 	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
446 		/* We used all of the event space this time.  We should
447 		   be ready for more events next time. */
448 		int new_nevents = epollop->nevents * 2;
449 		struct epoll_event *new_events;
450 
451 		new_events = mm_realloc(epollop->events,
452 		    new_nevents * sizeof(struct epoll_event));
453 		if (new_events) {
454 			epollop->events = new_events;
455 			epollop->nevents = new_nevents;
456 		}
457 	}
458 
459 	return (0);
460 }
461 
462 
463 static void
464 epoll_dealloc(struct event_base *base)
465 {
466 	struct epollop *epollop = base->evbase;
467 
468 	evsig_dealloc(base);
469 	if (epollop->events)
470 		mm_free(epollop->events);
471 	if (epollop->epfd >= 0)
472 		close(epollop->epfd);
473 
474 	memset(epollop, 0, sizeof(struct epollop));
475 	mm_free(epollop);
476 }
477