/*  Copyright (C) 2021 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "knot/common/fdset.h"
#include "contrib/time.h"
#include "contrib/macros.h"

/* Grow the array 'p' to 'n' elements. Note that on allocation failure the
 * macro returns KNOT_ENOMEM from the function it is expanded in. */
#define MEM_RESIZE(p, n) { \
	void *tmp = NULL; \
	if ((tmp = realloc((p), (n) * sizeof(*p))) == NULL) { \
		return KNOT_ENOMEM; \
	} \
	(p) = tmp; \
}

static int fdset_resize(fdset_t *set, const unsigned size)
{
	assert(set);

	MEM_RESIZE(set->ctx, size);
	MEM_RESIZE(set->timeout, size);
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
	MEM_RESIZE(set->ev, size);
#else
	MEM_RESIZE(set->pfd, size);
#endif
	set->size = size;
	return KNOT_EOK;
}

int fdset_init(fdset_t *set, const unsigned size)
{
	if (set == NULL) {
		return KNOT_EINVAL;
	}

	memset(set, 0, sizeof(*set));

#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
#ifdef HAVE_EPOLL
	set->pfd = epoll_create1(0);
#elif HAVE_KQUEUE
	set->pfd = kqueue();
#endif
	if (set->pfd < 0) {
		return knot_map_errno();
	}
#endif
	int ret = fdset_resize(set, size);
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
	if (ret != KNOT_EOK) {
		close(set->pfd);
	}
#endif
	return ret;
}
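
/*
 * Illustrative lifecycle sketch, not part of this file's API: a caller
 * typically initializes the set once, registers sockets, and tears everything
 * down with fdset_clear(). FDSET_POLLIN is assumed to be the read-event flag
 * declared in knot/common/fdset.h; listen_sock and conn_ctx are hypothetical
 * caller variables.
 *
 *	fdset_t set;
 *	if (fdset_init(&set, FDSET_RESIZE_STEP) != KNOT_EOK) {
 *		return;  // out of memory or polling backend setup failed
 *	}
 *	int idx = fdset_add(&set, listen_sock, FDSET_POLLIN, conn_ctx);
 *	if (idx < 0) {
 *		// idx is a knot error code here (e.g. KNOT_ENOMEM)
 *	}
 *	...
 *	fdset_clear(&set);
 */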

void fdset_clear(fdset_t *set)
{
	if (set == NULL) {
		return;
	}

	free(set->ctx);
	free(set->timeout);
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
	free(set->ev);
	free(set->recv_ev);
	close(set->pfd);
#else
	free(set->pfd);
#endif
	memset(set, 0, sizeof(*set));
}

int fdset_add(fdset_t *set, const int fd, const fdset_event_t events, void *ctx)
{
	if (set == NULL || fd < 0) {
		return KNOT_EINVAL;
	}

	if (set->n == set->size &&
	    fdset_resize(set, set->size + FDSET_RESIZE_STEP) != KNOT_EOK) {
		return KNOT_ENOMEM;
	}

	const int idx = set->n++;
	set->ctx[idx] = ctx;
	set->timeout[idx] = 0;
#ifdef HAVE_EPOLL
	set->ev[idx].data.fd = fd;
	set->ev[idx].events = events;
	struct epoll_event ev = {
		.data.u64 = idx,
		.events = events
	};
	if (epoll_ctl(set->pfd, EPOLL_CTL_ADD, fd, &ev) != 0) {
		return knot_map_errno();
	}
#elif HAVE_KQUEUE
	EV_SET(&set->ev[idx], fd, events, EV_ADD, 0, 0, (void *)(intptr_t)idx);
	if (kevent(set->pfd, &set->ev[idx], 1, NULL, 0, NULL) < 0) {
		return knot_map_errno();
	}
#else
	set->pfd[idx].fd = fd;
	set->pfd[idx].events = events;
	set->pfd[idx].revents = 0;
#endif

	return idx;
}

int fdset_remove(fdset_t *set, const unsigned idx)
{
	if (set == NULL || idx >= set->n) {
		return KNOT_EINVAL;
	}

	const int fd = fdset_get_fd(set, idx);
#ifdef HAVE_EPOLL
	/* Explicit deregistration is necessary as DDNS duplicates file
	 * descriptors; the kernel drops an fd from the epoll set on its own
	 * only after all duplicates of the open file description are closed. */
	if (epoll_ctl(set->pfd, EPOLL_CTL_DEL, fd, NULL) != 0) {
		close(fd);
		return knot_map_errno();
	}
#elif HAVE_KQUEUE
	/* If the entry was marked for deferred removal (its filter was
	 * bitwise-inverted), restore the original filter number before issuing
	 * EV_DELETE. NetBSD defines kqueue filters as non-negative values,
	 * other systems as negative ones, hence the differing test. */
#if defined(__NetBSD__)
	if ((signed short)set->ev[idx].filter < 0)
#else
	if (set->ev[idx].filter >= 0)
#endif
	{
		set->ev[idx].filter = ~set->ev[idx].filter;
	}
	set->ev[idx].flags = EV_DELETE;
	if (kevent(set->pfd, &set->ev[idx], 1, NULL, 0, NULL) < 0) {
		close(fd);
		return knot_map_errno();
	}
#endif
	close(fd);

	const unsigned last = --set->n;
	/* If the removed entry was not the last one, move the last entry into
	 * the freed slot and re-register it under its new index. */
	if (idx < last) {
		set->ctx[idx] = set->ctx[last];
		set->timeout[idx] = set->timeout[last];
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
		set->ev[idx] = set->ev[last];
#ifdef HAVE_EPOLL
		struct epoll_event ev = {
			.data.u64 = idx,
			.events = set->ev[idx].events
		};
		if (epoll_ctl(set->pfd, EPOLL_CTL_MOD, set->ev[last].data.fd, &ev) != 0) {
			return knot_map_errno();
		}
#elif HAVE_KQUEUE
		EV_SET(&set->ev[idx], set->ev[last].ident, set->ev[last].filter,
		       EV_ADD, 0, 0, (void *)(intptr_t)idx);
		if (kevent(set->pfd, &set->ev[idx], 1, NULL, 0, NULL) < 0) {
			return knot_map_errno();
		}
#endif
#else
		set->pfd[idx] = set->pfd[last];
#endif
	}

	return KNOT_EOK;
}
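
/*
 * Note on fdset_remove(): it swap-removes, so the entry that used to sit at
 * the last position is moved into the freed slot and any index a caller
 * cached for that last entry becomes stale. A hypothetical sketch
 * (FDSET_POLLIN as assumed above):
 *
 *	int a = fdset_add(&set, fd_a, FDSET_POLLIN, NULL);  // e.g. index 0
 *	int b = fdset_add(&set, fd_b, FDSET_POLLIN, NULL);  // e.g. index 1
 *	fdset_remove(&set, a);  // fd_b now lives at index 0, 'b' is stale
 */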

int fdset_poll(fdset_t *set, fdset_it_t *it, const unsigned offset, const int timeout_ms)
{
	if (it == NULL) {
		return KNOT_EINVAL;
	}
	it->unprocessed = 0;

	if (set == NULL) {
		return KNOT_EINVAL;
	}

	it->set = set;
	it->idx = offset;
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
	if (set->recv_size != set->size) {
		MEM_RESIZE(set->recv_ev, set->size);
		set->recv_size = set->size;
	}
	it->ptr = set->recv_ev;
	it->dirty = 0;
#ifdef HAVE_EPOLL
	if (set->n == 0) {
		return 0;
	}
	if ((it->unprocessed = epoll_wait(set->pfd, set->recv_ev, set->recv_size,
	                                  timeout_ms)) == -1) {
		return knot_map_errno();
	}
#ifndef NDEBUG
	/* In specific circumstances with valgrind, it sometimes happens that
	 * `set->n < it->unprocessed`. */
	if (it->unprocessed > 0 && unlikely(it->unprocessed > set->n)) {
		assert(it->unprocessed == 232);
		it->unprocessed = 0;
	}
#endif
#elif HAVE_KQUEUE
	struct timespec timeout = {
		.tv_sec = timeout_ms / 1000,
		.tv_nsec = (timeout_ms % 1000) * 1000000
	};
	if ((it->unprocessed = kevent(set->pfd, NULL, 0, set->recv_ev, set->recv_size,
	                              (timeout_ms >= 0) ? &timeout : NULL)) == -1) {
		return knot_map_errno();
	}
#endif
	/*
	 * NOTE: The offset can't be skipped without a bunch of extra syscalls,
	 * so the wait covers all `set->n` sockets. The offset is non-zero when
	 * TCP throttling is on. The call can occasionally return only sockets
	 * below the offset (none of them a connected socket), but that should
	 * not be common.
	 */
	while (it->unprocessed > 0 && fdset_it_get_idx(it) < it->idx) {
		it->ptr++;
		it->unprocessed--;
	}
	return it->unprocessed;
#else
	it->unprocessed = poll(&set->pfd[offset], set->n - offset, timeout_ms);
#ifndef NDEBUG
	/* In specific circumstances with valgrind, it sometimes happens that
	 * `set->n < it->unprocessed`. */
	if (it->unprocessed > 0 && unlikely(it->unprocessed > set->n - offset)) {
		assert(it->unprocessed == 7);
		it->unprocessed = 0;
	}
#endif
	while (it->unprocessed > 0 && set->pfd[it->idx].revents == 0) {
		it->idx++;
	}
	return it->unprocessed;
#endif
}
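
/*
 * Illustrative event-loop sketch. Assumptions: the fdset_it_* helpers named
 * below (fdset_it_is_done(), fdset_it_next(), fdset_it_get_fd() and
 * fdset_it_remove()) are the iterator API declared in knot/common/fdset.h,
 * and handle_io() is a hypothetical callback.
 *
 *	fdset_it_t it;
 *	int n = fdset_poll(&set, &it, 0, 1000);
 *	for (; n > 0 && !fdset_it_is_done(&it); fdset_it_next(&it)) {
 *		int fd = fdset_it_get_fd(&it);
 *		if (handle_io(fd) < 0) {
 *			fdset_it_remove(&it);  // deferred, see fdset_it_commit()
 *		}
 *	}
 *	fdset_it_commit(&it);  // apply the deferred removals
 */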

void fdset_it_commit(fdset_it_t *it)
{
	if (it == NULL) {
		return;
	}
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
	/* NOTE: Iterate in reverse so that removals tend to hit the cheap
	 *       "remove the last entry" path in fdset_remove() and avoid its
	 *       swap-and-reregister syscall. The actual performance gain has
	 *       not been measured; it mainly skips some syscalls at the
	 *       beginning of the iteration.
	 */
	fdset_t *set = it->set;
	for (int i = set->n - 1; it->dirty > 0 && i >= 0; --i) {
#ifdef HAVE_EPOLL
		if (set->ev[i].events == FDSET_REMOVE_FLAG)
#else
#if defined(__NetBSD__)
		if ((signed short)set->ev[i].filter < 0)
#else
		if (set->ev[i].filter >= 0)
#endif
#endif
		{
			(void)fdset_remove(set, i);
			it->dirty--;
		}
	}
	assert(it->dirty == 0);
#endif
}

int fdset_set_watchdog(fdset_t *set, const unsigned idx, const int interval)
{
	if (set == NULL || idx >= set->n) {
		return KNOT_EINVAL;
	}

	/* Lift watchdog if interval is negative. */
	if (interval < 0) {
		set->timeout[idx] = 0;
		return KNOT_EOK;
	}

	/* Update clock. */
	const struct timespec now = time_now();
	set->timeout[idx] = now.tv_sec + interval; /* Only seconds precision. */

	return KNOT_EOK;
}

void fdset_sweep(fdset_t *set, const fdset_sweep_cb_t cb, void *data)
{
	if (set == NULL || cb == NULL) {
		return;
	}

	/* Get time threshold. */
	const struct timespec now = time_now();
	unsigned idx = 0;
	while (idx < set->n) {
		/* Check sweep state, remove if requested. */
		if (set->timeout[idx] > 0 && set->timeout[idx] <= now.tv_sec) {
			const int fd = fdset_get_fd(set, idx);
			if (cb(set, fd, data) == FDSET_SWEEP) {
				(void)fdset_remove(set, idx);
				continue;
			}
		}
		++idx;
	}
}
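
/*
 * Illustrative watchdog sketch: fdset_set_watchdog() arms a per-entry
 * deadline and fdset_sweep() invokes the callback for every entry whose
 * deadline has passed, removing those for which the callback answers
 * FDSET_SWEEP. The sweep_cb() below is hypothetical, and the
 * fdset_sweep_state_t return type name is assumed from fdset.h
 * (fdset_sweep_cb_t and FDSET_SWEEP are used in this file).
 *
 *	static fdset_sweep_state_t sweep_cb(fdset_t *set, int fd, void *data)
 *	{
 *		return FDSET_SWEEP;  // e.g. close idle TCP clients
 *	}
 *
 *	fdset_set_watchdog(&set, idx, 10);  // expire after ~10 seconds
 *	...
 *	fdset_sweep(&set, sweep_cb, NULL);  // called periodically
 */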