/*  Copyright (C) 2021 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "knot/common/fdset.h"
#include "contrib/time.h"
#include "contrib/macros.h"

#define MEM_RESIZE(p, n) { \
        void *tmp = NULL; \
        if ((tmp = realloc((p), (n) * sizeof(*p))) == NULL) { \
                return KNOT_ENOMEM; \
        } \
        (p) = tmp; \
}

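/* Resize the set's backing arrays to 'size' slots; returns KNOT_EOK or KNOT_ENOMEM. */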
static int fdset_resize(fdset_t *set, const unsigned size)
{
        assert(set);

        MEM_RESIZE(set->ctx, size);
        MEM_RESIZE(set->timeout, size);
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
        MEM_RESIZE(set->ev, size);
#else
        MEM_RESIZE(set->pfd, size);
#endif
        set->size = size;
        return KNOT_EOK;
}

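/* Initialize the set: create the epoll/kqueue instance where available and
 * pre-allocate 'size' slots. */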
int fdset_init(fdset_t *set, const unsigned size)
{
        if (set == NULL) {
                return KNOT_EINVAL;
        }

        memset(set, 0, sizeof(*set));

#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
#ifdef HAVE_EPOLL
        set->pfd = epoll_create1(0);
#elif HAVE_KQUEUE
        set->pfd = kqueue();
#endif
        if (set->pfd < 0) {
                return knot_map_errno();
        }
#endif
        int ret = fdset_resize(set, size);
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
        if (ret != KNOT_EOK) {
                close(set->pfd);
        }
#endif
        return ret;
}

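/* Free the set's arrays and, with epoll/kqueue, close the polling descriptor,
 * then zero the structure. Registered file descriptors are not closed here. */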
void fdset_clear(fdset_t *set)
{
        if (set == NULL) {
                return;
        }

        free(set->ctx);
        free(set->timeout);
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
        free(set->ev);
        free(set->recv_ev);
        close(set->pfd);
#else
        free(set->pfd);
#endif
        memset(set, 0, sizeof(*set));
}

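/* Register a file descriptor with the requested events and user context.
 * Returns the index of the new slot (>= 0) or a negative knot error code. */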
int fdset_add(fdset_t *set, const int fd, const fdset_event_t events, void *ctx)
{
        if (set == NULL || fd < 0) {
                return KNOT_EINVAL;
        }

        if (set->n == set->size &&
            fdset_resize(set, set->size + FDSET_RESIZE_STEP) != KNOT_EOK) {
                return KNOT_ENOMEM;
        }

        const int idx = set->n++;
        set->ctx[idx] = ctx;
        set->timeout[idx] = 0;
#ifdef HAVE_EPOLL
        set->ev[idx].data.fd = fd;
        set->ev[idx].events = events;
        struct epoll_event ev = {
                .data.u64 = idx,
                .events = events
        };
        if (epoll_ctl(set->pfd, EPOLL_CTL_ADD, fd, &ev) != 0) {
                return knot_map_errno();
        }
#elif HAVE_KQUEUE
        EV_SET(&set->ev[idx], fd, events, EV_ADD, 0, 0, (void *)(intptr_t)idx);
        if (kevent(set->pfd, &set->ev[idx], 1, NULL, 0, NULL) < 0) {
                return knot_map_errno();
        }
#else
        set->pfd[idx].fd = fd;
        set->pfd[idx].events = events;
        set->pfd[idx].revents = 0;
#endif

        return idx;
}

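/* Unregister and close the descriptor at 'idx'. The last slot is moved into
 * the freed position, so indexes of later slots are not stable across removals. */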
int fdset_remove(fdset_t *set, const unsigned idx)
{
        if (set == NULL || idx >= set->n) {
                return KNOT_EINVAL;
        }

        const int fd = fdset_get_fd(set, idx);
#ifdef HAVE_EPOLL
        /* This is necessary as DDNS duplicates file descriptors! */
        if (epoll_ctl(set->pfd, EPOLL_CTL_DEL, fd, NULL) != 0) {
                close(fd);
                return knot_map_errno();
        }
#elif HAVE_KQUEUE
        /* Return the delete flag back to the original filter number. */
#if defined(__NetBSD__)
        if ((signed short)set->ev[idx].filter < 0)
#else
        if (set->ev[idx].filter >= 0)
#endif
        {
                set->ev[idx].filter = ~set->ev[idx].filter;
        }
        set->ev[idx].flags = EV_DELETE;
        if (kevent(set->pfd, &set->ev[idx], 1, NULL, 0, NULL) < 0) {
                close(fd);
                return knot_map_errno();
        }
#endif
        close(fd);

        const unsigned last = --set->n;
        /* Nothing else to do if it was the last slot; otherwise move the last slot into 'idx'. */
        if (idx < last) {
                set->ctx[idx] = set->ctx[last];
                set->timeout[idx] = set->timeout[last];
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
                set->ev[idx] = set->ev[last];
#ifdef HAVE_EPOLL
                struct epoll_event ev = {
                        .data.u64 = idx,
                        .events = set->ev[idx].events
                };
                if (epoll_ctl(set->pfd, EPOLL_CTL_MOD, set->ev[last].data.fd, &ev) != 0) {
                        return knot_map_errno();
                }
#elif HAVE_KQUEUE
                EV_SET(&set->ev[idx], set->ev[last].ident, set->ev[last].filter,
                       EV_ADD, 0, 0, (void *)(intptr_t)idx);
                if (kevent(set->pfd, &set->ev[idx], 1, NULL, 0, NULL) < 0) {
                        return knot_map_errno();
                }
#endif
#else
                set->pfd[idx] = set->pfd[last];
#endif
        }

        return KNOT_EOK;
}

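/* Wait for events on the registered descriptors (up to 'timeout_ms' ms, or
 * indefinitely if negative), skipping slots below 'offset', and set up 'it'
 * to iterate over the ready ones. Returns the number of pending events or
 * a negative value on error. */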
int fdset_poll(fdset_t *set, fdset_it_t *it, const unsigned offset, const int timeout_ms)
{
        if (it == NULL) {
                return KNOT_EINVAL;
        }
        it->unprocessed = 0;

        if (set == NULL) {
                return KNOT_EINVAL;
        }

        it->set = set;
        it->idx = offset;
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
        if (set->recv_size != set->size) {
                MEM_RESIZE(set->recv_ev, set->size);
                set->recv_size = set->size;
        }
        it->ptr = set->recv_ev;
        it->dirty = 0;
#ifdef HAVE_EPOLL
        if (set->n == 0) {
                return 0;
        }
        if ((it->unprocessed = epoll_wait(set->pfd, set->recv_ev, set->recv_size,
                                          timeout_ms)) == -1) {
                return knot_map_errno();
        }
#ifndef NDEBUG
        /* In specific circumstances with valgrind, it sometimes happens that
         * `set->n < it->unprocessed`. */
        if (it->unprocessed > 0 && unlikely(it->unprocessed > set->n)) {
                assert(it->unprocessed == 232);
                it->unprocessed = 0;
        }
#endif
#elif HAVE_KQUEUE
        struct timespec timeout = {
                .tv_sec = timeout_ms / 1000,
                .tv_nsec = (timeout_ms % 1000) * 1000000
        };
        if ((it->unprocessed = kevent(set->pfd, NULL, 0, set->recv_ev, set->recv_size,
                                      (timeout_ms >= 0) ? &timeout : NULL)) == -1) {
                return knot_map_errno();
        }
#endif
        /*
         * NOTE: The offset cannot be skipped without a bunch of syscalls, so the
         * call waits on all `set->n` sockets. The offset is non-zero when TCP
         * throttling is on. Occasionally none of the returned sockets is a
         * connected socket, but that should not be common.
         */
        while (it->unprocessed > 0 && fdset_it_get_idx(it) < it->idx) {
                it->ptr++;
                it->unprocessed--;
        }
        return it->unprocessed;
#else
        it->unprocessed = poll(&set->pfd[offset], set->n - offset, timeout_ms);
#ifndef NDEBUG
        /* In specific circumstances with valgrind, it sometimes happens that
         * `set->n < it->unprocessed`. */
        if (it->unprocessed > 0 && unlikely(it->unprocessed > set->n - offset)) {
                assert(it->unprocessed == 7);
                it->unprocessed = 0;
        }
#endif
        while (it->unprocessed > 0 && set->pfd[it->idx].revents == 0) {
                it->idx++;
        }
        return it->unprocessed;
#endif
}

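/* Apply removals that were marked on the iterator during event processing;
 * this is only needed for the epoll/kqueue backends. */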
void fdset_it_commit(fdset_it_t *it)
{
        if (it == NULL) {
                return;
        }
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
        /* NOTE: Iterate in reverse so that removals hit the cheap "remove the
         *       last slot" case as often as possible and skip some syscalls
         *       early in the iteration, though the actual performance gain is
         *       uncertain.
         */
        fdset_t *set = it->set;
        for (int i = set->n - 1; it->dirty > 0 && i >= 0; --i) {
#ifdef HAVE_EPOLL
                if (set->ev[i].events == FDSET_REMOVE_FLAG)
#else
#if defined(__NetBSD__)
                if ((signed short)set->ev[i].filter < 0)
#else
                if (set->ev[i].filter >= 0)
#endif
#endif
                {
                        (void)fdset_remove(set, i);
                        it->dirty--;
                }
        }
        assert(it->dirty == 0);
#endif
}

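/* Arm a sweep watchdog for the slot at 'idx' to expire 'interval' seconds from
 * now (seconds precision only); a negative interval lifts the watchdog. */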
int fdset_set_watchdog(fdset_t *set, const unsigned idx, const int interval)
{
        if (set == NULL || idx >= set->n) {
                return KNOT_EINVAL;
        }

        /* Lift watchdog if interval is negative. */
        if (interval < 0) {
                set->timeout[idx] = 0;
                return KNOT_EOK;
        }

        /* Update clock. */
        const struct timespec now = time_now();
        set->timeout[idx] = now.tv_sec + interval; /* Only seconds precision. */

        return KNOT_EOK;
}

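/* Call the callback for every slot whose watchdog deadline has passed and
 * remove the slots for which it returns FDSET_SWEEP. */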
void fdset_sweep(fdset_t *set, const fdset_sweep_cb_t cb, void *data)
{
        if (set == NULL || cb == NULL) {
                return;
        }

        /* Get time threshold. */
        const struct timespec now = time_now();
        unsigned idx = 0;
        while (idx < set->n) {
                /* Check sweep state, remove if requested. */
                if (set->timeout[idx] > 0 && set->timeout[idx] <= now.tv_sec) {
                        const int fd = fdset_get_fd(set, idx);
                        if (cb(set, fd, data) == FDSET_SWEEP) {
                                (void)fdset_remove(set, idx);
                                continue;
                        }
                }
                ++idx;
        }
}