1 /*
2 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3 *
4 * SPDX-License-Identifier: MPL-2.0
5 *
6 * This Source Code Form is subject to the terms of the Mozilla Public
7 * License, v. 2.0. If a copy of the MPL was not distributed with this
8 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
9 *
10 * See the COPYRIGHT file distributed with this work for additional
11 * information regarding copyright ownership.
12 */
13
14 /*! \file */
15
16 #include <inttypes.h>
17 #include <stdbool.h>
18 #include <sys/param.h>
19 #include <sys/socket.h>
20 #include <sys/stat.h>
21 #include <sys/types.h>
22 #if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
23 #include <sys/sysctl.h>
24 #endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
25 #include <sys/time.h>
26 #include <sys/uio.h>
27
28 #if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
29 #include <linux/netlink.h>
30 #include <linux/rtnetlink.h>
31 #endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
32 */
33
34 #include <errno.h>
35 #include <fcntl.h>
36 #include <stddef.h>
37 #include <stdlib.h>
38 #include <unistd.h>
39
40 #include <isc/app.h>
41 #include <isc/buffer.h>
42 #include <isc/condition.h>
43 #include <isc/formatcheck.h>
44 #include <isc/list.h>
45 #include <isc/log.h>
46 #include <isc/mem.h>
47 #include <isc/mutex.h>
48 #include <isc/net.h>
49 #include <isc/once.h>
50 #include <isc/platform.h>
51 #include <isc/print.h>
52 #include <isc/refcount.h>
53 #include <isc/region.h>
54 #include <isc/resource.h>
55 #include <isc/socket.h>
56 #include <isc/stats.h>
57 #include <isc/strerr.h>
58 #include <isc/string.h>
59 #include <isc/task.h>
60 #include <isc/thread.h>
61 #include <isc/util.h>
62
63 #ifdef ISC_PLATFORM_HAVESYSUNH
64 #include <sys/un.h>
65 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
66 #ifdef HAVE_KQUEUE
67 #include <sys/event.h>
68 #endif /* ifdef HAVE_KQUEUE */
69 #ifdef HAVE_EPOLL_CREATE1
70 #include <sys/epoll.h>
71 #endif /* ifdef HAVE_EPOLL_CREATE1 */
72 #if defined(HAVE_SYS_DEVPOLL_H)
73 #include <sys/devpoll.h>
74 #elif defined(HAVE_DEVPOLL_H)
75 #include <devpoll.h>
76 #endif /* if defined(HAVE_SYS_DEVPOLL_H) */
77
78 #include <netinet/tcp.h>
79
80 #include "errno2result.h"
81
82 #ifdef ENABLE_TCP_FASTOPEN
83 #include <netinet/tcp.h>
84 #endif /* ifdef ENABLE_TCP_FASTOPEN */
85
86 #ifdef HAVE_JSON_C
87 #include <json_object.h>
88 #endif /* HAVE_JSON_C */
89
90 #ifdef HAVE_LIBXML2
91 #include <libxml/xmlwriter.h>
92 #define ISC_XMLCHAR (const xmlChar *)
93 #endif /* HAVE_LIBXML2 */
94
95 /*%
96 * Choose the most preferable multiplex method.
97 */
98 #if defined(HAVE_KQUEUE)
99 #define USE_KQUEUE
100 #elif defined(HAVE_EPOLL_CREATE1)
101 #define USE_EPOLL
102 #elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
103 #define USE_DEVPOLL
104 typedef struct {
105 unsigned int want_read : 1, want_write : 1;
106 } pollinfo_t;
107 #else /* if defined(HAVE_KQUEUE) */
108 #define USE_SELECT
109 #endif /* HAVE_KQUEUE */
110
111 /*
112 * Set by the -T dscp option on the command line. If set to a value
113 * other than -1, we check to make sure DSCP values match it, and
114 * assert if not.
115 */
116 int isc_dscp_check_value = -1;
117
118 /*%
119 * Maximum number of allowable open sockets. This is also the maximum
120 * allowable socket file descriptor.
121 *
122 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure that select() accepts more than (the system
124 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
125 * the vast majority of cases. This constant should therefore be increased only
126 * when absolutely necessary and possible, i.e., the server is exhausting all
127 * available file descriptors (up to FD_SETSIZE) and the select() function
128 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
130 * portability as possible). Note also that overall server performance
131 * may be rather worsened with a larger value of this constant due to
132 * inherent scalability problems of select().
133 *
134 * As a special note, this value shouldn't have to be touched if
135 * this is a build for an authoritative only DNS server.
136 */
137 #ifndef ISC_SOCKET_MAXSOCKETS
138 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
139 #ifdef TUNE_LARGE
140 #define ISC_SOCKET_MAXSOCKETS 21000
141 #else /* ifdef TUNE_LARGE */
142 #define ISC_SOCKET_MAXSOCKETS 4096
143 #endif /* TUNE_LARGE */
144 #elif defined(USE_SELECT)
145 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
146 #endif /* USE_KQUEUE... */
147 #endif /* ISC_SOCKET_MAXSOCKETS */
148
149 #ifdef USE_SELECT
150 /*%
151 * Mac OS X needs a special definition to support larger values in select().
152 * We always define this because a larger value can be specified run-time.
153 */
154 #ifdef __APPLE__
155 #define _DARWIN_UNLIMITED_SELECT
156 #endif /* __APPLE__ */
157 #endif /* USE_SELECT */
158
159 #ifdef ISC_SOCKET_USE_POLLWATCH
160 /*%
161 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
162 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
163 * some of the specified FD. The idea is based on the observation that it's
164 * likely for a busy server to keep receiving packets. It specifically works
165 * as follows: the socket watcher is first initialized with the state of
166 * "poll_idle". While it's in the idle state it keeps sleeping until a socket
167 * event occurs. When it wakes up for a socket I/O event, it moves to the
168 * poll_active state, and sets the poll timeout to a short period
169 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the
170 * watcher goes to the poll_checking state with the same timeout period.
171 * In this state, the watcher tries to detect whether this is a break
172 * during intermittent events or the kernel bug is triggered. If the next
173 * polling reports an event within the short period, the previous timeout is
174 * likely to be a kernel bug, and so the watcher goes back to the active state.
175 * Otherwise, it moves to the idle state again.
176 *
177 * It's not clear whether this is a thread-related bug, but since we've only
178 * seen this with threads, this workaround is used only when enabling threads.
179 */
180
181 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
182
183 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
184 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
185 #endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
186 #endif /* ISC_SOCKET_USE_POLLWATCH */
187
188 /*%
189 * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
190 */
191 #define FDLOCK_BITS 10
192 #define FDLOCK_COUNT (1 << FDLOCK_BITS)
193 #define FDLOCK_ID(fd) \
194 (((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
195 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
196
197 /*%
198 * Maximum number of events communicated with the kernel. There should normally
199 * be no need for having a large number.
200 */
201 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
202 #ifndef ISC_SOCKET_MAXEVENTS
203 #ifdef TUNE_LARGE
204 #define ISC_SOCKET_MAXEVENTS 2048
205 #else /* ifdef TUNE_LARGE */
206 #define ISC_SOCKET_MAXEVENTS 64
207 #endif /* TUNE_LARGE */
208 #endif /* ifndef ISC_SOCKET_MAXEVENTS */
209 #endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
210 * */
211
212 /*%
213 * Some systems define the socket length argument as an int, some as size_t,
214 * some as socklen_t. This is here so it can be easily changed if needed.
215 */
216 #ifndef socklen_t
217 #define socklen_t unsigned int
218 #endif /* ifndef socklen_t */
219
220 /*%
221 * Define what the possible "soft" errors can be. These are non-fatal returns
222 * of various network related functions, like recv() and so on.
223 *
224 * For some reason, BSDI (and perhaps others) will sometimes return <0
225 * from recv() but will have errno==0. This is broken, but we have to
226 * work around it here.
227 */
228 #define SOFT_ERROR(e) \
229 ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
230 (e) == EINTR || (e) == 0)
231
232 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
233
234 /*!<
235 * DLVL(90) -- Function entry/exit and other tracing.
236 * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
237 * DLVL(60) -- Socket data send/receive
238 * DLVL(50) -- Event tracing, including receiving/sending completion events.
239 * DLVL(20) -- Socket creation/destruction.
240 */
241 #define TRACE_LEVEL 90
242 #define CORRECTNESS_LEVEL 70
243 #define IOEVENT_LEVEL 60
244 #define EVENT_LEVEL 50
245 #define CREATION_LEVEL 20
246
247 #define TRACE DLVL(TRACE_LEVEL)
248 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
249 #define IOEVENT DLVL(IOEVENT_LEVEL)
250 #define EVENT DLVL(EVENT_LEVEL)
251 #define CREATION DLVL(CREATION_LEVEL)
252
253 typedef isc_event_t intev_t;
254
255 #define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
256 #define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
257
258 /*!
259 * IPv6 control information. If the socket is an IPv6 socket we want
260 * to collect the destination address and interface so the client can
261 * set them on outgoing packets.
262 */
263 #ifndef USE_CMSG
264 #define USE_CMSG 1
265 #endif /* ifndef USE_CMSG */
266
267 /*%
268 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have
269 * a setsockopt() like interface to request timestamps, and if the OS
270 * doesn't do it for us, call gettimeofday() on every UDP receive?
271 */
272 #ifdef SO_TIMESTAMP
273 #ifndef USE_CMSG
274 #define USE_CMSG 1
275 #endif /* ifndef USE_CMSG */
276 #endif /* ifdef SO_TIMESTAMP */
277
278 #if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
279 #define SET_RCVBUF
280 #endif
281
282 #if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
283 #define SET_SNDBUF
284 #endif
285
286 /*%
287 * Instead of calculating the cmsgbuf lengths every time we take
288 * a rule of thumb approach - sizes are taken from x86_64 linux,
289 * multiplied by 2, everything should fit. Those sizes are not
290 * large enough to cause any concern.
291 */
292 #if defined(USE_CMSG)
293 #define CMSG_SP_IN6PKT 40
294 #else /* if defined(USE_CMSG) */
295 #define CMSG_SP_IN6PKT 0
296 #endif /* if defined(USE_CMSG) */
297
298 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
299 #define CMSG_SP_TIMESTAMP 32
300 #else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
301 #define CMSG_SP_TIMESTAMP 0
302 #endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
303
304 #if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
305 #define CMSG_SP_TCTOS 24
306 #else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
307 #define CMSG_SP_TCTOS 0
308 #endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
309
310 #define CMSG_SP_INT 24
311
312 /* Align cmsg buffers to be safe on SPARC etc. */
313 #define RECVCMSGBUFLEN \
314 ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
315 1, \
316 sizeof(void *))
317 #define SENDCMSGBUFLEN \
318 ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
319 sizeof(void *))
320
321 /*%
322 * The number of times a send operation is repeated if the result is EINTR.
323 */
324 #define NRETRIES 10
325
326 typedef struct isc__socketthread isc__socketthread_t;
327
328 #define NEWCONNSOCK(ev) ((ev)->newsocket)
329
struct isc_socket {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MAGIC; validated with VALID_SOCKET() */
	isc_socketmgr_t *manager; /* owning manager */
	isc_mutex_t lock;	  /* guards the fields below */
	isc_sockettype_t type;
	const isc_statscounter_t *statsindex; /* one of the *statsindex[]
					       * tables, chosen per type */
	isc_refcount_t references;

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t) link; /* entry on manager->socklist */
	int fd;			     /* OS file descriptor */
	int pf;			     /* protocol family */
	int threadid; /* index of the servicing isc__socketthread */
	char name[16];
	void *tag;

	/* Queues of outstanding I/O requests, completed in order. */
	ISC_LIST(isc_socketevent_t) send_list;
	ISC_LIST(isc_socketevent_t) recv_list;
	ISC_LIST(isc_socket_newconnev_t) accept_list;
	ISC_LIST(isc_socket_connev_t) connect_list;

	isc_sockaddr_t peer_address; /* remote address */

	unsigned int listener : 1,	       /* listener socket */
		connected : 1, connecting : 1, /* connect pending */
		bound : 1,		       /* bound to local addr */
		dupped : 1, active : 1,	       /* currently active */
		pktdscp : 1;		       /* per packet dscp */

#ifdef ISC_PLATFORM_RECVOVERFLOW
	unsigned char overflow; /* used for MSG_TRUNC fake */
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	unsigned int dscp;
};
367
368 #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
369 #define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
370
struct isc_socketmgr {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MANAGER_MAGIC; see VALID_MANAGER() */
	isc_mem_t *mctx;
	isc_mutex_t lock;
	isc_stats_t *stats; /* may be NULL; see inc_stats()/dec_stats() */
	int nthreads;	    /* size of the 'threads' array */
	isc__socketthread_t *threads;
	unsigned int maxsocks; /* upper bound on managed fd values */
	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t) socklist;
	int reserved; /* unlocked */
	isc_condition_t shutdown_ok;
	size_t maxudp;
};
386
/*
 * Per-watcher-thread state.  Each thread owns a slice of the fd space and
 * one backend event mechanism instance (kqueue/epoll/devpoll/select).
 */
struct isc__socketthread {
	isc_socketmgr_t *manager; /* back-pointer to the owning manager */
	int threadid;		  /* this thread's index in manager->threads */
	isc_thread_t thread;
	int pipe_fds[2]; /* self-pipe used to poke the watcher:
			  * [0] read end (select_readmsg()),
			  * [1] write end (select_poke()) */
	isc_mutex_t *fdlock; /* per-fd lock buckets, indexed by FDLOCK_ID() */
	/* Locked by fdlock. */
	isc_socket_t **fds; /* fd -> socket map */
	int *fdstate;	    /* fd -> CLOSED/MANAGED/CLOSE_PENDING */
#ifdef USE_KQUEUE
	int kqueue_fd;
	int nevents;
	struct kevent *events;
#endif /* USE_KQUEUE */
#ifdef USE_EPOLL
	int epoll_fd;
	int nevents;
	struct epoll_event *events;
	uint32_t *epoll_events; /* cached per-fd EPOLLIN/EPOLLOUT mask */
#endif /* USE_EPOLL */
#ifdef USE_DEVPOLL
	int devpoll_fd;
	isc_resourcevalue_t open_max;
	unsigned int calls;
	int nevents;
	struct pollfd *events;
	pollinfo_t *fdpollinfo; /* what we asked /dev/poll to watch */
#endif /* USE_DEVPOLL */
#ifdef USE_SELECT
	int fd_bufsize;
	fd_set *read_fds;
	fd_set *read_fds_copy;
	fd_set *write_fds;
	fd_set *write_fds_copy;
	int maxfd;
#endif /* USE_SELECT */
};
424
425 #define CLOSED 0 /* this one must be zero */
426 #define MANAGED 1
427 #define CLOSE_PENDING 2
428
429 /*
430 * send() and recv() iovec counts
431 */
432 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
433 #ifdef ISC_PLATFORM_RECVOVERFLOW
434 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
435 #else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
436 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
437 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
438
439 static isc_result_t
440 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
441 isc_socket_t **socketp, isc_socket_t *dup_socket);
442 static void
443 send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
444 static void
445 send_senddone_event(isc_socket_t *, isc_socketevent_t **);
446 static void
447 send_connectdone_event(isc_socket_t *, isc_socket_connev_t **);
448 static void
449 free_socket(isc_socket_t **);
450 static isc_result_t
451 allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **);
452 static void
453 destroy(isc_socket_t **);
454 static void
455 internal_accept(isc_socket_t *);
456 static void
457 internal_connect(isc_socket_t *);
458 static void
459 internal_recv(isc_socket_t *);
460 static void
461 internal_send(isc_socket_t *);
462 static void
463 process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
464 static void
465 build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
466 struct iovec *, size_t *);
467 static void
468 build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
469 struct iovec *, size_t *);
470 static bool
471 process_ctlfd(isc__socketthread_t *thread);
472 static void
473 setdscp(isc_socket_t *sock, isc_dscp_t dscp);
474
475 #define SELECT_POKE_SHUTDOWN (-1)
476 #define SELECT_POKE_NOTHING (-2)
477 #define SELECT_POKE_READ (-3)
478 #define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */
479 #define SELECT_POKE_WRITE (-4)
480 #define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */
481 #define SELECT_POKE_CLOSE (-5)
482
/*%
 * Shortcut index arrays to get access to statistics counters.
 * These values index the per-socket-type *statsindex[] tables below.
 */
enum {
	STATID_OPEN = 0,	/* socket opened */
	STATID_OPENFAIL = 1,	/* open attempt failed */
	STATID_CLOSE = 2,	/* socket closed */
	STATID_BINDFAIL = 3,	/* bind failed */
	STATID_CONNECTFAIL = 4, /* connect failed */
	STATID_CONNECT = 5,	/* connect succeeded */
	STATID_ACCEPTFAIL = 6,	/* accept failed */
	STATID_ACCEPT = 7,	/* accept succeeded */
	STATID_SENDFAIL = 8,	/* send failed */
	STATID_RECVFAIL = 9,	/* receive failed */
	STATID_ACTIVE = 10	/* currently active sockets */
};
/*
 * Counter tables indexed by the STATID_* values.  An entry of -1 marks an
 * event that cannot occur for that socket type (e.g. accept on UDP); such
 * entries must never reach inc_stats()/dec_stats(), which REQUIRE() a
 * counter id of != -1.
 */
static const isc_statscounter_t udp4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
};
static const isc_statscounter_t udp6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open, isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close, isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail, isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail, isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open, isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close, isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail, isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail, isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen, isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose, isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail, isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail, isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
};
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1,
	-1,
	-1,
	-1,
	-1,
	-1,
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
562
563 static int
564 gen_threadid(isc_socket_t *sock);
565
566 static int
gen_threadid(isc_socket_t * sock)567 gen_threadid(isc_socket_t *sock) {
568 return (sock->fd % sock->manager->nthreads);
569 }
570
571 static void
572 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
573 isc_logmodule_t *module, int level, const char *fmt, ...)
574 ISC_FORMAT_PRINTF(5, 6);
575 static void
manager_log(isc_socketmgr_t * sockmgr,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)576 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
577 isc_logmodule_t *module, int level, const char *fmt, ...) {
578 char msgbuf[2048];
579 va_list ap;
580
581 if (!isc_log_wouldlog(isc_lctx, level)) {
582 return;
583 }
584
585 va_start(ap, fmt);
586 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
587 va_end(ap);
588
589 isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
590 sockmgr, msgbuf);
591 }
592
593 static void
594 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
595 isc_logmodule_t *module, int level, const char *fmt, ...)
596 ISC_FORMAT_PRINTF(5, 6);
597 static void
thread_log(isc__socketthread_t * thread,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)598 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
599 isc_logmodule_t *module, int level, const char *fmt, ...) {
600 char msgbuf[2048];
601 va_list ap;
602
603 if (!isc_log_wouldlog(isc_lctx, level)) {
604 return;
605 }
606
607 va_start(ap, fmt);
608 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
609 va_end(ap);
610
611 isc_log_write(isc_lctx, category, module, level,
612 "sockmgr %p thread %d: %s", thread->manager,
613 thread->threadid, msgbuf);
614 }
615
616 static void
617 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
618 isc_logcategory_t *category, isc_logmodule_t *module, int level,
619 const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
620 static void
socket_log(isc_socket_t * sock,const isc_sockaddr_t * address,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)621 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
622 isc_logcategory_t *category, isc_logmodule_t *module, int level,
623 const char *fmt, ...) {
624 char msgbuf[2048];
625 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
626 va_list ap;
627
628 if (!isc_log_wouldlog(isc_lctx, level)) {
629 return;
630 }
631
632 va_start(ap, fmt);
633 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
634 va_end(ap);
635
636 if (address == NULL) {
637 isc_log_write(isc_lctx, category, module, level,
638 "socket %p: %s", sock, msgbuf);
639 } else {
640 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
641 isc_log_write(isc_lctx, category, module, level,
642 "socket %p %s: %s", sock, peerbuf, msgbuf);
643 }
644 }
645
646 /*%
647 * Increment socket-related statistics counters.
648 */
649 static inline void
inc_stats(isc_stats_t * stats,isc_statscounter_t counterid)650 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
651 REQUIRE(counterid != -1);
652
653 if (stats != NULL) {
654 isc_stats_increment(stats, counterid);
655 }
656 }
657
658 /*%
659 * Decrement socket-related statistics counters.
660 */
661 static inline void
dec_stats(isc_stats_t * stats,isc_statscounter_t counterid)662 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
663 REQUIRE(counterid != -1);
664
665 if (stats != NULL) {
666 isc_stats_decrement(stats, counterid);
667 }
668 }
669
/*%
 * Ask this thread's backend event mechanism to start watching 'fd':
 * for readability when 'msg' is SELECT_POKE_READ (== SELECT_POKE_ACCEPT),
 * otherwise for writability.  Returns ISC_R_SUCCESS or the translated
 * errno from the kernel interface.
 */
static inline isc_result_t
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	uint32_t oldevents;
	int ret;
	int op;

	/* Merge the new interest bit into the cached per-fd event mask. */
	oldevents = thread->epoll_events[fd];
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] |= EPOLLIN;
	} else {
		thread->epoll_events[fd] |= EPOLLOUT;
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	/* First interest on this fd registers it; later ones modify it. */
	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	if (thread->fds[fd] != NULL) {
		LOCK(&thread->fds[fd]->lock);
	}
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	if (thread->fds[fd] != NULL) {
		UNLOCK(&thread->fds[fd]->lock);
	}
	if (ret == -1) {
		if (errno == EEXIST) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d",
					 fd);
		}
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ) {
		pfd.events = POLLIN;
	} else {
		pfd.events = POLLOUT;
	}
	pfd.fd = fd;
	pfd.revents = 0;
	/* Writing a pollfd to the /dev/poll fd adds it to the watched set. */
	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
		result = isc__errno2result(errno);
	} else {
		/* Record the request; unwatch_fd() needs this to re-poll. */
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 1;
		} else {
			thread->fdpollinfo[fd].want_write = 1;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	/* The manager lock guards the shared fd_set bitmaps. */
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_SET(fd, thread->read_fds);
	}
	if (msg == SELECT_POKE_WRITE) {
		FD_SET(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
761
/*%
 * Ask this thread's backend event mechanism to stop watching 'fd' for
 * readability (msg == SELECT_POKE_READ) or writability (otherwise).
 * Returns ISC_R_SUCCESS, a translated errno, or ISC_R_UNEXPECTED.
 */
static inline isc_result_t
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	int ret;
	int op;

	/* Clear the interest bit from the cached per-fd event mask. */
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] &= ~(EPOLLIN);
	} else {
		thread->epoll_events[fd] &= ~(EPOLLOUT);
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	/* No interest left deregisters the fd; otherwise just modify it. */
	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	/* ENOENT (fd already gone from the epoll set) is tolerated. */
	if (ret == -1 && errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
				 fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
		result = isc__errno2result(errno);
	} else {
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 0;
		} else {
			thread->fdpollinfo[fd].want_write = 0;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	/* The manager lock guards the shared fd_set bitmaps. */
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_CLR(fd, thread->read_fds);
	} else if (msg == SELECT_POKE_WRITE) {
		FD_CLR(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
854
/*
 * A poke message was received; perform the proper watch/unwatch
 * on the fd provided.
 */
static void
wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* Complete a pending close: stop watching, then close. */
		LOCK(&thread->fdlock[lockid]);
		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
		thread->fdstate[fd] = CLOSED;
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * NOTE(review): an earlier comment here said unwatch_fd() must
		 * be called after releasing the fdlock to avoid a lock-order
		 * reversal, but the calls below are made while the fdlock is
		 * still held -- confirm the intended locking order.
		 */
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}
	if (thread->fdstate[fd] != MANAGED) {
		/* Not a managed fd (e.g. already closed); nothing to do. */
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	/*
	 * Set requested bit.
	 */
	result = watch_fd(thread, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s", fd,
			      isc_result_totext(result));
	}
	UNLOCK(&thread->fdlock[lockid]);
}
920
921 /*
922 * Poke the select loop when there is something for us to do.
923 * The write is required (by POSIX) to complete. That is, we
924 * will not get partial writes.
925 */
926 static void
select_poke(isc_socketmgr_t * mgr,int threadid,int fd,int msg)927 select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
928 int cc;
929 int buf[2];
930 char strbuf[ISC_STRERRORSIZE];
931
932 buf[0] = fd;
933 buf[1] = msg;
934
935 do {
936 cc = write(mgr->threads[threadid].pipe_fds[1], buf,
937 sizeof(buf));
938 #ifdef ENOSR
939 /*
940 * Treat ENOSR as EAGAIN but loop slowly as it is
941 * unlikely to clear fast.
942 */
943 if (cc < 0 && errno == ENOSR) {
944 sleep(1);
945 errno = EAGAIN;
946 }
947 #endif /* ifdef ENOSR */
948 } while (cc < 0 && SOFT_ERROR(errno));
949
950 if (cc < 0) {
951 strerror_r(errno, strbuf, sizeof(strbuf));
952 FATAL_ERROR(__FILE__, __LINE__,
953 "write() failed during watcher poke: %s", strbuf);
954 }
955
956 INSIST(cc == sizeof(buf));
957 }
958
959 /*
960 * Read a message on the internal fd.
961 */
962 static void
select_readmsg(isc__socketthread_t * thread,int * fd,int * msg)963 select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
964 int buf[2];
965 int cc;
966 char strbuf[ISC_STRERRORSIZE];
967
968 cc = read(thread->pipe_fds[0], buf, sizeof(buf));
969 if (cc < 0) {
970 *msg = SELECT_POKE_NOTHING;
971 *fd = -1; /* Silence compiler. */
972 if (SOFT_ERROR(errno)) {
973 return;
974 }
975
976 strerror_r(errno, strbuf, sizeof(strbuf));
977 FATAL_ERROR(__FILE__, __LINE__,
978 "read() failed during watcher poke: %s", strbuf);
979 }
980 INSIST(cc == sizeof(buf));
981
982 *fd = buf[0];
983 *msg = buf[1];
984 }
985
/*
 * Make a fd non-blocking, using either the FIONBIO ioctl or fcntl()
 * depending on the platform.  Returns ISC_R_SUCCESS or ISC_R_UNEXPECTED.
 */
static isc_result_t
make_nonblock(int fd) {
	int ret;
	char strbuf[ISC_STRERRORSIZE];
#ifdef USE_FIONBIO_IOCTL
	int on = 1;
#else /* ifdef USE_FIONBIO_IOCTL */
	int flags;
#endif /* ifdef USE_FIONBIO_IOCTL */

#ifdef USE_FIONBIO_IOCTL
	ret = ioctl(fd, FIONBIO, (char *)&on);
#else /* ifdef USE_FIONBIO_IOCTL */
	/* PORT_NONBLOCK is presumably the platform's O_NONBLOCK-style flag;
	 * defined elsewhere in the port headers. */
	flags = fcntl(fd, F_GETFL, 0);
	flags |= PORT_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
#endif /* ifdef USE_FIONBIO_IOCTL */

	if (ret == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		/* The error message matches whichever call was compiled in. */
		UNEXPECTED_ERROR(__FILE__, __LINE__,
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else /* ifdef USE_FIONBIO_IOCTL */
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif /* ifdef USE_FIONBIO_IOCTL */
				 strbuf);

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}
1022
1023 #ifdef USE_CMSG
1024 /*
1025 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1026 * In order to ensure as much portability as possible, we provide wrapper
1027 * functions of these macros.
1028 * Note that cmsg_space() could run slow on OSes that do not have
1029 * CMSG_SPACE.
1030 */
/*
 * Portable CMSG_LEN(): the length of a cmsghdr plus 'len' bytes of
 * payload (header padding included, trailing padding excluded).
 */
static inline socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else /* ifdef CMSG_LEN */
	/*
	 * Fallback: ask CMSG_DATA where the payload of a header placed
	 * at address zero would begin; that offset is the header
	 * overhead.  The NULL cast keeps the pointer arithmetic inside
	 * CMSG_DATA well-typed.
	 */
	socklen_t overhead;

	overhead = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
	return (overhead + len);
#endif /* ifdef CMSG_LEN */
}
1046
/*
 * Portable CMSG_SPACE(): the total (aligned) buffer space consumed by
 * a control message carrying 'len' bytes of payload.
 */
static inline socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else /* ifdef CMSG_SPACE */
	struct msghdr probe;
	struct cmsghdr *hdr;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char scratch[sizeof(struct cmsghdr) + 1024];

	/*
	 * Fallback: lay a header claiming 'len' payload bytes into a
	 * scratch buffer and let CMSG_NXTHDR compute where the next
	 * aligned header would start; that offset is the space used.
	 */
	memset(&probe, 0, sizeof(probe));
	probe.msg_control = scratch;
	probe.msg_controllen = sizeof(scratch);

	hdr = (struct cmsghdr *)scratch;
	hdr->cmsg_len = cmsg_len(len);

	hdr = CMSG_NXTHDR(&probe, hdr);
	if (hdr == NULL) {
		return (0);
	}
	return ((char *)hdr - (char *)probe.msg_control);
#endif /* ifdef CMSG_SPACE */
}
1075 #endif /* USE_CMSG */
1076
1077 /*
1078 * Process control messages received on a socket.
1079 */
static void
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
	struct in6_pktinfo *pktinfop;
#ifdef SO_TIMESTAMP
	void *timevalp;
#endif /* ifdef SO_TIMESTAMP */
#endif /* ifdef USE_CMSG */

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

	/* Record datagram truncation reported by the kernel. */
#ifdef MSG_TRUNC
	if ((msg->msg_flags & MSG_TRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
	}
#endif /* ifdef MSG_TRUNC */

	/* Record control-data truncation as well. */
#ifdef MSG_CTRUNC
	if ((msg->msg_flags & MSG_CTRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
	}
#endif /* ifdef MSG_CTRUNC */

#ifndef USE_CMSG
	return;
#else /* ifndef USE_CMSG */
	if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
		return;
	}

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif /* ifdef SO_TIMESTAMP */
	pktinfop = NULL;

	/*
	 * Walk the control-message chain and copy the bits we care
	 * about (IPv6 pktinfo, receive timestamp, DSCP) into the event.
	 */
	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);

		/* IPv6 packet info: receiving interface and address. */
		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_PKTINFO) {
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memmove(&dev->pktinfo, pktinfop,
				sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			}
			goto next;
		}

		/* Kernel receive timestamp. */
#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET &&
		    cmsgp->cmsg_type == SCM_TIMESTAMP) {
			struct timeval tv;
			timevalp = CMSG_DATA(cmsgp);
			memmove(&tv, timevalp, sizeof(tv));
			dev->timestamp.seconds = tv.tv_sec;
			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif /* ifdef SO_TIMESTAMP */

		/* IPv6 traffic class; DSCP is the upper six bits. */
#ifdef IPV6_TCLASS
		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_TCLASS) {
			dev->dscp = *(int *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IPV6_TCLASS */

		/* IPv4 TOS byte; again, DSCP is the upper six bits. */
#ifdef IP_TOS
		if (cmsgp->cmsg_level == IPPROTO_IP &&
		    (cmsgp->cmsg_type == IP_TOS
#ifdef IP_RECVTOS
		     || cmsgp->cmsg_type == IP_RECVTOS
#endif /* ifdef IP_RECVTOS */
		     ))
		{
			dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IP_TOS */
	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */
}
1185
1186 /*
1187 * Construct an iov array and attach it to the msghdr passed in. This is
1188 * the SEND constructor, which will use the used region of the buffer
1189 * (if using a buffer list) or will use the internal region (if a single
1190 * buffer I/O is requested).
1191 *
1192 * Nothing can be NULL, and the done event must list at least one buffer
1193 * on the buffer linked list for this function to be meaningful.
1194 *
1195 * If write_countp != NULL, *write_countp will hold the number of bytes
1196 * this transaction can send.
1197 */
static void
build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
	unsigned int iovcount;
	size_t write_count;
	struct cmsghdr *cmsgp;

	memset(msg, 0, sizeof(*msg));

	/* Unconnected sockets need an explicit destination address. */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	/* Send whatever has not been written yet (dev->n bytes done). */
	write_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = write_count;
	iovcount = 1;

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG)

	/*
	 * Attach IPV6_PKTINFO so the reply leaves from the address and
	 * interface the query arrived on.  This is the first cmsg, so it
	 * starts at the beginning of cmsgbuf; later cmsgs are appended
	 * at the current msg_controllen offset.
	 */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
	{
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp = (struct cmsghdr *)cmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}

#if defined(IPV6_USE_MIN_MTU)
	/* Ask the kernel to fragment to the IPv6 minimum MTU if requested. */
	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
	{
		int use_min_mtu = 1; /* -1, 0, 1 */

		/* Append after any cmsg already placed in the buffer. */
		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
	}
#endif /* if defined(IPV6_USE_MIN_MTU) */

	/* Debug aid: verify the DSCP value matches the configured check. */
	if (isc_dscp_check_value > -1) {
		if (sock->type == isc_sockettype_udp) {
			INSIST((int)dev->dscp == isc_dscp_check_value);
		} else if (sock->type == isc_sockettype_tcp) {
			INSIST((int)sock->dscp == isc_dscp_check_value);
		}
	}

#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
	/*
	 * Apply the DSCP marking: per-packet via a cmsg when the socket
	 * supports it (sock->pktdscp), otherwise per-socket via
	 * setsockopt(), cached in sock->dscp to avoid redundant calls.
	 */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
	{
		int dscp = (dev->dscp << 2) & 0xff;

		INSIST(dev->dscp < 0x40);

#ifdef IP_TOS
		if (sock->pf == AF_INET && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IP;
			cmsgp->cmsg_type = IP_TOS;
			cmsgp->cmsg_len = cmsg_len(sizeof(char));
			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
				       (void *)&dscp, sizeof(int)) < 0) {
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IP_TOS, %.02x)"
						 " failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* ifdef IP_TOS */
#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
		if (sock->pf == AF_INET6 && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IPV6;
			cmsgp->cmsg_type = IPV6_TCLASS;
			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IPV6_TCLASS, "
						 "%.02x) failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
		/* Zero the unused tail of the control buffer. */
		if (msg->msg_controllen != 0 &&
		    msg->msg_controllen < SENDCMSGBUFLEN) {
			memset(cmsgbuf + msg->msg_controllen, 0,
			       SENDCMSGBUFLEN - msg->msg_controllen);
		}
	}
#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
	* defined(IPV6_TCLASS)) \
	* */
#endif /* USE_CMSG */

	if (write_countp != NULL) {
		*write_countp = write_count;
	}
}
1349
1350 /*
1351 * Construct an iov array and attach it to the msghdr passed in. This is
1352 * the RECV constructor, which will use the available region of the buffer
1353 * (if using a buffer list) or will use the internal region (if a single
1354 * buffer I/O is requested).
1355 *
1356 * Nothing can be NULL, and the done event must list at least one buffer
1357 * on the buffer linked list for this function to be meaningful.
1358 *
1359 * If read_countp != NULL, *read_countp will hold the number of bytes
1360 * this transaction can receive.
1361 */
1362 static void
build_msghdr_recv(isc_socket_t * sock,char * cmsgbuf,isc_socketevent_t * dev,struct msghdr * msg,struct iovec * iov,size_t * read_countp)1363 build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
1364 struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
1365 unsigned int iovcount;
1366 size_t read_count;
1367
1368 memset(msg, 0, sizeof(struct msghdr));
1369
1370 if (sock->type == isc_sockettype_udp) {
1371 memset(&dev->address, 0, sizeof(dev->address));
1372 msg->msg_name = (void *)&dev->address.type.sa;
1373 msg->msg_namelen = sizeof(dev->address.type);
1374 } else { /* TCP */
1375 msg->msg_name = NULL;
1376 msg->msg_namelen = 0;
1377 dev->address = sock->peer_address;
1378 }
1379
1380 read_count = dev->region.length - dev->n;
1381 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1382 iov[0].iov_len = read_count;
1383 iovcount = 1;
1384
1385 /*
1386 * If needed, set up to receive that one extra byte.
1387 */
1388 #ifdef ISC_PLATFORM_RECVOVERFLOW
1389 if (sock->type == isc_sockettype_udp) {
1390 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1391 iov[iovcount].iov_base = (void *)(&sock->overflow);
1392 iov[iovcount].iov_len = 1;
1393 iovcount++;
1394 }
1395 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
1396
1397 msg->msg_iov = iov;
1398 msg->msg_iovlen = iovcount;
1399
1400 #if defined(USE_CMSG)
1401 msg->msg_control = cmsgbuf;
1402 msg->msg_controllen = RECVCMSGBUFLEN;
1403 #else /* if defined(USE_CMSG) */
1404 msg->msg_control = NULL;
1405 msg->msg_controllen = 0;
1406 #endif /* USE_CMSG */
1407 msg->msg_flags = 0;
1408
1409 if (read_countp != NULL) {
1410 *read_countp = read_count;
1411 }
1412 }
1413
1414 static void
set_dev_address(const isc_sockaddr_t * address,isc_socket_t * sock,isc_socketevent_t * dev)1415 set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
1416 isc_socketevent_t *dev) {
1417 if (sock->type == isc_sockettype_udp) {
1418 if (address != NULL) {
1419 dev->address = *address;
1420 } else {
1421 dev->address = sock->peer_address;
1422 }
1423 } else if (sock->type == isc_sockettype_tcp) {
1424 INSIST(address == NULL);
1425 dev->address = sock->peer_address;
1426 }
1427 }
1428
1429 static void
destroy_socketevent(isc_event_t * event)1430 destroy_socketevent(isc_event_t *event) {
1431 isc_socketevent_t *ev = (isc_socketevent_t *)event;
1432
1433 (ev->destroy)(event);
1434 }
1435
1436 static isc_socketevent_t *
allocate_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)1437 allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
1438 isc_taskaction_t action, void *arg) {
1439 isc_socketevent_t *ev;
1440
1441 ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
1442 action, arg, sizeof(*ev));
1443
1444 ev->result = ISC_R_UNSET;
1445 ISC_LINK_INIT(ev, ev_link);
1446 ev->region.base = NULL;
1447 ev->n = 0;
1448 ev->offset = 0;
1449 ev->attributes = 0;
1450 ev->destroy = ev->ev_destroy;
1451 ev->ev_destroy = destroy_socketevent;
1452 ev->dscp = 0;
1453
1454 return (ev);
1455 }
1456
1457 #if defined(ISC_SOCKET_DEBUG)
/* Debug helper: print every field of a msghdr to stdout. */
static void
dump_msg(struct msghdr *msg) {
	unsigned int idx;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (idx = 0; idx < (unsigned int)msg->msg_iovlen; idx++) {
		printf("\t\t%u\tbase %p, len %ld\n", idx,
		       msg->msg_iov[idx].iov_base,
		       (long)msg->msg_iov[idx].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
1472 #endif /* if defined(ISC_SOCKET_DEBUG) */
1473
1474 #define DOIO_SUCCESS 0 /* i/o ok, event sent */
1475 #define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */
1476 #define DOIO_HARD 2 /* i/o error, event sent */
1477 #define DOIO_EOF 3 /* EOF, no event sent */
1478
/*
 * Attempt one recvmsg() on 'sock' for the pending request 'dev'.
 * Returns one of the DOIO_* codes above; dev->result is set on
 * DOIO_SUCCESS and DOIO_HARD.
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later library calls may clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * On an unconnected socket the error may concern some other peer, so it
 * is only soft; on a connected socket it is fatal for this request.
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (recv_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (recv_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * Older operating systems may still return EPROTO in some
		 * situations, for example when receiving ICMP/ICMPv6 errors.
		 * A real life scenario is when ICMPv6 returns code 5 or 6.
		 * These codes are introduced in RFC 4443 from March 2006,
		 * and the document obsoletes RFC 1885. But unfortunately not
		 * all operating systems have caught up with the new standard
		 * (in 2020) and thus a generic protocol error is returned.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Anything not matched above is a hard failure. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		/* Record the datagram's source address in the event. */
		dev->address.length = msghdr.msg_namelen;
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp) {
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1640
1641 /*
1642 * Returns:
1643 * DOIO_SUCCESS The operation succeeded. dev->result contains
1644 * ISC_R_SUCCESS.
1645 *
1646 * DOIO_HARD A hard or unexpected I/O error was encountered.
1647 * dev->result contains the appropriate error.
1648 *
1649 * DOIO_SOFT A soft I/O error was encountered. No senddone
1650 * event was sent. The operation should be retried.
1651 *
1652 * No other return values are possible.
1653 */
1654 static int
doio_send(isc_socket_t * sock,isc_socketevent_t * dev)1655 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1656 int cc;
1657 struct iovec iov[MAXSCATTERGATHER_SEND];
1658 size_t write_count;
1659 struct msghdr msghdr;
1660 char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1661 int attempts = 0;
1662 int send_errno;
1663 char strbuf[ISC_STRERRORSIZE];
1664 char cmsgbuf[SENDCMSGBUFLEN] = { 0 };
1665
1666 build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1667
1668 resend:
1669 if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
1670 write_count > sock->manager->maxudp)
1671 {
1672 cc = write_count;
1673 } else {
1674 cc = sendmsg(sock->fd, &msghdr, 0);
1675 }
1676 send_errno = errno;
1677
1678 /*
1679 * Check for error or block condition.
1680 */
1681 if (cc < 0) {
1682 if (send_errno == EINTR && ++attempts < NRETRIES) {
1683 goto resend;
1684 }
1685
1686 if (SOFT_ERROR(send_errno)) {
1687 if (errno == EWOULDBLOCK || errno == EAGAIN) {
1688 dev->result = ISC_R_WOULDBLOCK;
1689 }
1690 return (DOIO_SOFT);
1691 }
1692
1693 #define SOFT_OR_HARD(_system, _isc) \
1694 if (send_errno == _system) { \
1695 if (sock->connected) { \
1696 dev->result = _isc; \
1697 inc_stats(sock->manager->stats, \
1698 sock->statsindex[STATID_SENDFAIL]); \
1699 return (DOIO_HARD); \
1700 } \
1701 return (DOIO_SOFT); \
1702 }
1703 #define ALWAYS_HARD(_system, _isc) \
1704 if (send_errno == _system) { \
1705 dev->result = _isc; \
1706 inc_stats(sock->manager->stats, \
1707 sock->statsindex[STATID_SENDFAIL]); \
1708 return (DOIO_HARD); \
1709 }
1710
1711 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1712 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1713 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1714 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1715 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1716 #ifdef EHOSTDOWN
1717 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1718 #endif /* ifdef EHOSTDOWN */
1719 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1720 SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
1721 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1722 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1723 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1724
1725 #undef SOFT_OR_HARD
1726 #undef ALWAYS_HARD
1727
1728 /*
1729 * The other error types depend on whether or not the
1730 * socket is UDP or TCP. If it is UDP, some errors
1731 * that we expect to be fatal under TCP are merely
1732 * annoying, and are really soft errors.
1733 *
1734 * However, these soft errors are still returned as
1735 * a status.
1736 */
1737 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1738 strerror_r(send_errno, strbuf, sizeof(strbuf));
1739 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1740 addrbuf, strbuf);
1741 dev->result = isc__errno2result(send_errno);
1742 inc_stats(sock->manager->stats,
1743 sock->statsindex[STATID_SENDFAIL]);
1744 return (DOIO_HARD);
1745 }
1746
1747 if (cc == 0) {
1748 inc_stats(sock->manager->stats,
1749 sock->statsindex[STATID_SENDFAIL]);
1750 UNEXPECTED_ERROR(__FILE__, __LINE__,
1751 "doio_send: send() returned 0");
1752 }
1753
1754 /*
1755 * If we write less than we expected, update counters, poke.
1756 */
1757 dev->n += cc;
1758 if ((size_t)cc != write_count) {
1759 return (DOIO_SOFT);
1760 }
1761
1762 /*
1763 * Exactly what we wanted to write. We're done with this
1764 * entry. Post its completion event.
1765 */
1766 dev->result = ISC_R_SUCCESS;
1767 return (DOIO_SUCCESS);
1768 }
1769
1770 /*
1771 * Kill.
1772 *
1773 * Caller must ensure that the socket is not locked and no external
1774 * references exist.
1775 */
/*
 * Hand 'fd' back to its watcher thread for closing and update the
 * manager statistics.  Called with no locks held; the socket must have
 * no remaining external references.
 */
static void
socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) {
	int lockid = FDLOCK_ID(fd);
	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&thread->fdlock[lockid]);
	thread->fds[fd] = NULL;
	thread->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&thread->fdlock[lockid]);
	/* Wake the watcher so it notices the CLOSE_PENDING state. */
	select_poke(thread->manager, thread->threadid, fd, SELECT_POKE_CLOSE);

	inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);

	/* If the socket was counted active, decrement that gauge once. */
	LOCK(&sock->lock);
	if (sock->active == 1) {
		dec_stats(thread->manager->stats,
			  sock->statsindex[STATID_ACTIVE]);
		sock->active = 0;
	}
	UNLOCK(&sock->lock);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&thread->manager->lock);
	if (thread->maxfd == fd) {
		int i;

		/* Scan downward for the next MANAGED descriptor. */
		thread->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&thread->fdlock[lockid]);
			if (thread->fdstate[i] == MANAGED) {
				thread->maxfd = i;
				UNLOCK(&thread->fdlock[lockid]);
				break;
			}
			UNLOCK(&thread->fdlock[lockid]);
		}
		/* The poke pipe must always stay inside the select range. */
		if (thread->maxfd < thread->pipe_fds[0]) {
			thread->maxfd = thread->pipe_fds[0];
		}
	}

	UNLOCK(&thread->manager->lock);
#endif /* USE_SELECT */
}
1828
1829 static void
destroy(isc_socket_t ** sockp)1830 destroy(isc_socket_t **sockp) {
1831 int fd = 0;
1832 isc_socket_t *sock = *sockp;
1833 isc_socketmgr_t *manager = sock->manager;
1834 isc__socketthread_t *thread = NULL;
1835
1836 socket_log(sock, NULL, CREATION, "destroying");
1837
1838 isc_refcount_destroy(&sock->references);
1839
1840 LOCK(&sock->lock);
1841 INSIST(ISC_LIST_EMPTY(sock->connect_list));
1842 INSIST(ISC_LIST_EMPTY(sock->accept_list));
1843 INSIST(ISC_LIST_EMPTY(sock->recv_list));
1844 INSIST(ISC_LIST_EMPTY(sock->send_list));
1845 INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1846
1847 if (sock->fd >= 0) {
1848 fd = sock->fd;
1849 thread = &manager->threads[sock->threadid];
1850 sock->fd = -1;
1851 sock->threadid = -1;
1852 }
1853 UNLOCK(&sock->lock);
1854
1855 if (fd > 0) {
1856 socketclose(thread, sock, fd);
1857 }
1858
1859 LOCK(&manager->lock);
1860
1861 ISC_LIST_UNLINK(manager->socklist, sock, link);
1862
1863 if (ISC_LIST_EMPTY(manager->socklist)) {
1864 SIGNAL(&manager->shutdown_ok);
1865 }
1866
1867 /* can't unlock manager as its memory context is still used */
1868 free_socket(sockp);
1869
1870 UNLOCK(&manager->lock);
1871 }
1872
1873 static isc_result_t
allocate_socket(isc_socketmgr_t * manager,isc_sockettype_t type,isc_socket_t ** socketp)1874 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1875 isc_socket_t **socketp) {
1876 isc_socket_t *sock;
1877
1878 sock = isc_mem_get(manager->mctx, sizeof(*sock));
1879
1880 sock->magic = 0;
1881 isc_refcount_init(&sock->references, 0);
1882
1883 sock->manager = manager;
1884 sock->type = type;
1885 sock->fd = -1;
1886 sock->threadid = -1;
1887 sock->dscp = 0; /* TOS/TCLASS is zero until set. */
1888 sock->dupped = 0;
1889 sock->statsindex = NULL;
1890 sock->active = 0;
1891
1892 ISC_LINK_INIT(sock, link);
1893
1894 memset(sock->name, 0, sizeof(sock->name));
1895 sock->tag = NULL;
1896
1897 /*
1898 * Set up list of readers and writers to be initially empty.
1899 */
1900 ISC_LIST_INIT(sock->recv_list);
1901 ISC_LIST_INIT(sock->send_list);
1902 ISC_LIST_INIT(sock->accept_list);
1903 ISC_LIST_INIT(sock->connect_list);
1904
1905 sock->listener = 0;
1906 sock->connected = 0;
1907 sock->connecting = 0;
1908 sock->bound = 0;
1909 sock->pktdscp = 0;
1910
1911 /*
1912 * Initialize the lock.
1913 */
1914 isc_mutex_init(&sock->lock);
1915
1916 sock->magic = SOCKET_MAGIC;
1917 *socketp = sock;
1918
1919 return (ISC_R_SUCCESS);
1920 }
1921
1922 /*
1923 * This event requires that the various lists be empty, that the reference
1924 * count be 1, and that the magic number is valid. The other socket bits,
1925 * like the lock, must be initialized as well. The fd associated must be
1926 * marked as closed, by setting it to -1 on close, or this routine will
1927 * also close the socket.
1928 */
static void
free_socket(isc_socket_t **socketp) {
	isc_socket_t *sock = *socketp;
	*socketp = NULL;

	INSIST(VALID_SOCKET(sock));
	/* The last reference must already be gone. */
	isc_refcount_destroy(&sock->references);
	/* Verify the socket is fully quiescent before tearing it down. */
	LOCK(&sock->lock);
	INSIST(!sock->connecting);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->connect_list));
	INSIST(!ISC_LINK_LINKED(sock, link));
	UNLOCK(&sock->lock);

	/* Invalidate the magic so stale pointers are caught. */
	sock->magic = 0;

	isc_mutex_destroy(&sock->lock);

	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
}
1951
#if defined(SET_RCVBUF)
/* Guard so the receive-buffer probe below runs only once per process. */
static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
/* Desired SO_RCVBUF; lowered to whatever the kernel actually grants. */
static int rcvbuf = ISC_RECV_BUFFER_SIZE;

/*
 * Determine the largest SO_RCVBUF the kernel will accept, up to the
 * configured 'rcvbuf', by binary search between the system default
 * ('min') and the target ('max').  The result is left in 'rcvbuf'.
 */
static void
set_rcvbuf(void) {
	int fd;
	int max = rcvbuf, min;
	socklen_t len;

	/* Probe with a throwaway UDP socket; try IPv6 if IPv4 is absent. */
	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
	    min < rcvbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
			       sizeof(rcvbuf)) == -1)
		{
			if (errno == ENOBUFS && rcvbuf > min) {
				/* Too big: bisect toward 'min' and retry. */
				max = rcvbuf - 1;
				rcvbuf = (rcvbuf + min) / 2;
				goto again;
			} else {
				rcvbuf = min;
				goto cleanup;
			}
		} else {
			/* Accepted: raise the known-good lower bound. */
			min = rcvbuf;
		}
		if (min != max) {
			rcvbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* ifdef SET_RCVBUF */
2009
#if defined(SET_SNDBUF)
/* Guard so the send-buffer probe below runs only once per process. */
static isc_once_t sndbuf_once = ISC_ONCE_INIT;
/* Desired SO_SNDBUF; lowered to whatever the kernel actually grants. */
static int sndbuf = ISC_SEND_BUFFER_SIZE;

/*
 * Determine the largest SO_SNDBUF the kernel will accept, up to the
 * configured 'sndbuf', by binary search between the system default
 * ('min') and the target ('max').  The result is left in 'sndbuf'.
 */
static void
set_sndbuf(void) {
	int fd;
	int max = sndbuf, min;
	socklen_t len;

	/* Probe with a throwaway UDP socket; try IPv6 if IPv4 is absent. */
	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
	    min < sndbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno == ENOBUFS && sndbuf > min) {
				/* Too big: bisect toward 'min' and retry. */
				max = sndbuf - 1;
				sndbuf = (sndbuf + min) / 2;
				goto again;
			} else {
				sndbuf = min;
				goto cleanup;
			}
		} else {
			/* Accepted: raise the known-good lower bound. */
			min = sndbuf;
		}
		if (min != max) {
			sndbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* ifdef SET_SNDBUF */
2067
2068 static void
use_min_mtu(isc_socket_t * sock)2069 use_min_mtu(isc_socket_t *sock) {
2070 #if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
2071 UNUSED(sock);
2072 #endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
2073 #ifdef IPV6_USE_MIN_MTU
2074 /* use minimum MTU */
2075 if (sock->pf == AF_INET6) {
2076 int on = 1;
2077 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2078 (void *)&on, sizeof(on));
2079 }
2080 #endif /* ifdef IPV6_USE_MIN_MTU */
2081 #if defined(IPV6_MTU)
2082 /*
2083 * Use minimum MTU on IPv6 sockets.
2084 */
2085 if (sock->pf == AF_INET6) {
2086 int mtu = 1280;
2087 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
2088 sizeof(mtu));
2089 }
2090 #endif /* if defined(IPV6_MTU) */
2091 }
2092
2093 static void
set_tcp_maxseg(isc_socket_t * sock,int size)2094 set_tcp_maxseg(isc_socket_t *sock, int size) {
2095 #ifdef TCP_MAXSEG
2096 if (sock->type == isc_sockettype_tcp) {
2097 (void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
2098 (void *)&size, sizeof(size));
2099 }
2100 #endif /* ifdef TCP_MAXSEG */
2101 }
2102
2103 static void
set_ip_disable_pmtud(isc_socket_t * sock)2104 set_ip_disable_pmtud(isc_socket_t *sock) {
2105 /*
2106 * Disable Path MTU Discover on IP packets
2107 */
2108 if (sock->pf == AF_INET6) {
2109 #if defined(IPV6_DONTFRAG)
2110 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
2111 &(int){ 0 }, sizeof(int));
2112 #endif
2113 #if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
2114 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
2115 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
2116 #endif
2117 } else if (sock->pf == AF_INET) {
2118 #if defined(IP_DONTFRAG)
2119 (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, &(int){ 0 },
2120 sizeof(int));
2121 #endif
2122 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
2123 (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2124 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
2125 #endif
2126 }
2127 }
2128
/*
 * Open (or duplicate) the OS-level descriptor for 'sock' and apply all
 * per-socket options: non-blocking mode, SIGPIPE suppression, minimum
 * MTU / TCP MSS for IPv6 TCP, CMSG timestamp/pktinfo options, send and
 * receive buffer sizing, TOS/TCLASS reception and PMTUD disabling.
 *
 * If 'dup_socket' is non-NULL the descriptor is dup()ed from it and the
 * option setup is skipped (the options already apply to the shared
 * underlying socket).
 *
 * Returns ISC_R_SUCCESS, ISC_R_NORESOURCES, ISC_R_FAMILYNOSUPPORT or
 * ISC_R_UNEXPECTED; on failure the open-failure statistic is bumped and
 * any descriptor that was obtained is closed.
 */
static isc_result_t
opensocket(isc_socketmgr_t *manager, isc_socket_t *sock,
	   isc_socket_t *dup_socket) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
	int on = 1;
#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
	socklen_t optlen;
	int size = 0;
#endif

again:
	if (dup_socket == NULL) {
		switch (sock->type) {
		case isc_sockettype_udp:
			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		case isc_sockettype_unix:
			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
			break;
		case isc_sockettype_raw:
			errno = EPFNOSUPPORT;
			/*
			 * PF_ROUTE is an alias for PF_NETLINK on linux.
			 */
#if defined(PF_ROUTE)
			if (sock->fd == -1 && sock->pf == PF_ROUTE) {
#ifdef NETLINK_ROUTE
				sock->fd = socket(sock->pf, SOCK_RAW,
						  NETLINK_ROUTE);
#else /* ifdef NETLINK_ROUTE */
				sock->fd = socket(sock->pf, SOCK_RAW, 0);
#endif /* ifdef NETLINK_ROUTE */
				if (sock->fd != -1) {
#ifdef NETLINK_ROUTE
					struct sockaddr_nl sa;
					int n;

					/*
					 * Do an implicit bind.
					 */
					memset(&sa, 0, sizeof(sa));
					sa.nl_family = AF_NETLINK;
					/* Listen for IPv4/IPv6 address changes. */
					sa.nl_groups = RTMGRP_IPV4_IFADDR |
						       RTMGRP_IPV6_IFADDR;
					n = bind(sock->fd,
						 (struct sockaddr *)&sa,
						 sizeof(sa));
					if (n < 0) {
						close(sock->fd);
						sock->fd = -1;
					}
#endif /* ifdef NETLINK_ROUTE */
					sock->bound = 1;
				}
			}
#endif /* if defined(PF_ROUTE) */
			break;
		}
	} else {
		sock->fd = dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
	/* Retry a socket()/dup() interrupted by a signal, with a bound. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
		goto again;
	}

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved)
	{
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
		/* Preserve the fcntl() errno across the close(). */
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "socket: file descriptor exceeds limit (%d/%u)",
			      sock->fd, manager->maxsocks);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		/* Map errno from socket()/dup()/fcntl() to an isc result. */
		switch (errno) {
		case EMFILE:
		case ENFILE:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strbuf);
			/* fallthrough */
		case ENOBUFS:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
					 err, strbuf);
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* Duplicated sockets inherit all options from the original. */
	if (dup_socket != NULL) {
		goto setup_done;
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (result);
	}

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
		       sizeof(on)) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
				 sock->fd, strbuf);
		/* Press on... */
	}
#endif /* ifdef SO_NOSIGPIPE */

	/*
	 * Use minimum mtu if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
	if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		/* ENOPROTOOPT just means the option is unsupported here. */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
			       sizeof(on)) < 0 &&
		    errno != ENOPROTOOPT)
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) failed: "
					 "%s",
					 sock->fd, strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "failed: %s",
					 sock->fd, strbuf);
		}
#else /* ifdef IPV6_RECVPKTINFO */
		/* RFC 2292 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* defined(USE_CMSG) */

#if defined(SET_RCVBUF)
		/* Grow SO_RCVBUF up to the once-probed system maximum. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < rcvbuf)
		{
			RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_RCVBUF, "
						 "%d) failed: %s",
						 sock->fd, rcvbuf, strbuf);
			}
		}
#endif /* if defined(SET_RCVBUF) */

#if defined(SET_SNDBUF)
		/* Likewise for SO_SNDBUF. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < sndbuf)
		{
			RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
				       (void *)&sndbuf, sizeof(sndbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_SNDBUF, "
						 "%d) failed: %s",
						 sock->fd, sndbuf, strbuf);
			}
		}
#endif /* if defined(SET_SNDBUF) */
	}
#ifdef IPV6_RECVTCLASS
	if ((sock->pf == AF_INET6) &&
	    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVTCLASS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IPV6_RECVTCLASS */
#ifdef IP_RECVTOS
	if ((sock->pf == AF_INET) &&
	    (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IP_RECVTOS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IP_RECVTOS */
#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */

	set_ip_disable_pmtud(sock);

setup_done:
	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
	if (sock->active == 0) {
		inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}
2426
2427 /*
2428 * Create a 'type' socket or duplicate an existing socket, managed
2429 * by 'manager'. Events will be posted to 'task' and when dispatched
2430 * 'action' will be called with 'arg' as the arg value. The new
2431 * socket is returned in 'socketp'.
2432 */
static isc_result_t
socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket) {
	isc_socket_t *sock = NULL;
	isc__socketthread_t *thread;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/* Select the statistics counter set for this family/type. */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex = (pf == AF_INET) ? udp4statsindex
						   : udp6statsindex;
#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
		/* Record whether per-packet DSCP is usable for this family. */
		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
		break;
	case isc_sockettype_tcp:
		sock->statsindex = (pf == AF_INET) ? tcp4statsindex
						   : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	case isc_sockettype_raw:
		sock->statsindex = rawstatsindex;
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	sock->pf = pf;

	result = opensocket(manager, sock, dup_socket);
	if (result != ISC_R_SUCCESS) {
		free_socket(&sock);
		return (result);
	}

	/* opensocket() returned success, so a valid descriptor must exist. */
	if (sock->fd == -1) {
		abort();
	}
	sock->threadid = gen_threadid(sock);
	isc_refcount_increment0(&sock->references);
	thread = &manager->threads[sock->threadid];
	*socketp = sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the fd with its watcher thread under the fd bucket lock. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
	thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	/* Then add the socket to the manager's global list. */
	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (thread->maxfd < sock->fd) {
		thread->maxfd = sock->fd;
	}
#endif /* ifdef USE_SELECT */
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION,
		   dup_socket != NULL ? "dupped" : "created");

	return (ISC_R_SUCCESS);
}
2519
2520 /*%
2521 * Create a new 'type' socket managed by 'manager'. Events
2522 * will be posted to 'task' and when dispatched 'action' will be
2523 * called with 'arg' as the arg value. The new socket is returned
2524 * in 'socketp'.
2525 */
2526 isc_result_t
isc_socket_create(isc_socketmgr_t * manager0,int pf,isc_sockettype_t type,isc_socket_t ** socketp)2527 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2528 isc_socket_t **socketp) {
2529 return (socket_create(manager0, pf, type, socketp, NULL));
2530 }
2531
2532 /*%
2533 * Duplicate an existing socket. The new socket is returned
2534 * in 'socketp'.
2535 */
2536 isc_result_t
isc_socket_dup(isc_socket_t * sock,isc_socket_t ** socketp)2537 isc_socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
2538 REQUIRE(VALID_SOCKET(sock));
2539 REQUIRE(socketp != NULL && *socketp == NULL);
2540
2541 return (socket_create(sock->manager, sock->pf, sock->type, socketp,
2542 sock));
2543 }
2544
/*
 * Re-open a previously closed socket (fd == -1) on the same isc_socket_t,
 * then re-register the new descriptor with a watcher thread, mirroring
 * the registration done in socket_create().
 */
isc_result_t
isc_socket_open(isc_socket_t *sock) {
	isc_result_t result;
	isc__socketthread_t *thread;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(isc_refcount_current(&sock->references) >= 1);
	REQUIRE(sock->fd == -1);
	REQUIRE(sock->threadid == -1);

	result = opensocket(sock->manager, sock, NULL);

	UNLOCK(&sock->lock);

	/*
	 * NOTE(review): sock->fd/threadid are written after sock->lock is
	 * released - presumably safe because the fd is not yet registered
	 * with any watcher thread, so no other thread touches it; confirm.
	 */
	if (result != ISC_R_SUCCESS) {
		sock->fd = -1;
	} else {
		sock->threadid = gen_threadid(sock);
		thread = &sock->manager->threads[sock->threadid];
		int lockid = FDLOCK_ID(sock->fd);

		/* Same registration sequence as socket_create(). */
		LOCK(&thread->fdlock[lockid]);
		thread->fds[sock->fd] = sock;
		thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
		thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
		INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
		       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
		UNLOCK(&thread->fdlock[lockid]);

#ifdef USE_SELECT
		LOCK(&sock->manager->lock);
		if (thread->maxfd < sock->fd) {
			thread->maxfd = sock->fd;
		}
		UNLOCK(&sock->manager->lock);
#endif /* ifdef USE_SELECT */
	}

	return (result);
}
2592
2593 /*
2594 * Attach to a socket. Caller must explicitly detach when it is done.
2595 */
2596 void
isc_socket_attach(isc_socket_t * sock,isc_socket_t ** socketp)2597 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2598 REQUIRE(VALID_SOCKET(sock));
2599 REQUIRE(socketp != NULL && *socketp == NULL);
2600
2601 int old_refs = isc_refcount_increment(&sock->references);
2602 REQUIRE(old_refs > 0);
2603
2604 *socketp = sock;
2605 }
2606
2607 /*
2608 * Dereference a socket. If this is the last reference to it, clean things
2609 * up by destroying the socket.
2610 */
2611 void
isc_socket_detach(isc_socket_t ** socketp)2612 isc_socket_detach(isc_socket_t **socketp) {
2613 isc_socket_t *sock;
2614
2615 REQUIRE(socketp != NULL);
2616 sock = *socketp;
2617 REQUIRE(VALID_SOCKET(sock));
2618 if (isc_refcount_decrement(&sock->references) == 1) {
2619 destroy(&sock);
2620 }
2621
2622 *socketp = NULL;
2623 }
2624
2625 isc_result_t
isc_socket_close(isc_socket_t * sock)2626 isc_socket_close(isc_socket_t *sock) {
2627 int fd;
2628 isc_socketmgr_t *manager;
2629 isc__socketthread_t *thread;
2630 fflush(stdout);
2631 REQUIRE(VALID_SOCKET(sock));
2632
2633 LOCK(&sock->lock);
2634
2635 REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2636
2637 INSIST(!sock->connecting);
2638 INSIST(ISC_LIST_EMPTY(sock->recv_list));
2639 INSIST(ISC_LIST_EMPTY(sock->send_list));
2640 INSIST(ISC_LIST_EMPTY(sock->accept_list));
2641 INSIST(ISC_LIST_EMPTY(sock->connect_list));
2642
2643 manager = sock->manager;
2644 thread = &manager->threads[sock->threadid];
2645 fd = sock->fd;
2646 sock->fd = -1;
2647 sock->threadid = -1;
2648
2649 sock->dupped = 0;
2650 memset(sock->name, 0, sizeof(sock->name));
2651 sock->tag = NULL;
2652 sock->listener = 0;
2653 sock->connected = 0;
2654 sock->connecting = 0;
2655 sock->bound = 0;
2656 isc_sockaddr_any(&sock->peer_address);
2657
2658 UNLOCK(&sock->lock);
2659
2660 socketclose(thread, sock, fd);
2661
2662 return (ISC_R_SUCCESS);
2663 }
2664
2665 /*
2666 * Dequeue an item off the given socket's read queue, set the result code
2667 * in the done event to the one provided, and send it to the task it was
2668 * destined for.
2669 *
2670 * If the event to be sent is on a list, remove it before sending. If
2671 * asked to, send and detach from the socket as well.
2672 *
2673 * Caller must have the socket locked if the event is attached to the socket.
2674 */
2675 static void
send_recvdone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2676 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2677 isc_task_t *task;
2678
2679 task = (*dev)->ev_sender;
2680
2681 (*dev)->ev_sender = sock;
2682
2683 if (ISC_LINK_LINKED(*dev, ev_link)) {
2684 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2685 }
2686
2687 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2688 isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2689 sock->threadid);
2690 } else {
2691 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2692 }
2693 }
2694
2695 /*
2696 * See comments for send_recvdone_event() above.
2697 *
2698 * Caller must have the socket locked if the event is attached to the socket.
2699 */
2700 static void
send_senddone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2701 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2702 isc_task_t *task;
2703
2704 INSIST(dev != NULL && *dev != NULL);
2705
2706 task = (*dev)->ev_sender;
2707 (*dev)->ev_sender = sock;
2708
2709 if (ISC_LINK_LINKED(*dev, ev_link)) {
2710 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2711 }
2712
2713 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2714 isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2715 sock->threadid);
2716 } else {
2717 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2718 }
2719 }
2720
2721 /*
2722 * See comments for send_recvdone_event() above.
2723 *
2724 * Caller must have the socket locked if the event is attached to the socket.
2725 */
2726 static void
send_connectdone_event(isc_socket_t * sock,isc_socket_connev_t ** dev)2727 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) {
2728 isc_task_t *task;
2729
2730 INSIST(dev != NULL && *dev != NULL);
2731
2732 task = (*dev)->ev_sender;
2733 (*dev)->ev_sender = sock;
2734
2735 if (ISC_LINK_LINKED(*dev, ev_link)) {
2736 ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
2737 }
2738
2739 isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
2740 }
2741
2742 /*
2743 * Call accept() on a socket, to get the new file descriptor. The listen
2744 * socket is used as a prototype to create a new isc_socket_t. The new
2745 * socket has one outstanding reference. The task receiving the event
2746 * will be detached from just after the event is delivered.
2747 *
2748 * On entry to this function, the event delivered is the internal
2749 * readable event, and the first item on the accept_list should be
2750 * the done event we want to send. If the list is empty, this is a no-op,
2751 * so just unlock and return.
2752 */
static void
internal_accept(isc_socket_t *sock) {
	isc_socketmgr_t *manager;
	isc__socketthread_t *thread, *nthread;
	isc_socket_newconnev_t *dev;
	isc_task_t *task;
	socklen_t addrlen;
	int fd;
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "accept";

	/* Caller holds sock->lock; we release it on every exit path. */
	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");

	manager = sock->manager;
	INSIST(VALID_MANAGER(manager));
	thread = &manager->threads[sock->threadid];

	INSIST(sock->listener);

	/*
	 * Get the first item off the accept list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->accept_list);
	if (dev == NULL) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
		UNLOCK(&sock->lock);
		return;
	}

	/*
	 * Try to accept the new connection. If the accept fails with
	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
	 * again. Also ignore ECONNRESET, which has been reported to
	 * be spuriously returned on Linux 2.2.19 although it is not
	 * a documented error for accept(). ECONNABORTED has been
	 * reported for Solaris 8. The rest are thrown in not because
	 * we have seen them but because they are ignored by other
	 * daemons such as BIND 8 and Apache.
	 */

	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
		    (void *)&addrlen);

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio to work in.
	 */
	if (fd >= 0 && fd < 20) {
		int newfd, tmp;
		newfd = fcntl(fd, F_DUPFD, 20);
		/* Preserve the fcntl() errno across the close(). */
		tmp = errno;
		(void)close(fd);
		errno = tmp;
		fd = newfd;
		err = "accept/fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (fd < 0) {
		if (SOFT_ERROR(errno)) {
			goto soft_error;
		}
		switch (errno) {
		case ENFILE:
		case EMFILE:
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: too many open file descriptors",
				      err);
			goto soft_error;

		case ENOBUFS:
		case ENOMEM:
		case ECONNRESET:
		case ECONNABORTED:
		case EHOSTUNREACH:
		case EHOSTDOWN:
		case ENETUNREACH:
		case ENETDOWN:
		case ECONNREFUSED:
#ifdef EPROTO
		case EPROTO:
#endif /* ifdef EPROTO */
#ifdef ENONET
		case ENONET:
#endif /* ifdef ENONET */
			goto soft_error;
		default:
			break;
		}
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: %s() failed: %s", err,
				 strbuf);
		fd = -1;
		result = ISC_R_UNEXPECTED;
	} else {
		/* Sanity-check what accept() handed back. */
		if (addrlen == 0U) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() failed to return "
					 "remote address");

			(void)close(fd);
			goto soft_error;
		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
			   sock->pf) {
			UNEXPECTED_ERROR(
				__FILE__, __LINE__,
				"internal_accept(): "
				"accept() returned peer address "
				"family %u (expected %u)",
				NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
				sock->pf);
			(void)close(fd);
			goto soft_error;
		} else if (fd >= (int)manager->maxsocks) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "accept: file descriptor exceeds limit "
				      "(%d/%u)",
				      fd, manager->maxsocks);
			(void)close(fd);
			goto soft_error;
		}
	}

	if (fd != -1) {
		NEWCONNSOCK(dev)->peer_address.length = addrlen;
		NEWCONNSOCK(dev)->pf = sock->pf;
	}

	/*
	 * Pull off the done event.
	 */
	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);

	/*
	 * Poke watcher if there are more pending accepts.
	 */
	if (ISC_LIST_EMPTY(sock->accept_list)) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	}

	if (fd != -1) {
		result = make_nonblock(fd);
		if (result != ISC_R_SUCCESS) {
			(void)close(fd);
			fd = -1;
		}
	}

	/*
	 * We need to unlock sock->lock now to be able to lock manager->lock
	 * without risking a deadlock with xmlstats.
	 */
	UNLOCK(&sock->lock);

	/*
	 * -1 means the new socket didn't happen.
	 */
	if (fd != -1) {
		int lockid = FDLOCK_ID(fd);

		NEWCONNSOCK(dev)->fd = fd;
		NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
		NEWCONNSOCK(dev)->bound = 1;
		NEWCONNSOCK(dev)->connected = 1;
		nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];

		/*
		 * We already hold a lock on one fdlock in accepting thread,
		 * we need to make sure that we don't double lock.
		 */
		bool same_bucket = (sock->threadid ==
				    NEWCONNSOCK(dev)->threadid) &&
				   (FDLOCK_ID(sock->fd) == lockid);

		/*
		 * Use minimum mtu if possible.
		 */
		use_min_mtu(NEWCONNSOCK(dev));
		set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);

		/*
		 * Ensure DSCP settings are inherited across accept.
		 */
		setdscp(NEWCONNSOCK(dev), sock->dscp);

		/*
		 * Save away the remote address
		 */
		dev->address = NEWCONNSOCK(dev)->peer_address;

		if (NEWCONNSOCK(dev)->active == 0) {
			inc_stats(manager->stats,
				  NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
			NEWCONNSOCK(dev)->active = 1;
		}

		/* Register the new fd with its watcher thread. */
		if (!same_bucket) {
			LOCK(&nthread->fdlock[lockid]);
		}
		nthread->fds[fd] = NEWCONNSOCK(dev);
		nthread->fdstate[fd] = MANAGED;
#if defined(USE_EPOLL)
		nthread->epoll_events[fd] = 0;
#endif /* if defined(USE_EPOLL) */
		if (!same_bucket) {
			UNLOCK(&nthread->fdlock[lockid]);
		}

		LOCK(&manager->lock);

#ifdef USE_SELECT
		if (nthread->maxfd < fd) {
			nthread->maxfd = fd;
		}
#endif /* ifdef USE_SELECT */

		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
			   "accepted connection, new socket %p",
			   dev->newsocket);

		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);

		UNLOCK(&manager->lock);

		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
	} else {
		/* The accept failed; discard the pre-allocated socket. */
		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
		isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
		free_socket((isc_socket_t **)&dev->newsocket);
	}

	/*
	 * Fill in the done event details and send it off.
	 */
	dev->result = result;
	task = dev->ev_sender;
	dev->ev_sender = sock;

	isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
	return;

soft_error:
	/* Transient failure: keep watching and leave the request queued. */
	watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	UNLOCK(&sock->lock);

	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
	return;
}
3012
/*
 * Drain the socket's receive queue: keep calling doio_recv() for the
 * head event until the queue empties or the socket would block.  On EOF
 * every queued event is completed with ISC_R_EOF.  Caller holds
 * sock->lock; the fd is unwatched for reads once the queue is empty.
 */
static void
internal_recv(isc_socket_t *sock) {
	isc_socketevent_t *dev;

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	dev = ISC_LIST_HEAD(sock->recv_list);
	if (dev == NULL) {
		goto finish;
	}

	socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
		   dev, dev->ev_sender);

	/*
	 * Try to do as much I/O as possible on this socket. There are no
	 * limits here, currently.
	 */
	while (dev != NULL) {
		switch (doio_recv(sock, dev)) {
		case DOIO_SOFT:
			/* Would block; stop and wait for the next readable. */
			goto finish;

		case DOIO_EOF:
			/*
			 * read of 0 means the remote end was closed.
			 * Run through the event queue and dispatch all
			 * the events with an EOF result code.
			 */
			do {
				dev->result = ISC_R_EOF;
				send_recvdone_event(sock, &dev);
				dev = ISC_LIST_HEAD(sock->recv_list);
			} while (dev != NULL);
			goto finish;

		case DOIO_SUCCESS:
		case DOIO_HARD:
			/* Completed (or hard error recorded in the event). */
			send_recvdone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->recv_list);
	}

finish:
	if (ISC_LIST_EMPTY(sock->recv_list)) {
		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
			   SELECT_POKE_READ);
	}
}
3065
3066 static void
internal_send(isc_socket_t * sock)3067 internal_send(isc_socket_t *sock) {
3068 isc_socketevent_t *dev;
3069
3070 INSIST(VALID_SOCKET(sock));
3071 REQUIRE(sock->fd >= 0);
3072
3073 dev = ISC_LIST_HEAD(sock->send_list);
3074 if (dev == NULL) {
3075 goto finish;
3076 }
3077 socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev,
3078 dev->ev_sender);
3079
3080 /*
3081 * Try to do as much I/O as possible on this socket. There are no
3082 * limits here, currently.
3083 */
3084 while (dev != NULL) {
3085 switch (doio_send(sock, dev)) {
3086 case DOIO_SOFT:
3087 goto finish;
3088
3089 case DOIO_HARD:
3090 case DOIO_SUCCESS:
3091 send_senddone_event(sock, &dev);
3092 break;
3093 }
3094
3095 dev = ISC_LIST_HEAD(sock->send_list);
3096 }
3097
3098 finish:
3099 if (ISC_LIST_EMPTY(sock->send_list)) {
3100 unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3101 SELECT_POKE_WRITE);
3102 }
3103 }
3104
3105 /*
3106 * Process read/writes on each fd here. Avoid locking
3107 * and unlocking twice if both reads and writes are possible.
3108 */
static void
process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
	isc_socket_t *sock;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&thread->fdlock[lockid]);

		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		return;
	}

	sock = thread->fds[fd];
	if (sock == NULL) {
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	/* Lock order: fdlock bucket first, then the socket. */
	LOCK(&sock->lock);

	if (sock->fd < 0) {
		/*
		 * Sock is being closed - the final external reference
		 * is gone but it was not yet removed from event loop
		 * and fdstate[]/fds[] as destroy() is waiting on
		 * thread->fdlock[lockid] or sock->lock that we're holding.
		 * Just release the locks and bail.
		 */
		UNLOCK(&sock->lock);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	REQUIRE(readable || writeable);
	if (writeable) {
		/* Writable while connecting means the connect completed. */
		if (sock->connecting) {
			internal_connect(sock);
		} else {
			internal_send(sock);
		}
	}

	if (readable) {
		if (sock->listener) {
			internal_accept(sock); /* unlocks sock */
		} else {
			internal_recv(sock);
			UNLOCK(&sock->lock);
		}
	} else {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&thread->fdlock[lockid]);

	/*
	 * Socket destruction might be pending, it will resume
	 * after releasing fdlock and sock->lock.
	 */
}
3174
3175 /*
3176 * process_fds is different for different event loops
3177 * it takes the events from event loops and for each FD
3178 * launches process_fd
3179 */
3180 #ifdef USE_KQUEUE
/*
 * Dispatch a batch of kevent() results: run process_fd() for every
 * socket fd, defer the internal control pipe until the end, and return
 * true if a shutdown message was seen on it.
 */
static bool
process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
	bool shutdown_requested = false;
	bool saw_ctl = false;

	if (nevents == thread->nevents) {
		/*
		 * This is not an error, but something unexpected. If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (int i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < thread->manager->maxsocks);
		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
			saw_ctl = true;
			continue;
		}
		/* kqueue delivers read and write readiness as filters. */
		process_fd(thread, events[i].ident,
			   events[i].filter == EVFILT_READ,
			   events[i].filter == EVFILT_WRITE);
	}

	if (saw_ctl) {
		shutdown_requested = process_ctlfd(thread);
	}

	return (shutdown_requested);
}
3216 #elif defined(USE_EPOLL)
3217 static bool
process_fds(isc__socketthread_t * thread,struct epoll_event * events,int nevents)3218 process_fds(isc__socketthread_t *thread, struct epoll_event *events,
3219 int nevents) {
3220 int i;
3221 bool done = false;
3222 bool have_ctlevent = false;
3223
3224 if (nevents == thread->nevents) {
3225 thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3226 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3227 "maximum number of FD events (%d) received",
3228 nevents);
3229 }
3230
3231 for (i = 0; i < nevents; i++) {
3232 REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
3233 if (events[i].data.fd == thread->pipe_fds[0]) {
3234 have_ctlevent = true;
3235 continue;
3236 }
3237 if ((events[i].events & EPOLLERR) != 0 ||
3238 (events[i].events & EPOLLHUP) != 0) {
3239 /*
3240 * epoll does not set IN/OUT bits on an erroneous
3241 * condition, so we need to try both anyway. This is a
3242 * bit inefficient, but should be okay for such rare
3243 * events. Note also that the read or write attempt
3244 * won't block because we use non-blocking sockets.
3245 */
3246 int fd = events[i].data.fd;
3247 events[i].events |= thread->epoll_events[fd];
3248 }
3249 process_fd(thread, events[i].data.fd,
3250 (events[i].events & EPOLLIN) != 0,
3251 (events[i].events & EPOLLOUT) != 0);
3252 }
3253
3254 if (have_ctlevent) {
3255 done = process_ctlfd(thread);
3256 }
3257
3258 return (done);
3259 }
3260 #elif defined(USE_DEVPOLL)
3261 static bool
process_fds(isc__socketthread_t * thread,struct pollfd * events,int nevents)3262 process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
3263 int i;
3264 bool done = false;
3265 bool have_ctlevent = false;
3266
3267 if (nevents == thread->nevents) {
3268 thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3269 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3270 "maximum number of FD events (%d) received",
3271 nevents);
3272 }
3273
3274 for (i = 0; i < nevents; i++) {
3275 REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
3276 if (events[i].fd == thread->pipe_fds[0]) {
3277 have_ctlevent = true;
3278 continue;
3279 }
3280 process_fd(thread, events[i].fd,
3281 (events[i].events & POLLIN) != 0,
3282 (events[i].events & POLLOUT) != 0);
3283 }
3284
3285 if (have_ctlevent) {
3286 done = process_ctlfd(thread);
3287 }
3288
3289 return (done);
3290 }
3291 #elif defined(USE_SELECT)
3292 static void
process_fds(isc__socketthread_t * thread,int maxfd,fd_set * readfds,fd_set * writefds)3293 process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
3294 fd_set *writefds) {
3295 int i;
3296
3297 REQUIRE(maxfd <= (int)thread->manager->maxsocks);
3298
3299 for (i = 0; i < maxfd; i++) {
3300 if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
3301 continue;
3302 }
3303 process_fd(thread, i, FD_ISSET(i, readfds),
3304 FD_ISSET(i, writefds));
3305 }
3306 }
3307 #endif /* ifdef USE_KQUEUE */
3308
3309 static bool
process_ctlfd(isc__socketthread_t * thread)3310 process_ctlfd(isc__socketthread_t *thread) {
3311 int msg, fd;
3312
3313 for (;;) {
3314 select_readmsg(thread, &fd, &msg);
3315
3316 thread_log(thread, IOEVENT,
3317 "watcher got message %d for socket %d", msg, fd);
3318
3319 /*
3320 * Nothing to read?
3321 */
3322 if (msg == SELECT_POKE_NOTHING) {
3323 break;
3324 }
3325
3326 /*
3327 * Handle shutdown message. We really should
3328 * jump out of this loop right away, but
3329 * it doesn't matter if we have to do a little
3330 * more work first.
3331 */
3332 if (msg == SELECT_POKE_SHUTDOWN) {
3333 return (true);
3334 }
3335
3336 /*
3337 * This is a wakeup on a socket. Look
3338 * at the event queue for both read and write,
3339 * and decide if we need to watch on it now
3340 * or not.
3341 */
3342 wakeup_socket(thread, fd, msg);
3343 }
3344
3345 return (false);
3346 }
3347
/*
 * This is the thread that will loop forever, always in a select or poll
 * call.
 *
 * When select returns something to do, do whatever's necessary and post
 * an event to the task that was requesting the action.
 *
 * Exits when process_fds()/process_ctlfd() reports that a
 * SELECT_POKE_SHUTDOWN message arrived on the control pipe.
 */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc_socketmgr_t *manager = thread->manager;
	(void)manager; /* only referenced in the USE_SELECT build */
	bool done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here. This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	while (!done) {
		/* Inner loop: retry the wait call on soft errors (EINTR). */
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			/* Two passes: retry once after refreshing OPEN_MAX. */
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped. Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			/* select() mutates its sets, so work on copies. */
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			/* Hard (non-EINTR/EAGAIN-style) failure is fatal. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			/*
			 * Workaround state machine for a kernel bug where
			 * DP_POLL can stall: idle -> active -> checking.
			 */
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}
3511
3512 void
isc_socketmgr_setreserved(isc_socketmgr_t * manager,uint32_t reserved)3513 isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
3514 REQUIRE(VALID_MANAGER(manager));
3515
3516 manager->reserved = reserved;
3517 }
3518
3519 void
isc_socketmgr_maxudp(isc_socketmgr_t * manager,unsigned int maxudp)3520 isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
3521 REQUIRE(VALID_MANAGER(manager));
3522
3523 manager->maxudp = maxudp;
3524 }
3525
3526 /*
3527 * Setup socket thread, thread->manager and thread->threadid must be filled.
3528 */
3529
3530 static isc_result_t
setup_thread(isc__socketthread_t * thread)3531 setup_thread(isc__socketthread_t *thread) {
3532 isc_result_t result = ISC_R_SUCCESS;
3533 int i;
3534 char strbuf[ISC_STRERRORSIZE];
3535
3536 REQUIRE(thread != NULL);
3537 REQUIRE(VALID_MANAGER(thread->manager));
3538 REQUIRE(thread->threadid >= 0 &&
3539 thread->threadid < thread->manager->nthreads);
3540
3541 thread->fds =
3542 isc_mem_get(thread->manager->mctx,
3543 thread->manager->maxsocks * sizeof(isc_socket_t *));
3544
3545 memset(thread->fds, 0,
3546 thread->manager->maxsocks * sizeof(isc_socket_t *));
3547
3548 thread->fdstate = isc_mem_get(thread->manager->mctx,
3549 thread->manager->maxsocks * sizeof(int));
3550
3551 memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
3552
3553 thread->fdlock = isc_mem_get(thread->manager->mctx,
3554 FDLOCK_COUNT * sizeof(isc_mutex_t));
3555
3556 for (i = 0; i < FDLOCK_COUNT; i++) {
3557 isc_mutex_init(&thread->fdlock[i]);
3558 }
3559
3560 if (pipe(thread->pipe_fds) != 0) {
3561 strerror_r(errno, strbuf, sizeof(strbuf));
3562 UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
3563 strbuf);
3564 return (ISC_R_UNEXPECTED);
3565 }
3566 RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
3567
3568 #ifdef USE_KQUEUE
3569 thread->nevents = ISC_SOCKET_MAXEVENTS;
3570 thread->events = isc_mem_get(thread->manager->mctx,
3571 sizeof(struct kevent) * thread->nevents);
3572
3573 thread->kqueue_fd = kqueue();
3574 if (thread->kqueue_fd == -1) {
3575 result = isc__errno2result(errno);
3576 strerror_r(errno, strbuf, sizeof(strbuf));
3577 UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
3578 strbuf);
3579 isc_mem_put(thread->manager->mctx, thread->events,
3580 sizeof(struct kevent) * thread->nevents);
3581 return (result);
3582 }
3583
3584 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3585 if (result != ISC_R_SUCCESS) {
3586 close(thread->kqueue_fd);
3587 isc_mem_put(thread->manager->mctx, thread->events,
3588 sizeof(struct kevent) * thread->nevents);
3589 }
3590 return (result);
3591
3592 #elif defined(USE_EPOLL)
3593 thread->nevents = ISC_SOCKET_MAXEVENTS;
3594 thread->epoll_events =
3595 isc_mem_get(thread->manager->mctx,
3596 (thread->manager->maxsocks * sizeof(uint32_t)));
3597
3598 memset(thread->epoll_events, 0,
3599 thread->manager->maxsocks * sizeof(uint32_t));
3600
3601 thread->events =
3602 isc_mem_get(thread->manager->mctx,
3603 sizeof(struct epoll_event) * thread->nevents);
3604
3605 thread->epoll_fd = epoll_create(thread->nevents);
3606 if (thread->epoll_fd == -1) {
3607 result = isc__errno2result(errno);
3608 strerror_r(errno, strbuf, sizeof(strbuf));
3609 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
3610 strbuf);
3611 return (result);
3612 }
3613
3614 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3615 return (result);
3616
3617 #elif defined(USE_DEVPOLL)
3618 thread->nevents = ISC_SOCKET_MAXEVENTS;
3619 result = isc_resource_getcurlimit(isc_resource_openfiles,
3620 &thread->open_max);
3621 if (result != ISC_R_SUCCESS) {
3622 thread->open_max = 64;
3623 }
3624 thread->calls = 0;
3625 thread->events = isc_mem_get(thread->manager->mctx,
3626 sizeof(struct pollfd) * thread->nevents);
3627
3628 /*
3629 * Note: fdpollinfo should be able to support all possible FDs, so
3630 * it must have maxsocks entries (not nevents).
3631 */
3632 thread->fdpollinfo =
3633 isc_mem_get(thread->manager->mctx,
3634 sizeof(pollinfo_t) * thread->manager->maxsocks);
3635 memset(thread->fdpollinfo, 0,
3636 sizeof(pollinfo_t) * thread->manager->maxsocks);
3637 thread->devpoll_fd = open("/dev/poll", O_RDWR);
3638 if (thread->devpoll_fd == -1) {
3639 result = isc__errno2result(errno);
3640 strerror_r(errno, strbuf, sizeof(strbuf));
3641 UNEXPECTED_ERROR(__FILE__, __LINE__,
3642 "open(/dev/poll) failed: %s", strbuf);
3643 isc_mem_put(thread->manager->mctx, thread->events,
3644 sizeof(struct pollfd) * thread->nevents);
3645 isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3646 sizeof(pollinfo_t) * thread->manager->maxsocks);
3647 return (result);
3648 }
3649 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3650 if (result != ISC_R_SUCCESS) {
3651 close(thread->devpoll_fd);
3652 isc_mem_put(thread->manager->mctx, thread->events,
3653 sizeof(struct pollfd) * thread->nevents);
3654 isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3655 sizeof(pollinfo_t) * thread->manager->maxsocks);
3656 return (result);
3657 }
3658
3659 return (ISC_R_SUCCESS);
3660 #elif defined(USE_SELECT)
3661 UNUSED(result);
3662
3663 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3664 /*
3665 * Note: this code should also cover the case of MAXSOCKETS <=
3666 * FD_SETSIZE, but we separate the cases to avoid possible portability
3667 * issues regarding howmany() and the actual representation of fd_set.
3668 */
3669 thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3670 sizeof(fd_mask);
3671 #else /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3672 thread->fd_bufsize = sizeof(fd_set);
3673 #endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3674
3675 thread->read_fds = isc_mem_get(thread->manager->mctx,
3676 thread->fd_bufsize);
3677 thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
3678 thread->fd_bufsize);
3679 thread->write_fds = isc_mem_get(thread->manager->mctx,
3680 thread->fd_bufsize);
3681 thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
3682 thread->fd_bufsize);
3683 memset(thread->read_fds, 0, thread->fd_bufsize);
3684 memset(thread->write_fds, 0, thread->fd_bufsize);
3685
3686 (void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3687 thread->maxfd = thread->pipe_fds[0];
3688
3689 return (ISC_R_SUCCESS);
3690 #endif /* USE_KQUEUE */
3691 }
3692
3693 static void
cleanup_thread(isc_mem_t * mctx,isc__socketthread_t * thread)3694 cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
3695 isc_result_t result;
3696 int i;
3697
3698 result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3699 if (result != ISC_R_SUCCESS) {
3700 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
3701 }
3702 #ifdef USE_KQUEUE
3703 close(thread->kqueue_fd);
3704 isc_mem_put(mctx, thread->events,
3705 sizeof(struct kevent) * thread->nevents);
3706 #elif defined(USE_EPOLL)
3707 close(thread->epoll_fd);
3708
3709 isc_mem_put(mctx, thread->events,
3710 sizeof(struct epoll_event) * thread->nevents);
3711 #elif defined(USE_DEVPOLL)
3712 close(thread->devpoll_fd);
3713 isc_mem_put(mctx, thread->events,
3714 sizeof(struct pollfd) * thread->nevents);
3715 isc_mem_put(mctx, thread->fdpollinfo,
3716 sizeof(pollinfo_t) * thread->manager->maxsocks);
3717 #elif defined(USE_SELECT)
3718 if (thread->read_fds != NULL) {
3719 isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
3720 }
3721 if (thread->read_fds_copy != NULL) {
3722 isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
3723 }
3724 if (thread->write_fds != NULL) {
3725 isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
3726 }
3727 if (thread->write_fds_copy != NULL) {
3728 isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
3729 }
3730 #endif /* USE_KQUEUE */
3731 for (i = 0; i < (int)thread->manager->maxsocks; i++) {
3732 if (thread->fdstate[i] == CLOSE_PENDING) {
3733 /* no need to lock */
3734 (void)close(i);
3735 }
3736 }
3737
3738 #if defined(USE_EPOLL)
3739 isc_mem_put(thread->manager->mctx, thread->epoll_events,
3740 thread->manager->maxsocks * sizeof(uint32_t));
3741 #endif /* if defined(USE_EPOLL) */
3742 isc_mem_put(thread->manager->mctx, thread->fds,
3743 thread->manager->maxsocks * sizeof(isc_socket_t *));
3744 isc_mem_put(thread->manager->mctx, thread->fdstate,
3745 thread->manager->maxsocks * sizeof(int));
3746
3747 for (i = 0; i < FDLOCK_COUNT; i++) {
3748 isc_mutex_destroy(&thread->fdlock[i]);
3749 }
3750 isc_mem_put(thread->manager->mctx, thread->fdlock,
3751 FDLOCK_COUNT * sizeof(isc_mutex_t));
3752 }
3753
3754 isc_result_t
isc_socketmgr_create(isc_mem_t * mctx,isc_socketmgr_t ** managerp)3755 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3756 return (isc_socketmgr_create2(mctx, managerp, 0, 1));
3757 }
3758
3759 isc_result_t
isc_socketmgr_create2(isc_mem_t * mctx,isc_socketmgr_t ** managerp,unsigned int maxsocks,int nthreads)3760 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3761 unsigned int maxsocks, int nthreads) {
3762 int i;
3763 isc_socketmgr_t *manager;
3764
3765 REQUIRE(managerp != NULL && *managerp == NULL);
3766
3767 if (maxsocks == 0) {
3768 maxsocks = ISC_SOCKET_MAXSOCKETS;
3769 }
3770
3771 manager = isc_mem_get(mctx, sizeof(*manager));
3772
3773 /* zero-clear so that necessary cleanup on failure will be easy */
3774 memset(manager, 0, sizeof(*manager));
3775 manager->maxsocks = maxsocks;
3776 manager->reserved = 0;
3777 manager->maxudp = 0;
3778 manager->nthreads = nthreads;
3779 manager->stats = NULL;
3780
3781 manager->magic = SOCKET_MANAGER_MAGIC;
3782 manager->mctx = NULL;
3783 ISC_LIST_INIT(manager->socklist);
3784 isc_mutex_init(&manager->lock);
3785 isc_condition_init(&manager->shutdown_ok);
3786
3787 /*
3788 * Start up the select/poll thread.
3789 */
3790 manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
3791 manager->nthreads);
3792 isc_mem_attach(mctx, &manager->mctx);
3793
3794 for (i = 0; i < manager->nthreads; i++) {
3795 manager->threads[i].manager = manager;
3796 manager->threads[i].threadid = i;
3797 setup_thread(&manager->threads[i]);
3798 isc_thread_create(netthread, &manager->threads[i],
3799 &manager->threads[i].thread);
3800 char tname[1024];
3801 sprintf(tname, "isc-socket-%d", i);
3802 isc_thread_setname(manager->threads[i].thread, tname);
3803 }
3804
3805 *managerp = manager;
3806
3807 return (ISC_R_SUCCESS);
3808 }
3809
3810 isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t * manager,unsigned int * nsockp)3811 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3812 REQUIRE(VALID_MANAGER(manager));
3813 REQUIRE(nsockp != NULL);
3814
3815 *nsockp = manager->maxsocks;
3816
3817 return (ISC_R_SUCCESS);
3818 }
3819
3820 void
isc_socketmgr_setstats(isc_socketmgr_t * manager,isc_stats_t * stats)3821 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
3822 REQUIRE(VALID_MANAGER(manager));
3823 REQUIRE(ISC_LIST_EMPTY(manager->socklist));
3824 REQUIRE(manager->stats == NULL);
3825 REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
3826
3827 isc_stats_attach(stats, &manager->stats);
3828 }
3829
/*
 * Destroy a socket manager: wait until all sockets are gone, shut the
 * watcher threads down, join them, and free all manager resources.
 * Sets *managerp to NULL on return.
 */
void
isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc_socketmgr_t *manager;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = *managerp;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "sockets exist");
		/* Signalled when the last socket detaches. */
		WAIT(&manager->shutdown_ok, &manager->lock);
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread. Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
	}

	/*
	 * Wait for thread to exit.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		isc_thread_join(manager->threads[i].thread, NULL);
		cleanup_thread(manager->mctx, &manager->threads[i]);
	}
	/*
	 * Clean up.
	 */
	isc_mem_put(manager->mctx, manager->threads,
		    sizeof(isc__socketthread_t) * manager->nthreads);
	(void)isc_condition_destroy(&manager->shutdown_ok);

	if (manager->stats != NULL) {
		isc_stats_detach(&manager->stats);
	}
	isc_mutex_destroy(&manager->lock);
	/* Invalidate the magic last so VALID_MANAGER fails from now on. */
	manager->magic = 0;
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));

	*managerp = NULL;
}
3886
/*
 * Common receive path.  Attempts an immediate read for UDP sockets (or
 * for stream sockets with an empty receive queue); if the read would
 * block (DOIO_SOFT), the request is queued and the watcher is poked to
 * start watching the fd for readability.
 *
 * Returns ISC_R_SUCCESS when the request completed or was queued
 * normally, or ISC_R_INPROGRESS when ISC_SOCKFLAG_IMMEDIATE was set
 * but the operation could not complete immediately.  Completion status
 * for the I/O itself is delivered via dev->result / the done event.
 */
static isc_result_t
socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		/* UDP: try the read right away, no queue ordering needed. */
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		/*
		 * Stream sockets must preserve ordering: only read now if
		 * no earlier requests are still queued.
		 */
		if (ISC_LIST_EMPTY(sock->recv_list)) {
			io_state = doio_recv(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}

		/*
		 * Enqueue the request. If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/*
		 * Completed (or failed) synchronously.  With IMMEDIATE the
		 * caller inspects dev directly; otherwise post the done
		 * event to the task.
		 */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
3963
3964 isc_result_t
isc_socket_recv(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,void * arg)3965 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3966 isc_task_t *task, isc_taskaction_t action, void *arg) {
3967 isc_socketevent_t *dev;
3968 isc_socketmgr_t *manager;
3969
3970 REQUIRE(VALID_SOCKET(sock));
3971 REQUIRE(action != NULL);
3972
3973 manager = sock->manager;
3974 REQUIRE(VALID_MANAGER(manager));
3975
3976 INSIST(sock->bound);
3977
3978 dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
3979 action, arg);
3980 if (dev == NULL) {
3981 return (ISC_R_NOMEMORY);
3982 }
3983
3984 return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
3985 }
3986
3987 isc_result_t
isc_socket_recv2(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_socketevent_t * event,unsigned int flags)3988 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3989 isc_task_t *task, isc_socketevent_t *event,
3990 unsigned int flags) {
3991 event->ev_sender = sock;
3992 event->result = ISC_R_UNSET;
3993 event->region = *region;
3994 event->n = 0;
3995 event->offset = 0;
3996 event->attributes = 0;
3997
3998 /*
3999 * UDP sockets are always partial read.
4000 */
4001 if (sock->type == isc_sockettype_udp) {
4002 event->minimum = 1;
4003 } else {
4004 if (minimum == 0) {
4005 event->minimum = region->length;
4006 } else {
4007 event->minimum = minimum;
4008 }
4009 }
4010
4011 return (socket_recv(sock, event, task, flags));
4012 }
4013
/*
 * Common send path.  Records the destination address (and optional
 * IPv6 pktinfo) in the event, attempts an immediate write for UDP
 * sockets (or stream sockets with an empty send queue), and queues the
 * request when the write would block (DOIO_SOFT) unless
 * ISC_SOCKFLAG_NORETRY was given.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_INPROGRESS when
 * ISC_SOCKFLAG_IMMEDIATE was set but the operation was queued.  I/O
 * status is delivered via dev->result / the done event.
 */
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp) {
		/* UDP: try the write right away, no queue ordering needed. */
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		/*
		 * Stream sockets must preserve ordering: only write now if
		 * no earlier requests are still queued.
		 */
		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request. If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/* FALLTHROUGH */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		/*
		 * Completed (or failed) synchronously.  With IMMEDIATE the
		 * caller inspects dev directly; otherwise post the done
		 * event to the task.
		 */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4115
4116 isc_result_t
isc_socket_send(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg)4117 isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4118 isc_taskaction_t action, void *arg) {
4119 /*
4120 * REQUIRE() checking is performed in isc_socket_sendto().
4121 */
4122 return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
4123 }
4124
4125 isc_result_t
isc_socket_sendto(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo)4126 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4127 isc_taskaction_t action, void *arg,
4128 const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
4129 isc_socketevent_t *dev;
4130 isc_socketmgr_t *manager;
4131
4132 REQUIRE(VALID_SOCKET(sock));
4133 REQUIRE(region != NULL);
4134 REQUIRE(task != NULL);
4135 REQUIRE(action != NULL);
4136
4137 manager = sock->manager;
4138 REQUIRE(VALID_MANAGER(manager));
4139
4140 INSIST(sock->bound);
4141
4142 dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
4143 action, arg);
4144 if (dev == NULL) {
4145 return (ISC_R_NOMEMORY);
4146 }
4147
4148 dev->region = *region;
4149
4150 return (socket_send(sock, dev, task, address, pktinfo, 0));
4151 }
4152
4153 isc_result_t
isc_socket_sendto2(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo,isc_socketevent_t * event,unsigned int flags)4154 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4155 const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4156 isc_socketevent_t *event, unsigned int flags) {
4157 REQUIRE(VALID_SOCKET(sock));
4158 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
4159 0);
4160 if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
4161 REQUIRE(sock->type == isc_sockettype_udp);
4162 }
4163 event->ev_sender = sock;
4164 event->result = ISC_R_UNSET;
4165 event->region = *region;
4166 event->n = 0;
4167 event->offset = 0;
4168 event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
4169
4170 return (socket_send(sock, event, task, address, pktinfo, flags));
4171 }
4172
/*
 * Clean up the filesystem entry left behind by a UNIX-domain socket.
 *
 * If 'active' is true the caller owns the path: any file present is
 * unlinked unconditionally so a subsequent bind() can succeed.  If
 * 'active' is false, the file is removed only after a connect() probe
 * shows that no live server is listening on it.
 */
void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	/* Only AF_UNIX sockets have an on-disk presence to clean up. */
	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		return;
	}

/*
 * Provide S_ISSOCK/S_ISFIFO fallbacks for platforms whose <sys/stat.h>
 * lacks the POSIX file-type test macros.
 */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	/* See what, if anything, currently occupies the path. */
	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:
			if (active) { /* We exited cleanly last time */
				break;
			}
			/* intentional fallthrough */
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
	} else {
		/* Refuse to remove anything that is not a socket or FIFO. */
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
	}

	if (active) {
		/* We own the path: remove it unconditionally. */
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/*
	 * Not the owner: probe with a connect() and remove the file only
	 * if the connection is refused/reset, i.e. nobody is listening.
	 */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			/* Stale socket file with no server; remove it. */
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
	close(s);
#else /* ifdef ISC_PLATFORM_HAVESYSUNH */
	UNUSED(sockaddr);
	UNUSED(active);
#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
}
4293
4294 isc_result_t
isc_socket_permunix(const isc_sockaddr_t * sockaddr,uint32_t perm,uint32_t owner,uint32_t group)4295 isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
4296 uint32_t owner, uint32_t group) {
4297 #ifdef ISC_PLATFORM_HAVESYSUNH
4298 isc_result_t result = ISC_R_SUCCESS;
4299 char strbuf[ISC_STRERRORSIZE];
4300 char path[sizeof(sockaddr->type.sunix.sun_path)];
4301 #ifdef NEED_SECURE_DIRECTORY
4302 char *slash;
4303 #endif /* ifdef NEED_SECURE_DIRECTORY */
4304
4305 REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4306 INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4307 strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
4308
4309 #ifdef NEED_SECURE_DIRECTORY
4310 slash = strrchr(path, '/');
4311 if (slash != NULL) {
4312 if (slash != path) {
4313 *slash = '\0';
4314 } else {
4315 strlcpy(path, "/", sizeof(path));
4316 }
4317 } else {
4318 strlcpy(path, ".", sizeof(path));
4319 }
4320 #endif /* ifdef NEED_SECURE_DIRECTORY */
4321
4322 if (chmod(path, perm) < 0) {
4323 strerror_r(errno, strbuf, sizeof(strbuf));
4324 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4325 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4326 "isc_socket_permunix: chmod(%s, %d): %s", path,
4327 perm, strbuf);
4328 result = ISC_R_FAILURE;
4329 }
4330 if (chown(path, owner, group) < 0) {
4331 strerror_r(errno, strbuf, sizeof(strbuf));
4332 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4333 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4334 "isc_socket_permunix: chown(%s, %d, %d): %s",
4335 path, owner, group, strbuf);
4336 result = ISC_R_FAILURE;
4337 }
4338 return (result);
4339 #else /* ifdef ISC_PLATFORM_HAVESYSUNH */
4340 UNUSED(sockaddr);
4341 UNUSED(perm);
4342 UNUSED(owner);
4343 UNUSED(group);
4344 return (ISC_R_NOTIMPLEMENTED);
4345 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
4346 }
4347
4348 isc_result_t
isc_socket_bind(isc_socket_t * sock,const isc_sockaddr_t * sockaddr,isc_socket_options_t options)4349 isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
4350 isc_socket_options_t options) {
4351 char strbuf[ISC_STRERRORSIZE];
4352 int on = 1;
4353
4354 REQUIRE(VALID_SOCKET(sock));
4355
4356 LOCK(&sock->lock);
4357
4358 INSIST(!sock->bound);
4359 INSIST(!sock->dupped);
4360
4361 if (sock->pf != sockaddr->type.sa.sa_family) {
4362 UNLOCK(&sock->lock);
4363 return (ISC_R_FAMILYMISMATCH);
4364 }
4365
4366 /*
4367 * Only set SO_REUSEADDR when we want a specific port.
4368 */
4369 #ifdef AF_UNIX
4370 if (sock->pf == AF_UNIX) {
4371 goto bind_socket;
4372 }
4373 #endif /* ifdef AF_UNIX */
4374 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
4375 isc_sockaddr_getport(sockaddr) != (in_port_t)0)
4376 {
4377 if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
4378 sizeof(on)) < 0) {
4379 UNEXPECTED_ERROR(__FILE__, __LINE__,
4380 "setsockopt(%d) failed", sock->fd);
4381 }
4382 #if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
4383 if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
4384 (void *)&on, sizeof(on)) < 0)
4385 {
4386 UNEXPECTED_ERROR(__FILE__, __LINE__,
4387 "setsockopt(%d) failed", sock->fd);
4388 }
4389 #elif defined(__linux__) && defined(SO_REUSEPORT)
4390 if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
4391 sizeof(on)) < 0) {
4392 UNEXPECTED_ERROR(__FILE__, __LINE__,
4393 "setsockopt(%d) failed", sock->fd);
4394 }
4395 #endif /* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
4396 /* Press on... */
4397 }
4398 #ifdef AF_UNIX
4399 bind_socket:
4400 #endif /* ifdef AF_UNIX */
4401 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
4402 inc_stats(sock->manager->stats,
4403 sock->statsindex[STATID_BINDFAIL]);
4404
4405 UNLOCK(&sock->lock);
4406 switch (errno) {
4407 case EACCES:
4408 return (ISC_R_NOPERM);
4409 case EADDRNOTAVAIL:
4410 return (ISC_R_ADDRNOTAVAIL);
4411 case EADDRINUSE:
4412 return (ISC_R_ADDRINUSE);
4413 case EINVAL:
4414 return (ISC_R_BOUND);
4415 default:
4416 strerror_r(errno, strbuf, sizeof(strbuf));
4417 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
4418 strbuf);
4419 return (ISC_R_UNEXPECTED);
4420 }
4421 }
4422
4423 socket_log(sock, sockaddr, TRACE, "bound");
4424 sock->bound = 1;
4425
4426 UNLOCK(&sock->lock);
4427 return (ISC_R_SUCCESS);
4428 }
4429
4430 /*
4431 * Enable this only for specific OS versions, and only when they have repaired
4432 * their problems with it. Until then, this is is broken and needs to be
4433 * disabled by default. See RT22589 for details.
4434 */
4435 #undef ENABLE_ACCEPTFILTER
4436
4437 isc_result_t
isc_socket_filter(isc_socket_t * sock,const char * filter)4438 isc_socket_filter(isc_socket_t *sock, const char *filter) {
4439 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
4440 char strbuf[ISC_STRERRORSIZE];
4441 struct accept_filter_arg afa;
4442 #else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
4443 UNUSED(sock);
4444 UNUSED(filter);
4445 #endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
4446
4447 REQUIRE(VALID_SOCKET(sock));
4448
4449 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
4450 bzero(&afa, sizeof(afa));
4451 strlcpy(afa.af_name, filter, sizeof(afa.af_name));
4452 if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
4453 sizeof(afa)) == -1) {
4454 strerror_r(errno, strbuf, sizeof(strbuf));
4455 socket_log(sock, NULL, CREATION,
4456 "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
4457 return (ISC_R_FAILURE);
4458 }
4459 return (ISC_R_SUCCESS);
4460 #else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
4461 return (ISC_R_NOTIMPLEMENTED);
4462 #endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
4463 }
4464
4465 /*
4466 * Try enabling TCP Fast Open for a given socket if the OS supports it.
4467 */
static void
set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
	char strbuf[ISC_STRERRORSIZE];

	/*
	 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while
	 * also shipping a default kernel without TFO support, so we
	 * special-case it by performing an additional runtime check for TFO
	 * support using sysctl to prevent setsockopt() errors from being
	 * logged.
	 */
#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
	unsigned int enabled;
	size_t enabledlen = sizeof(enabled);
	static bool tfo_notice_logged = false;

	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
		/*
		 * This kernel does not support TCP Fast Open.  There is
		 * nothing more we can do.
		 */
		return;
	} else if (enabled == 0) {
		/*
		 * This kernel does support TCP Fast Open, but it is disabled
		 * by sysctl.  Notify the user, but do not nag.
		 */
		if (!tfo_notice_logged) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
				      "TCP_FASTOPEN support is disabled by "
				      "sysctl (" SYSCTL_TFO " = 0)");
			tfo_notice_logged = true;
		}
		return;
	}
#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */

#ifdef __APPLE__
	/*
	 * NOTE(review): macOS appears to treat the TCP_FASTOPEN option
	 * value as a flag rather than a queue length, hence 1 — confirm.
	 */
	backlog = 1;
#else /* ifdef __APPLE__ */
	/* Use half the listen backlog as the TFO pending-SYN queue size. */
	backlog = backlog / 2;
	if (backlog == 0) {
		backlog = 1;
	}
#endif /* ifdef __APPLE__ */
	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
		       sizeof(backlog)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
				 sock->fd, strbuf);
		/* TCP_FASTOPEN is experimental so ignore failures */
	}
#else /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
	UNUSED(sock);
	UNUSED(backlog);
#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
}
4529
4530 /*
4531 * Set up to listen on a given socket. We do this by creating an internal
4532 * event that will be dispatched when the socket has read activity. The
4533 * watcher will send the internal event to the task when there is a new
4534 * connection.
4535 *
4536 * Unlike in read, we don't preallocate a done event here. Every time there
4537 * is a new connection we'll have to allocate a new one anyway, so we might
4538 * as well keep things simple rather than having to track them.
4539 */
4540 isc_result_t
isc_socket_listen(isc_socket_t * sock,unsigned int backlog)4541 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4542 char strbuf[ISC_STRERRORSIZE];
4543
4544 REQUIRE(VALID_SOCKET(sock));
4545
4546 LOCK(&sock->lock);
4547
4548 REQUIRE(!sock->listener);
4549 REQUIRE(sock->bound);
4550 REQUIRE(sock->type == isc_sockettype_tcp ||
4551 sock->type == isc_sockettype_unix);
4552
4553 if (backlog == 0) {
4554 backlog = SOMAXCONN;
4555 }
4556
4557 if (listen(sock->fd, (int)backlog) < 0) {
4558 UNLOCK(&sock->lock);
4559 strerror_r(errno, strbuf, sizeof(strbuf));
4560
4561 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4562
4563 return (ISC_R_UNEXPECTED);
4564 }
4565
4566 set_tcp_fastopen(sock, backlog);
4567
4568 sock->listener = 1;
4569
4570 UNLOCK(&sock->lock);
4571 return (ISC_R_SUCCESS);
4572 }
4573
4574 /*
4575 * This should try to do aggressive accept() XXXMLG
4576 */
/*
 * Queue an accept request on a listening socket.
 *
 * Allocates the NEWCONN event and a fresh socket object for the incoming
 * connection, attaches to 'task', and enqueues the request; the watcher
 * thread performs the actual accept() and delivers the event.
 */
isc_result_t
isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
		  void *arg) {
	isc_socket_newconnev_t *dev;
	isc_socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc_socket_t *nsock;
	isc_result_t result;
	bool do_poke = false;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)isc_event_allocate(
		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/* Pre-allocate the socket object for the future connection. */
	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		/* Task is shutting down; unwind everything and bail out. */
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	isc_refcount_increment0(&nsock->references);
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	do_poke = ISC_LIST_EMPTY(sock->accept_list);
	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
	if (do_poke) {
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_ACCEPT);
	}
	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4643
4644 isc_result_t
isc_socket_connect(isc_socket_t * sock,const isc_sockaddr_t * addr,isc_task_t * task,isc_taskaction_t action,void * arg)4645 isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
4646 isc_task_t *task, isc_taskaction_t action, void *arg) {
4647 isc_socket_connev_t *dev;
4648 isc_task_t *ntask = NULL;
4649 isc_socketmgr_t *manager;
4650 int cc;
4651 char strbuf[ISC_STRERRORSIZE];
4652 char addrbuf[ISC_SOCKADDR_FORMATSIZE];
4653
4654 REQUIRE(VALID_SOCKET(sock));
4655 REQUIRE(addr != NULL);
4656 REQUIRE(task != NULL);
4657 REQUIRE(action != NULL);
4658
4659 manager = sock->manager;
4660 REQUIRE(VALID_MANAGER(manager));
4661 REQUIRE(addr != NULL);
4662
4663 if (isc_sockaddr_ismulticast(addr)) {
4664 return (ISC_R_MULTICAST);
4665 }
4666
4667 LOCK(&sock->lock);
4668
4669 dev = (isc_socket_connev_t *)isc_event_allocate(
4670 manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
4671 sizeof(*dev));
4672 ISC_LINK_INIT(dev, ev_link);
4673
4674 if (sock->connecting) {
4675 INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
4676 goto queue;
4677 }
4678
4679 if (sock->connected) {
4680 INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
4681 dev->result = ISC_R_SUCCESS;
4682 isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4683
4684 UNLOCK(&sock->lock);
4685
4686 return (ISC_R_SUCCESS);
4687 }
4688
4689 /*
4690 * Try to do the connect right away, as there can be only one
4691 * outstanding, and it might happen to complete.
4692 */
4693 sock->peer_address = *addr;
4694 cc = connect(sock->fd, &addr->type.sa, addr->length);
4695 if (cc < 0) {
4696 /*
4697 * The socket is nonblocking and the connection cannot be
4698 * completed immediately. It is possible to select(2) or
4699 * poll(2) for completion by selecting the socket for writing.
4700 * After select(2) indicates writability, use getsockopt(2) to
4701 * read the SO_ERROR option at level SOL_SOCKET to determine
4702 * whether connect() completed successfully (SO_ERROR is zero)
4703 * or unsuccessfully (SO_ERROR is one of the usual error codes
4704 * listed here, explaining the reason for the failure).
4705 */
4706 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4707 cc = 0;
4708 goto success;
4709 }
4710 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4711 goto queue;
4712 }
4713
4714 switch (errno) {
4715 #define ERROR_MATCH(a, b) \
4716 case a: \
4717 dev->result = b; \
4718 goto err_exit;
4719 ERROR_MATCH(EACCES, ISC_R_NOPERM);
4720 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4721 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4722 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4723 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4724 #ifdef EHOSTDOWN
4725 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4726 #endif /* ifdef EHOSTDOWN */
4727 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4728 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4729 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4730 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4731 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4732 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4733 #undef ERROR_MATCH
4734 }
4735
4736 sock->connected = 0;
4737
4738 strerror_r(errno, strbuf, sizeof(strbuf));
4739 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
4740 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
4741 addrbuf, errno, strbuf);
4742
4743 UNLOCK(&sock->lock);
4744 inc_stats(sock->manager->stats,
4745 sock->statsindex[STATID_CONNECTFAIL]);
4746 isc_event_free(ISC_EVENT_PTR(&dev));
4747 return (ISC_R_UNEXPECTED);
4748
4749 err_exit:
4750 sock->connected = 0;
4751 isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4752
4753 UNLOCK(&sock->lock);
4754 inc_stats(sock->manager->stats,
4755 sock->statsindex[STATID_CONNECTFAIL]);
4756 return (ISC_R_SUCCESS);
4757 }
4758
4759 /*
4760 * If connect completed, fire off the done event.
4761 */
4762 success:
4763 if (cc == 0) {
4764 sock->connected = 1;
4765 sock->bound = 1;
4766 dev->result = ISC_R_SUCCESS;
4767 isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4768
4769 UNLOCK(&sock->lock);
4770
4771 inc_stats(sock->manager->stats,
4772 sock->statsindex[STATID_CONNECT]);
4773
4774 return (ISC_R_SUCCESS);
4775 }
4776
4777 queue:
4778
4779 /*
4780 * Attach to task.
4781 */
4782 isc_task_attach(task, &ntask);
4783
4784 dev->ev_sender = ntask;
4785
4786 /*
4787 * Poke watcher here. We still have the socket locked, so there
4788 * is no race condition. We will keep the lock for such a short
4789 * bit of time waking it up now or later won't matter all that much.
4790 */
4791 bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
4792 ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
4793 if (do_poke && !sock->connecting) {
4794 sock->connecting = 1;
4795 select_poke(manager, sock->threadid, sock->fd,
4796 SELECT_POKE_CONNECT);
4797 }
4798
4799 UNLOCK(&sock->lock);
4800 return (ISC_R_SUCCESS);
4801 }
4802
4803 /*
4804 * Called when a socket with a pending connect() finishes.
4805 */
static void
internal_connect(isc_socket_t *sock) {
	isc_socket_connev_t *dev;
	int cc;
	isc_result_t result;
	socklen_t optlen;
	char strbuf[ISC_STRERRORSIZE];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	/*
	 * Get the first item off the connect list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->connect_list);
	if (dev == NULL) {
		INSIST(!sock->connecting);
		goto finish;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.  On getsockopt() failure use
	 * errno itself; on success SO_ERROR holds the connect() outcome,
	 * which we copy into errno so the checks below see one value.
	 */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
		       (void *)&optlen) != 0)
	{
		cc = errno;
	} else {
		errno = cc;
	}

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			sock->connecting = 1;
			return;
		}

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);

		/*
		 * Translate other errors into ISC_R_* flavors.
		 */
		switch (errno) {
#define ERROR_MATCH(a, b)                                                      \
	case a:                                                                \
		result = b;                                                    \
		break;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		default:
			result = ISC_R_UNEXPECTED;
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_connect: connect(%s) %s",
					 peerbuf, strbuf);
		}
	} else {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);
		result = ISC_R_SUCCESS;
		sock->connected = 1;
		sock->bound = 1;
	}

	/* Deliver the same outcome to every queued connect waiter. */
	do {
		dev->result = result;
		send_connectdone_event(sock, &dev);
		dev = ISC_LIST_HEAD(sock->connect_list);
	} while (dev != NULL);

finish:
	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
		   SELECT_POKE_CONNECT);
}
4906
4907 isc_result_t
isc_socket_getpeername(isc_socket_t * sock,isc_sockaddr_t * addressp)4908 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4909 isc_result_t result;
4910
4911 REQUIRE(VALID_SOCKET(sock));
4912 REQUIRE(addressp != NULL);
4913
4914 LOCK(&sock->lock);
4915
4916 if (sock->connected) {
4917 *addressp = sock->peer_address;
4918 result = ISC_R_SUCCESS;
4919 } else {
4920 result = ISC_R_NOTCONNECTED;
4921 }
4922
4923 UNLOCK(&sock->lock);
4924
4925 return (result);
4926 }
4927
4928 isc_result_t
isc_socket_getsockname(isc_socket_t * sock,isc_sockaddr_t * addressp)4929 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4930 socklen_t len;
4931 isc_result_t result;
4932 char strbuf[ISC_STRERRORSIZE];
4933
4934 REQUIRE(VALID_SOCKET(sock));
4935 REQUIRE(addressp != NULL);
4936
4937 LOCK(&sock->lock);
4938
4939 if (!sock->bound) {
4940 result = ISC_R_NOTBOUND;
4941 goto out;
4942 }
4943
4944 result = ISC_R_SUCCESS;
4945
4946 len = sizeof(addressp->type);
4947 if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
4948 strerror_r(errno, strbuf, sizeof(strbuf));
4949 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
4950 result = ISC_R_UNEXPECTED;
4951 goto out;
4952 }
4953 addressp->length = (unsigned int)len;
4954
4955 out:
4956 UNLOCK(&sock->lock);
4957
4958 return (result);
4959 }
4960
4961 /*
4962 * Run through the list of events on this socket, and cancel the ones
4963 * queued for task "task" of type "how". "how" is a bitmask.
4964 */
void
isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0) {
		return;
	}

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 *
	 * A NULL 'task' cancels matching requests for all tasks.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
	    !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
	    !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
	    !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				/*
				 * The socket pre-allocated for this pending
				 * accept is no longer needed; drop the
				 * reference taken in isc_socket_accept()
				 * and free it.
				 */
				isc_refcount_decrementz(
					&NEWCONNSOCK(dev)->references);
				free_socket((isc_socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendtoanddetach(&current_task,
							 ISC_EVENT_PTR(&dev),
							 sock->threadid);
			}

			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
	    !ISC_LIST_EMPTY(sock->connect_list))
	{
		isc_socket_connev_t *dev;
		isc_socket_connev_t *next;
		isc_task_t *current_task;

		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = ISC_LIST_HEAD(sock->connect_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_connectdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	UNLOCK(&sock->lock);
}
5085
5086 isc_sockettype_t
isc_socket_gettype(isc_socket_t * sock)5087 isc_socket_gettype(isc_socket_t *sock) {
5088 REQUIRE(VALID_SOCKET(sock));
5089
5090 return (sock->type);
5091 }
5092
5093 void
isc_socket_ipv6only(isc_socket_t * sock,bool yes)5094 isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
5095 #if defined(IPV6_V6ONLY)
5096 int onoff = yes ? 1 : 0;
5097 #else /* if defined(IPV6_V6ONLY) */
5098 UNUSED(yes);
5099 UNUSED(sock);
5100 #endif /* if defined(IPV6_V6ONLY) */
5101
5102 REQUIRE(VALID_SOCKET(sock));
5103 INSIST(!sock->dupped);
5104
5105 #ifdef IPV6_V6ONLY
5106 if (sock->pf == AF_INET6) {
5107 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5108 (void *)&onoff, sizeof(int)) < 0)
5109 {
5110 char strbuf[ISC_STRERRORSIZE];
5111 strerror_r(errno, strbuf, sizeof(strbuf));
5112 UNEXPECTED_ERROR(__FILE__, __LINE__,
5113 "setsockopt(%d, IPV6_V6ONLY) failed: "
5114 "%s",
5115 sock->fd, strbuf);
5116 }
5117 }
5118 #endif /* ifdef IPV6_V6ONLY */
5119 }
5120
5121 static void
setdscp(isc_socket_t * sock,isc_dscp_t dscp)5122 setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
5123 #if defined(IP_TOS) || defined(IPV6_TCLASS)
5124 int value = dscp << 2;
5125 #endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
5126
5127 sock->dscp = dscp;
5128
5129 #ifdef IP_TOS
5130 if (sock->pf == AF_INET) {
5131 if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
5132 sizeof(value)) < 0) {
5133 char strbuf[ISC_STRERRORSIZE];
5134 strerror_r(errno, strbuf, sizeof(strbuf));
5135 UNEXPECTED_ERROR(__FILE__, __LINE__,
5136 "setsockopt(%d, IP_TOS, %.02x) "
5137 "failed: %s",
5138 sock->fd, value >> 2, strbuf);
5139 }
5140 }
5141 #endif /* ifdef IP_TOS */
5142 #ifdef IPV6_TCLASS
5143 if (sock->pf == AF_INET6) {
5144 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
5145 (void *)&value, sizeof(value)) < 0)
5146 {
5147 char strbuf[ISC_STRERRORSIZE];
5148 strerror_r(errno, strbuf, sizeof(strbuf));
5149 UNEXPECTED_ERROR(__FILE__, __LINE__,
5150 "setsockopt(%d, IPV6_TCLASS, %.02x) "
5151 "failed: %s",
5152 sock->fd, dscp >> 2, strbuf);
5153 }
5154 }
5155 #endif /* ifdef IPV6_TCLASS */
5156 }
5157
/*
 * Public entry point for setting the DSCP value (0..63) on 'sock'.
 * A negative value is ignored; once a process-wide check value has been
 * established it must never change.
 */
void
isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40); /* DSCP is a six-bit field */

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

#ifdef notyet
	REQUIRE(!sock->dupped);
#endif /* ifdef notyet */

	setdscp(sock, dscp);
}
5182
5183 isc_socketevent_t *
isc_socket_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)5184 isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
5185 isc_taskaction_t action, void *arg) {
5186 return (allocate_socketevent(mctx, sender, eventtype, action, arg));
5187 }
5188
5189 void
isc_socket_setname(isc_socket_t * sock,const char * name,void * tag)5190 isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
5191 /*
5192 * Name 'sock'.
5193 */
5194
5195 REQUIRE(VALID_SOCKET(sock));
5196
5197 LOCK(&sock->lock);
5198 strlcpy(sock->name, name, sizeof(sock->name));
5199 sock->tag = tag;
5200 UNLOCK(&sock->lock);
5201 }
5202
5203 const char *
isc_socket_getname(isc_socket_t * sock)5204 isc_socket_getname(isc_socket_t *sock) {
5205 return (sock->name);
5206 }
5207
5208 void *
isc_socket_gettag(isc_socket_t * sock)5209 isc_socket_gettag(isc_socket_t *sock) {
5210 return (sock->tag);
5211 }
5212
5213 int
isc_socket_getfd(isc_socket_t * sock)5214 isc_socket_getfd(isc_socket_t *sock) {
5215 return ((short)sock->fd);
5216 }
5217
5218 static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
5219 static bool hasreuseport = false;
5220
/*
 * One-shot initializer for isc_socket_hasreuseport(): probe at runtime
 * whether the load-balancing port-reuse option can actually be set on a
 * datagram socket, and record the result in 'hasreuseport'.
 *
 * Fix: declare the prototype as (void) rather than the pre-C23
 * unspecified-arguments form ().
 */
static void
init_hasreuseport(void) {
/*
 * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
 * We only want to use it on Linux, if it's available. On BSD we want to dup()
 * sockets instead of re-binding them.
 */
#if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
	int sock, yes = 1;
	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock < 0) {
		/* No IPv4 support here; try an IPv6 probe socket instead. */
		sock = socket(AF_INET6, SOCK_DGRAM, 0);
		if (sock < 0) {
			return;
		}
	}
	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
		       sizeof(yes)) < 0) {
		close(sock);
		return;
#if defined(__FreeBSD_kernel__)
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
			      sizeof(yes)) < 0)
#else /* if defined(__FreeBSD_kernel__) */
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
			      sizeof(yes)) < 0)
#endif /* if defined(__FreeBSD_kernel__) */
	{
		close(sock);
		return;
	}
	hasreuseport = true;
	close(sock);
#endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
}
5258
5259 bool
isc_socket_hasreuseport()5260 isc_socket_hasreuseport() {
5261 RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
5262 ISC_R_SUCCESS);
5263 return (hasreuseport);
5264 }
5265
#if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
/*
 * Map a socket type to the fixed label used in statistics output.
 * Unknown/unset types render as "not-initialized".
 */
static const char *
_socktype(isc_sockettype_t type) {
	if (type == isc_sockettype_udp) {
		return ("udp");
	} else if (type == isc_sockettype_tcp) {
		return ("tcp");
	} else if (type == isc_sockettype_unix) {
		return ("unix");
	} else {
		return ("not-initialized");
	}
}
#endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
5281
#ifdef HAVE_LIBXML2
/*
 * Evaluate 'a' (an xmlTextWriter call); on failure (negative return),
 * jump to the enclosing function's 'error' label with the failure code
 * left in the local 'xmlrc'.
 */
#define TRY0(a)                     \
	do {                        \
		xmlrc = (a);        \
		if (xmlrc < 0)      \
			goto error; \
	} while (0)
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
	/*
	 * Render the manager's socket list as XML statistics into the
	 * xmlTextWriter passed (type-erased) in 'writer0'.
	 *
	 * Returns the last xmlTextWriter result code: >= 0 on success,
	 * < 0 on a writer error.  The manager lock is held for the whole
	 * traversal; each socket's lock is held while its fields are
	 * written.
	 */
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	int xmlrc;
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		/*
		 * Lock the socket first: any TRY0 failure below reaches
		 * 'error:' with 'sock' non-NULL, and the error path
		 * unlocks it there.
		 */
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		/* The socket's address doubles as its unique id. */
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR peerbuf));
		}

		/* Local address is best-effort; skipped on failure. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR peerbuf));
		}

		/* One <state> element per flag that is currently set. */
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	/* NULL here on full success or on a failure between sockets. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
#endif /* HAVE_LIBXML2 */
5386
#ifdef HAVE_JSON_C
/*
 * If the json-c allocation 'm' failed, record ISC_R_NOMEMORY in the
 * local 'result' and jump to the enclosing function's 'error' label.
 */
#define CHECKMEM(m)                              \
	do {                                     \
		if (m == NULL) {                 \
			result = ISC_R_NOMEMORY; \
			goto error;              \
		}                                \
	} while (0)

isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
	/*
	 * Render the manager's socket list as JSON statistics into the
	 * json_object passed (type-erased) in 'stats0'.
	 *
	 * Returns ISC_R_SUCCESS, or ISC_R_NOMEMORY if a json-c allocation
	 * fails (in which case any partial output is discarded and 'stats'
	 * is left unmodified).
	 */
	isc_result_t result = ISC_R_SUCCESS;
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	json_object *obj, *array = json_object_new_array();
	json_object *stats = (json_object *)stats0;

	/*
	 * Check the array allocation before taking any lock: the 'error'
	 * path unconditionally unlocks 'mgr->lock', so jumping there from
	 * this point would have unlocked a mutex that was never locked.
	 */
	if (array == NULL) {
		return (ISC_R_NOMEMORY);
	}

	LOCK(&mgr->lock);

	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		json_object *states, *entry;
		char buf[255];

		/*
		 * Take the socket lock before the first per-socket
		 * allocation so that every CHECKMEM() failure inside this
		 * loop reaches 'error:' with the lock held; the error path
		 * unlocks 'sock->lock' whenever 'sock' is non-NULL.
		 * (Previously 'entry' was allocated before LOCK(), so a
		 * failure there unlocked a mutex that was not locked.)
		 */
		LOCK(&sock->lock);

		entry = json_object_new_object();
		CHECKMEM(entry);
		/* 'array' owns 'entry' from here on. */
		json_object_array_add(array, entry);

		/* The socket's address doubles as its unique id. */
		snprintf(buf, sizeof(buf), "%p", sock);
		obj = json_object_new_string(buf);
		CHECKMEM(obj);
		json_object_object_add(entry, "id", obj);

		if (sock->name[0] != 0) {
			obj = json_object_new_string(sock->name);
			CHECKMEM(obj);
			json_object_object_add(entry, "name", obj);
		}

		obj = json_object_new_int(
			(int)isc_refcount_current(&sock->references));
		CHECKMEM(obj);
		json_object_object_add(entry, "references", obj);

		obj = json_object_new_string(_socktype(sock->type));
		CHECKMEM(obj);
		json_object_object_add(entry, "type", obj);

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "peer-address", obj);
		}

		/* Local address is best-effort; skipped on failure. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "local-address", obj);
		}

		/* One array element per state flag currently set. */
		states = json_object_new_array();
		CHECKMEM(states);
		json_object_object_add(entry, "states", states);

		if (sock->listener) {
			obj = json_object_new_string("listener");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connected) {
			obj = json_object_new_string("connected");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connecting) {
			obj = json_object_new_string("connecting");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->bound) {
			obj = json_object_new_string("bound");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}

	/* Success: transfer ownership of the whole array to 'stats'. */
	json_object_object_add(stats, "sockets", array);
	array = NULL;
	result = ISC_R_SUCCESS;

error:
	if (array != NULL) {
		/* Partial results: drop the array and everything it owns. */
		json_object_put(array);
	}

	if (sock != NULL) {
		/* We bailed out mid-loop with this socket still locked. */
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (result);
}
#endif /* HAVE_JSON_C */
5506