1 /*
2 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3 *
4 * This Source Code Form is subject to the terms of the Mozilla Public
5 * License, v. 2.0. If a copy of the MPL was not distributed with this
6 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
7 *
8 * See the COPYRIGHT file distributed with this work for additional
9 * information regarding copyright ownership.
10 */
11
12 /*! \file */
13
14 #include <inttypes.h>
15 #include <stdbool.h>
16 #include <sys/param.h>
17 #include <sys/socket.h>
18 #include <sys/stat.h>
19 #include <sys/types.h>
20 #if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
21 #include <sys/sysctl.h>
22 #endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
23 #include <sys/time.h>
24 #include <sys/uio.h>
25
26 #if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
27 #include <linux/netlink.h>
28 #include <linux/rtnetlink.h>
29 #endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
30 */
31
32 #include <errno.h>
33 #include <fcntl.h>
34 #include <stddef.h>
35 #include <stdlib.h>
36 #include <sys/un.h>
37 #include <unistd.h>
38
39 #include <isc/app.h>
40 #include <isc/buffer.h>
41 #include <isc/condition.h>
42 #include <isc/formatcheck.h>
43 #include <isc/list.h>
44 #include <isc/log.h>
45 #include <isc/mem.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/print.h>
50 #include <isc/refcount.h>
51 #include <isc/region.h>
52 #include <isc/resource.h>
53 #include <isc/socket.h>
54 #include <isc/stats.h>
55 #include <isc/strerr.h>
56 #include <isc/string.h>
57 #include <isc/task.h>
58 #include <isc/thread.h>
59 #include <isc/util.h>
60
61 #ifdef HAVE_KQUEUE
62 #include <sys/event.h>
63 #endif /* ifdef HAVE_KQUEUE */
64 #ifdef HAVE_EPOLL_CREATE1
65 #include <sys/epoll.h>
66 #endif /* ifdef HAVE_EPOLL_CREATE1 */
67 #if defined(HAVE_SYS_DEVPOLL_H)
68 #include <sys/devpoll.h>
69 #elif defined(HAVE_DEVPOLL_H)
70 #include <devpoll.h>
71 #endif /* if defined(HAVE_SYS_DEVPOLL_H) */
72
73 #include <netinet/tcp.h>
74
75 #include "errno2result.h"
76 #include "socket_p.h"
77
78 #ifdef ENABLE_TCP_FASTOPEN
79 #include <netinet/tcp.h>
80 #endif /* ifdef ENABLE_TCP_FASTOPEN */
81
82 #ifdef HAVE_JSON_C
83 #include <json_object.h>
84 #endif /* HAVE_JSON_C */
85
86 #ifdef HAVE_LIBXML2
87 #include <libxml/xmlwriter.h>
88 #define ISC_XMLCHAR (const xmlChar *)
89 #endif /* HAVE_LIBXML2 */
90
91 /*%
92 * Choose the most preferable multiplex method.
93 */
94 #if defined(HAVE_KQUEUE)
95 #define USE_KQUEUE
96 #elif defined(HAVE_EPOLL_CREATE1)
97 #define USE_EPOLL
98 #elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
99 #define USE_DEVPOLL
100 typedef struct {
101 unsigned int want_read : 1, want_write : 1;
102 } pollinfo_t;
103 #else /* if defined(HAVE_KQUEUE) */
104 #define USE_SELECT
105 #endif /* HAVE_KQUEUE */
106
107 /*
108 * Set by the -T dscp option on the command line. If set to a value
109 * other than -1, we check to make sure DSCP values match it, and
110 * assert if not.
111 */
112 int isc_dscp_check_value = -1;
113
114 /*%
115 * Maximum number of allowable open sockets. This is also the maximum
116 * allowable socket file descriptor.
117 *
118 * Care should be taken before modifying this value for select():
119 * The API standard doesn't ensure select() accept more than (the system default
120 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
121 * the vast majority of cases. This constant should therefore be increased only
122 * when absolutely necessary and possible, i.e., the server is exhausting all
123 * available file descriptors (up to FD_SETSIZE) and the select() function
124 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
125 * always by true, but we keep using some of them to ensure as much
126 * portability as possible). Note also that overall server performance
127 * may be rather worsened with a larger value of this constant due to
128 * inherent scalability problems of select().
129 *
130 * As a special note, this value shouldn't have to be touched if
131 * this is a build for an authoritative only DNS server.
132 */
133 #ifndef ISC_SOCKET_MAXSOCKETS
134 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
135 #ifdef TUNE_LARGE
136 #define ISC_SOCKET_MAXSOCKETS 21000
137 #else /* ifdef TUNE_LARGE */
138 #define ISC_SOCKET_MAXSOCKETS 4096
139 #endif /* TUNE_LARGE */
140 #elif defined(USE_SELECT)
141 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
142 #endif /* USE_KQUEUE... */
143 #endif /* ISC_SOCKET_MAXSOCKETS */
144
145 #ifdef USE_SELECT
146 /*%
147 * Mac OS X needs a special definition to support larger values in select().
148 * We always define this because a larger value can be specified run-time.
149 */
150 #ifdef __APPLE__
151 #define _DARWIN_UNLIMITED_SELECT
152 #endif /* __APPLE__ */
153 #endif /* USE_SELECT */
154
155 #ifdef ISC_SOCKET_USE_POLLWATCH
156 /*%
157 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
158 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
159 * some of the specified FD. The idea is based on the observation that it's
160 * likely for a busy server to keep receiving packets. It specifically works
161 * as follows: the socket watcher is first initialized with the state of
162 * "poll_idle". While it's in the idle state it keeps sleeping until a socket
163 * event occurs. When it wakes up for a socket I/O event, it moves to the
164 * poll_active state, and sets the poll timeout to a short period
165 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the
166 * watcher goes to the poll_checking state with the same timeout period.
167 * In this state, the watcher tries to detect whether this is a break
168 * during intermittent events or the kernel bug is triggered. If the next
169 * polling reports an event within the short period, the previous timeout is
170 * likely to be a kernel bug, and so the watcher goes back to the active state.
171 * Otherwise, it moves to the idle state again.
172 *
173 * It's not clear whether this is a thread-related bug, but since we've only
174 * seen this with threads, this workaround is used only when enabling threads.
175 */
176
177 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
178
179 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
180 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
181 #endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
182 #endif /* ISC_SOCKET_USE_POLLWATCH */
183
184 /*%
185 * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
186 */
187 #define FDLOCK_BITS 10
188 #define FDLOCK_COUNT (1 << FDLOCK_BITS)
189 #define FDLOCK_ID(fd) \
190 (((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
191 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
192
193 /*%
194 * Maximum number of events communicated with the kernel. There should normally
195 * be no need for having a large number.
196 */
197 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
198 #ifndef ISC_SOCKET_MAXEVENTS
199 #ifdef TUNE_LARGE
200 #define ISC_SOCKET_MAXEVENTS 2048
201 #else /* ifdef TUNE_LARGE */
202 #define ISC_SOCKET_MAXEVENTS 64
203 #endif /* TUNE_LARGE */
204 #endif /* ifndef ISC_SOCKET_MAXEVENTS */
205 #endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
206 * */
207
208 /*%
209 * Some systems define the socket length argument as an int, some as size_t,
210 * some as socklen_t. This is here so it can be easily changed if needed.
211 */
212 #ifndef socklen_t
213 #define socklen_t unsigned int
214 #endif /* ifndef socklen_t */
215
216 /*%
217 * Define what the possible "soft" errors can be. These are non-fatal returns
218 * of various network related functions, like recv() and so on.
219 *
220 * For some reason, BSDI (and perhaps others) will sometimes return <0
221 * from recv() but will have errno==0. This is broken, but we have to
222 * work around it here.
223 */
224 #define SOFT_ERROR(e) \
225 ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
226 (e) == EINTR || (e) == 0)
227
228 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
229
230 /*!<
231 * DLVL(90) -- Function entry/exit and other tracing.
232 * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
233 * DLVL(60) -- Socket data send/receive
234 * DLVL(50) -- Event tracing, including receiving/sending completion events.
235 * DLVL(20) -- Socket creation/destruction.
236 */
237 #define TRACE_LEVEL 90
238 #define CORRECTNESS_LEVEL 70
239 #define IOEVENT_LEVEL 60
240 #define EVENT_LEVEL 50
241 #define CREATION_LEVEL 20
242
243 #define TRACE DLVL(TRACE_LEVEL)
244 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
245 #define IOEVENT DLVL(IOEVENT_LEVEL)
246 #define EVENT DLVL(EVENT_LEVEL)
247 #define CREATION DLVL(CREATION_LEVEL)
248
249 typedef isc_event_t intev_t;
250
251 #define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
252 #define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
253
254 /*!
255 * IPv6 control information. If the socket is an IPv6 socket we want
256 * to collect the destination address and interface so the client can
257 * set them on outgoing packets.
258 */
259 #ifndef USE_CMSG
260 #define USE_CMSG 1
261 #endif /* ifndef USE_CMSG */
262
263 /*%
264 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have
265 * a setsockopt() like interface to request timestamps, and if the OS
266 * doesn't do it for us, call gettimeofday() on every UDP receive?
267 */
268 #ifdef SO_TIMESTAMP
269 #ifndef USE_CMSG
270 #define USE_CMSG 1
271 #endif /* ifndef USE_CMSG */
272 #endif /* ifdef SO_TIMESTAMP */
273
274 #if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
275 #define SET_RCVBUF
276 #endif
277
278 #if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
279 #define SET_SNDBUF
280 #endif
281
282 /*%
283 * Instead of calculating the cmsgbuf lengths every time we take
284 * a rule of thumb approach - sizes are taken from x86_64 linux,
285 * multiplied by 2, everything should fit. Those sizes are not
286 * large enough to cause any concern.
287 */
288 #if defined(USE_CMSG)
289 #define CMSG_SP_IN6PKT 40
290 #else /* if defined(USE_CMSG) */
291 #define CMSG_SP_IN6PKT 0
292 #endif /* if defined(USE_CMSG) */
293
294 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
295 #define CMSG_SP_TIMESTAMP 32
296 #else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
297 #define CMSG_SP_TIMESTAMP 0
298 #endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
299
300 #if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
301 #define CMSG_SP_TCTOS 24
302 #else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
303 #define CMSG_SP_TCTOS 0
304 #endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
305
306 #define CMSG_SP_INT 24
307
308 /* Align cmsg buffers to be safe on SPARC etc. */
309 #define RECVCMSGBUFLEN \
310 ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
311 1, \
312 sizeof(void *))
313 #define SENDCMSGBUFLEN \
314 ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
315 sizeof(void *))
316
317 /*%
318 * The number of times a send operation is repeated if the result is EINTR.
319 */
320 #define NRETRIES 10
321
322 typedef struct isc__socketthread isc__socketthread_t;
323
324 #define NEWCONNSOCK(ev) ((ev)->newsocket)
325
struct isc_socket {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MAGIC; checked via VALID_SOCKET() */
	isc_socketmgr_t *manager; /* owning manager */
	isc_mutex_t lock;
	isc_sockettype_t type;
	const isc_statscounter_t *statsindex; /* per-type stats table, one of
					       * the *statsindex arrays below */
	isc_refcount_t references;

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t) link; /* chain in manager->socklist */
	int fd;	      /* OS descriptor */
	int pf;	      /* protocol family passed to socket_create() */
	int threadid; /* watcher-thread index; see gen_threadid() */
	char name[16];
	void *tag;

	/* Queued operations; each entry carries its completion event. */
	ISC_LIST(isc_socketevent_t) send_list;
	ISC_LIST(isc_socketevent_t) recv_list;
	ISC_LIST(isc_socket_newconnev_t) accept_list;
	ISC_LIST(isc_socket_connev_t) connect_list;

	isc_sockaddr_t peer_address; /* remote address */

	unsigned int listener : 1,	       /* listener socket */
		connected : 1, connecting : 1, /* connect pending
						* */
		bound : 1,   /* bound to local addr */
		active : 1,  /* currently active */
		pktdscp : 1; /* per packet dscp */

#ifdef ISC_PLATFORM_RECVOVERFLOW
	unsigned char overflow; /* used for MSG_TRUNC fake */
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	unsigned int dscp;
};
363
364 #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
365 #define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
366
struct isc_socketmgr {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MANAGER_MAGIC; see VALID_MANAGER() */
	isc_mem_t *mctx;
	isc_mutex_t lock;
	isc_stats_t *stats; /* may be NULL; see inc_stats()/dec_stats() */
	int nthreads;	    /* number of watcher threads */
	isc__socketthread_t *threads; /* one state struct per watcher thread */
	unsigned int maxsocks;	      /* max open sockets / max fd value */
	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t) socklist;
	int reserved; /* unlocked */
	isc_condition_t shutdown_ok;
	size_t maxudp;
};
382
/*
 * Per-watcher-thread state.  Exactly one of the USE_* sections below is
 * compiled in, matching the multiplex method selected at the top of the
 * file.
 */
struct isc__socketthread {
	isc_socketmgr_t *manager; /* back-pointer to the owning manager */
	int threadid;		  /* index of this thread in manager->threads */
	isc_thread_t thread;
	int pipe_fds[2]; /* self-pipe: [1] written by select_poke(),
			  * [0] read by select_readmsg() */
	isc_mutex_t *fdlock; /* FDLOCK_COUNT lock buckets; see FDLOCK_ID() */
	/* Locked by fdlock. */
	isc_socket_t **fds; /* fd -> socket map */
	int *fdstate;	    /* fd -> CLOSED/MANAGED/CLOSE_PENDING */
#ifdef USE_KQUEUE
	int kqueue_fd;
	int nevents;
	struct kevent *events;
#endif /* USE_KQUEUE */
#ifdef USE_EPOLL
	int epoll_fd;
	int nevents;
	struct epoll_event *events;
	uint32_t *epoll_events; /* fd -> EPOLLIN/EPOLLOUT mask currently
				 * registered with the kernel */
#endif /* USE_EPOLL */
#ifdef USE_DEVPOLL
	int devpoll_fd;
	isc_resourcevalue_t open_max;
	unsigned int calls;
	int nevents;
	struct pollfd *events;
	pollinfo_t *fdpollinfo; /* which directions are armed; /dev/poll
				 * cannot be queried, so we must remember */
#endif /* USE_DEVPOLL */
#ifdef USE_SELECT
	int fd_bufsize;
	fd_set *read_fds;
	fd_set *read_fds_copy;
	fd_set *write_fds;
	fd_set *write_fds_copy;
	int maxfd;
#endif /* USE_SELECT */
};
420
421 #define CLOSED 0 /* this one must be zero */
422 #define MANAGED 1
423 #define CLOSE_PENDING 2
424
425 /*
426 * send() and recv() iovec counts
427 */
428 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
429 #ifdef ISC_PLATFORM_RECVOVERFLOW
430 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
431 #else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
432 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
433 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
434
435 static isc_result_t
436 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
437 isc_socket_t **socketp);
438 static void
439 send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
440 static void
441 send_senddone_event(isc_socket_t *, isc_socketevent_t **);
442 static void
443 send_connectdone_event(isc_socket_t *, isc_socket_connev_t **);
444 static void
445 free_socket(isc_socket_t **);
446 static isc_result_t
447 allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **);
448 static void
449 destroy(isc_socket_t **);
450 static void
451 internal_accept(isc_socket_t *);
452 static void
453 internal_connect(isc_socket_t *);
454 static void
455 internal_recv(isc_socket_t *);
456 static void
457 internal_send(isc_socket_t *);
458 static void
459 process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
460 static void
461 build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
462 struct iovec *, size_t *);
463 static void
464 build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
465 struct iovec *, size_t *);
466 static bool
467 process_ctlfd(isc__socketthread_t *thread);
468 static void
469 setdscp(isc_socket_t *sock, isc_dscp_t dscp);
470
471 #define SELECT_POKE_SHUTDOWN (-1)
472 #define SELECT_POKE_NOTHING (-2)
473 #define SELECT_POKE_READ (-3)
474 #define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */
475 #define SELECT_POKE_WRITE (-4)
476 #define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */
477 #define SELECT_POKE_CLOSE (-5)
478
479 /*%
480 * Shortcut index arrays to get access to statistics counters.
481 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
/*
 * Entries of -1 mark operations that do not exist for the socket type
 * (e.g. accept on UDP/raw sockets).  inc_stats()/dec_stats() REQUIRE a
 * counter id != -1, so those slots must never be used.
 */
static const isc_statscounter_t udp4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
};
static const isc_statscounter_t udp6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
};
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1,
	-1,
	-1,
	-1,
	-1,
	-1,
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
558
559 static int
560 gen_threadid(isc_socket_t *sock);
561
562 static int
gen_threadid(isc_socket_t * sock)563 gen_threadid(isc_socket_t *sock) {
564 return (sock->fd % sock->manager->nthreads);
565 }
566
567 static void
568 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
569 isc_logmodule_t *module, int level, const char *fmt, ...)
570 ISC_FORMAT_PRINTF(5, 6);
571 static void
manager_log(isc_socketmgr_t * sockmgr,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)572 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
573 isc_logmodule_t *module, int level, const char *fmt, ...) {
574 char msgbuf[2048];
575 va_list ap;
576
577 if (!isc_log_wouldlog(isc_lctx, level)) {
578 return;
579 }
580
581 va_start(ap, fmt);
582 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
583 va_end(ap);
584
585 isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
586 sockmgr, msgbuf);
587 }
588
589 static void
590 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
591 isc_logmodule_t *module, int level, const char *fmt, ...)
592 ISC_FORMAT_PRINTF(5, 6);
593 static void
thread_log(isc__socketthread_t * thread,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)594 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
595 isc_logmodule_t *module, int level, const char *fmt, ...) {
596 char msgbuf[2048];
597 va_list ap;
598
599 if (!isc_log_wouldlog(isc_lctx, level)) {
600 return;
601 }
602
603 va_start(ap, fmt);
604 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
605 va_end(ap);
606
607 isc_log_write(isc_lctx, category, module, level,
608 "sockmgr %p thread %d: %s", thread->manager,
609 thread->threadid, msgbuf);
610 }
611
612 static void
613 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
614 isc_logcategory_t *category, isc_logmodule_t *module, int level,
615 const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
616 static void
socket_log(isc_socket_t * sock,const isc_sockaddr_t * address,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)617 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
618 isc_logcategory_t *category, isc_logmodule_t *module, int level,
619 const char *fmt, ...) {
620 char msgbuf[2048];
621 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
622 va_list ap;
623
624 if (!isc_log_wouldlog(isc_lctx, level)) {
625 return;
626 }
627
628 va_start(ap, fmt);
629 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
630 va_end(ap);
631
632 if (address == NULL) {
633 isc_log_write(isc_lctx, category, module, level,
634 "socket %p: %s", sock, msgbuf);
635 } else {
636 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
637 isc_log_write(isc_lctx, category, module, level,
638 "socket %p %s: %s", sock, peerbuf, msgbuf);
639 }
640 }
641
642 /*%
643 * Increment socket-related statistics counters.
644 */
645 static inline void
inc_stats(isc_stats_t * stats,isc_statscounter_t counterid)646 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
647 REQUIRE(counterid != -1);
648
649 if (stats != NULL) {
650 isc_stats_increment(stats, counterid);
651 }
652 }
653
654 /*%
655 * Decrement socket-related statistics counters.
656 */
657 static inline void
dec_stats(isc_stats_t * stats,isc_statscounter_t counterid)658 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
659 REQUIRE(counterid != -1);
660
661 if (stats != NULL) {
662 isc_stats_decrement(stats, counterid);
663 }
664 }
665
/*%
 * Arm the thread's event mechanism so that 'fd' is watched for
 * readability (msg == SELECT_POKE_READ) or writability (any other msg).
 * Exactly one backend below is compiled in, chosen by the USE_* macros
 * at the top of the file.  Returns ISC_R_SUCCESS or a result translated
 * from errno.
 */
static inline isc_result_t
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	uint32_t oldevents;
	int ret;
	int op;

	/*
	 * epoll_events[] mirrors the mask currently registered with the
	 * kernel; a zero old mask means the fd is not registered yet, so
	 * we must ADD rather than MOD.
	 */
	oldevents = thread->epoll_events[fd];
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] |= EPOLLIN;
	} else {
		thread->epoll_events[fd] |= EPOLLOUT;
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	/* Hold the socket's lock (if it still exists) across epoll_ctl(). */
	if (thread->fds[fd] != NULL) {
		LOCK(&thread->fds[fd]->lock);
	}
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	if (thread->fds[fd] != NULL) {
		UNLOCK(&thread->fds[fd]->lock);
	}
	if (ret == -1) {
		if (errno == EEXIST) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d",
					 fd);
		}
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ) {
		pfd.events = POLLIN;
	} else {
		pfd.events = POLLOUT;
	}
	pfd.fd = fd;
	pfd.revents = 0;
	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
		result = isc__errno2result(errno);
	} else {
		/*
		 * Remember what we armed so unwatch_fd() can re-arm the
		 * other direction after POLLREMOVE (/dev/poll cannot be
		 * queried).
		 */
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 1;
		} else {
			thread->fdpollinfo[fd].want_write = 1;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	/* The fd sets are shared; guard them with the manager lock. */
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_SET(fd, thread->read_fds);
	}
	if (msg == SELECT_POKE_WRITE) {
		FD_SET(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
757
/*%
 * Stop watching 'fd' for readability (msg == SELECT_POKE_READ) or
 * writability (any other msg) in this thread's event mechanism.
 * Counterpart of watch_fd(); returns ISC_R_SUCCESS or a result
 * translated from errno.
 */
static inline isc_result_t
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	int ret;
	int op;

	/* Clear the direction being canceled from the registered mask. */
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] &= ~(EPOLLIN);
	} else {
		thread->epoll_events[fd] &= ~(EPOLLOUT);
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	/* Deregister entirely once no direction remains. */
	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	/* ENOENT (already gone, e.g. fd closed) is tolerated silently. */
	if (ret == -1 && errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
				 fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
		result = isc__errno2result(errno);
	} else {
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 0;
		} else {
			thread->fdpollinfo[fd].want_write = 0;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	/* The fd sets are shared; guard them with the manager lock. */
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_CLR(fd, thread->read_fds);
	} else if (msg == SELECT_POKE_WRITE) {
		FD_CLR(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
850
/*
 * A poke message was received, perform a proper watch/unwatch
 * on a fd provided.  'msg' is one of the SELECT_POKE_* values:
 * SELECT_POKE_CLOSE finishes a pending close, anything else arms
 * the fd for the requested I/O direction via watch_fd().
 */
static void
wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* Complete the deferred close: unwatch both directions,
		 * mark the slot CLOSED, and close the descriptor. */
		LOCK(&thread->fdlock[lockid]);
		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
		thread->fdstate[fd] = CLOSED;
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 *
		 * NOTE(review): the calls below actually run while fdlock is
		 * still held, contradicting the paragraph above — confirm the
		 * intended lock ordering against unwatch_fd()'s backends.
		 */
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}
	if (thread->fdstate[fd] != MANAGED) {
		/* Stale poke for an fd we no longer manage; ignore. */
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	/*
	 * Set requested bit.
	 */
	result = watch_fd(thread, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s", fd,
			      isc_result_totext(result));
	}
	UNLOCK(&thread->fdlock[lockid]);
}
916
/*
 * Poke the select loop when there is something for us to do.
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
 *
 * Sends a two-int {fd, msg} message down the target thread's self-pipe;
 * select_readmsg() decodes it on the other end.  Retries on soft errors
 * and aborts the process if the pipe write fails hard.
 */
static void
select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
	int cc;
	int buf[2];
	char strbuf[ISC_STRERRORSIZE];

	buf[0] = fd;
	buf[1] = msg;

	do {
		cc = write(mgr->threads[threadid].pipe_fds[1], buf,
			   sizeof(buf));
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif /* ifdef ENOSR */
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    "write() failed during watcher poke: %s", strbuf);
	}

	/* A successful pipe write of this size is never partial. */
	INSIST(cc == sizeof(buf));
}
954
/*
 * Read a message on the internal fd.
 *
 * Decodes one {fd, msg} pair written by select_poke().  On a soft read
 * error, reports SELECT_POKE_NOTHING with *fd = -1 so the caller simply
 * goes back to polling; a hard error is fatal.
 */
static void
select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
	int buf[2];
	int cc;
	char strbuf[ISC_STRERRORSIZE];

	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
	if (cc < 0) {
		*msg = SELECT_POKE_NOTHING;
		*fd = -1; /* Silence compiler. */
		if (SOFT_ERROR(errno)) {
			return;
		}

		strerror_r(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    "read() failed during watcher poke: %s", strbuf);
	}
	/* Pipe messages of this size arrive whole, never split. */
	INSIST(cc == sizeof(buf));

	*fd = buf[0];
	*msg = buf[1];
}
981
982 /*
983 * Make a fd non-blocking.
984 */
985 static isc_result_t
make_nonblock(int fd)986 make_nonblock(int fd) {
987 int ret;
988 char strbuf[ISC_STRERRORSIZE];
989 #ifdef USE_FIONBIO_IOCTL
990 int on = 1;
991 #else /* ifdef USE_FIONBIO_IOCTL */
992 int flags;
993 #endif /* ifdef USE_FIONBIO_IOCTL */
994
995 #ifdef USE_FIONBIO_IOCTL
996 ret = ioctl(fd, FIONBIO, (char *)&on);
997 #else /* ifdef USE_FIONBIO_IOCTL */
998 flags = fcntl(fd, F_GETFL, 0);
999 flags |= O_NONBLOCK;
1000 ret = fcntl(fd, F_SETFL, flags);
1001 #endif /* ifdef USE_FIONBIO_IOCTL */
1002
1003 if (ret == -1) {
1004 strerror_r(errno, strbuf, sizeof(strbuf));
1005 UNEXPECTED_ERROR(__FILE__, __LINE__,
1006 #ifdef USE_FIONBIO_IOCTL
1007 "ioctl(%d, FIONBIO, &on): %s", fd,
1008 #else /* ifdef USE_FIONBIO_IOCTL */
1009 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1010 #endif /* ifdef USE_FIONBIO_IOCTL */
1011 strbuf);
1012
1013 return (ISC_R_UNEXPECTED);
1014 }
1015
1016 return (ISC_R_SUCCESS);
1017 }
1018
1019 #ifdef USE_CMSG
1020 /*
1021 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1022 * In order to ensure as much portability as possible, we provide wrapper
1023 * functions of these macros.
1024 * Note that cmsg_space() could run slow on OSes that do not have
1025 * CMSG_SPACE.
1026 */
/*
 * Portable equivalent of CMSG_LEN(): the value to store in cmsg_len
 * for a control message carrying 'len' bytes of data (header plus
 * data, without trailing alignment padding).
 */
static inline socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else /* ifdef CMSG_LEN */
	socklen_t hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif /* ifdef CMSG_LEN */
}
1042
/*
 * Portable equivalent of CMSG_SPACE(): the total buffer space a
 * control message with 'len' bytes of data occupies, including
 * alignment padding.  The fallback lays out a dummy message and asks
 * CMSG_NXTHDR where the next header would start.
 */
static inline socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else /* ifdef CMSG_SPACE */
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL) {
		/* Distance from the buffer start to the next header. */
		return ((char *)cmsgp - (char *)msg.msg_control);
	} else {
		/* A message of this size does not fit the dummy buffer. */
		return (0);
	}
#endif /* ifdef CMSG_SPACE */
}
1071 #endif /* USE_CMSG */
1072
/*
 * Process control messages received on a socket.
 *
 * Copies interesting ancillary data from 'msg' into 'dev':
 * IPV6_PKTINFO (receiving interface/address; also flags multicast
 * destinations), SCM_TIMESTAMP (kernel receive time), and
 * IPV6_TCLASS / IP_TOS (DSCP, the upper six bits of the octet).
 * Also latches MSG_TRUNC/MSG_CTRUNC into the event's attributes.
 */
static void
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
	struct in6_pktinfo *pktinfop;
#ifdef SO_TIMESTAMP
	void *timevalp;
#endif /* ifdef SO_TIMESTAMP */
#endif /* ifdef USE_CMSG */

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

#ifdef MSG_TRUNC
	/* The datagram was larger than the supplied buffer. */
	if ((msg->msg_flags & MSG_TRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
	}
#endif /* ifdef MSG_TRUNC */

#ifdef MSG_CTRUNC
	/* The control (ancillary) data was truncated. */
	if ((msg->msg_flags & MSG_CTRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
	}
#endif /* ifdef MSG_CTRUNC */

#ifndef USE_CMSG
	return;
#else /* ifndef USE_CMSG */
	if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
		return;
	}

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif /* ifdef SO_TIMESTAMP */
	pktinfop = NULL;

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);

		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_PKTINFO) {
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memmove(&dev->pktinfo, pktinfop,
				sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			}
			goto next;
		}

#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET &&
		    cmsgp->cmsg_type == SCM_TIMESTAMP) {
			struct timeval tv;
			timevalp = CMSG_DATA(cmsgp);
			/* Copy out via memmove: cmsg data may be unaligned. */
			memmove(&tv, timevalp, sizeof(tv));
			dev->timestamp.seconds = tv.tv_sec;
			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif /* ifdef SO_TIMESTAMP */

#ifdef IPV6_TCLASS
		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_TCLASS) {
			/* DSCP is the upper six bits of the traffic class. */
			dev->dscp = *(int *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IPV6_TCLASS */

#ifdef IP_TOS
		if (cmsgp->cmsg_level == IPPROTO_IP &&
		    (cmsgp->cmsg_type == IP_TOS
#ifdef IP_RECVTOS
		     || cmsgp->cmsg_type == IP_RECVTOS
#endif /* ifdef IP_RECVTOS */
		     ))
		{
			/* DSCP is the upper six bits of the TOS octet. */
			dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IP_TOS */
	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */
}
1181
1182 /*
1183 * Construct an iov array and attach it to the msghdr passed in. This is
1184 * the SEND constructor, which will use the used region of the buffer
1185 * (if using a buffer list) or will use the internal region (if a single
1186 * buffer I/O is requested).
1187 *
1188 * Nothing can be NULL, and the done event must list at least one buffer
1189 * on the buffer linked list for this function to be meaningful.
1190 *
1191 * If write_countp != NULL, *write_countp will hold the number of bytes
1192 * this transaction can send.
1193 */
static void
build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
	unsigned int iovcount;
	size_t write_count;
	struct cmsghdr *cmsgp;

	memset(msg, 0, sizeof(*msg));

	/*
	 * Connected sockets must not pass a destination address to
	 * sendmsg(); unconnected ones send to dev->address.
	 */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	/* Send the unsent remainder of the region (dev->n bytes done). */
	write_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = write_count;
	iovcount = 1;

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG)

	/* Attach IPV6_PKTINFO so the reply leaves via the right interface. */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
	{
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp = (struct cmsghdr *)cmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}

#if defined(IPV6_USE_MIN_MTU)
	/* Request fragmentation to the IPv6 minimum MTU for this packet. */
	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
	{
		int use_min_mtu = 1; /* -1, 0, 1 */

		/* Append after any cmsg already placed in cmsgbuf. */
		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
	}
#endif /* if defined(IPV6_USE_MIN_MTU) */

	/* Debug aid: verify DSCP matches the configured check value. */
	if (isc_dscp_check_value > -1) {
		if (sock->type == isc_sockettype_udp) {
			INSIST((int)dev->dscp == isc_dscp_check_value);
		} else if (sock->type == isc_sockettype_tcp) {
			INSIST((int)sock->dscp == isc_dscp_check_value);
		}
	}

#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
	{
		/* DSCP occupies the upper six bits of TOS/TCLASS. */
		int dscp = (dev->dscp << 2) & 0xff;

		INSIST(dev->dscp < 0x40);

#ifdef IP_TOS
		if (sock->pf == AF_INET && sock->pktdscp) {
			/* Per-packet TOS via ancillary data. */
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IP;
			cmsgp->cmsg_type = IP_TOS;
			/*
			 * NOTE(review): cmsg_len(sizeof(char)) here but
			 * cmsg_space(sizeof(dscp)) was reserved above --
			 * the declared data length (one byte) is smaller
			 * than the space reserved (an int); presumably
			 * deliberate for IP_TOS, but worth confirming.
			 */
			cmsgp->cmsg_len = cmsg_len(sizeof(char));
			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
			/* Fall back to a socket-wide IP_TOS setting. */
			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
				       (void *)&dscp, sizeof(int)) < 0) {
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IP_TOS, %.02x)"
						 " failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* ifdef IP_TOS */
#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
		if (sock->pf == AF_INET6 && sock->pktdscp) {
			/* Per-packet traffic class via ancillary data. */
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IPV6;
			cmsgp->cmsg_type = IPV6_TCLASS;
			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
			/* Fall back to a socket-wide IPV6_TCLASS setting. */
			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IPV6_TCLASS, "
						 "%.02x) failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
		/* Zero any unused tail of the control buffer. */
		if (msg->msg_controllen != 0 &&
		    msg->msg_controllen < SENDCMSGBUFLEN) {
			memset(cmsgbuf + msg->msg_controllen, 0,
			       SENDCMSGBUFLEN - msg->msg_controllen);
		}
	}
#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
	* defined(IPV6_TCLASS)) \
	* */
#endif /* USE_CMSG */

	if (write_countp != NULL) {
		*write_countp = write_count;
	}
}
1345
1346 /*
1347 * Construct an iov array and attach it to the msghdr passed in. This is
1348 * the RECV constructor, which will use the available region of the buffer
1349 * (if using a buffer list) or will use the internal region (if a single
1350 * buffer I/O is requested).
1351 *
1352 * Nothing can be NULL, and the done event must list at least one buffer
1353 * on the buffer linked list for this function to be meaningful.
1354 *
1355 * If read_countp != NULL, *read_countp will hold the number of bytes
1356 * this transaction can receive.
1357 */
static void
build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
	unsigned int iovcount;
	size_t read_count;

	memset(msg, 0, sizeof(struct msghdr));

	/*
	 * For UDP, capture the sender's address in dev->address; for
	 * TCP the peer is fixed, so copy it from the socket instead.
	 */
	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->peer_address;
	}

	/* Receive into the unfilled remainder of the region. */
	read_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = read_count;
	iovcount = 1;

	/*
	 * If needed, set up to receive that one extra byte.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#if defined(USE_CMSG)
	/* Room for ancillary data (pktinfo, timestamp, TOS/TCLASS). */
	msg->msg_control = cmsgbuf;
	msg->msg_controllen = RECVCMSGBUFLEN;
#else /* if defined(USE_CMSG) */
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
#endif /* USE_CMSG */
	msg->msg_flags = 0;

	if (read_countp != NULL) {
		*read_countp = read_count;
	}
}
1409
1410 static void
set_dev_address(const isc_sockaddr_t * address,isc_socket_t * sock,isc_socketevent_t * dev)1411 set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
1412 isc_socketevent_t *dev) {
1413 if (sock->type == isc_sockettype_udp) {
1414 if (address != NULL) {
1415 dev->address = *address;
1416 } else {
1417 dev->address = sock->peer_address;
1418 }
1419 } else if (sock->type == isc_sockettype_tcp) {
1420 INSIST(address == NULL);
1421 dev->address = sock->peer_address;
1422 }
1423 }
1424
1425 static void
destroy_socketevent(isc_event_t * event)1426 destroy_socketevent(isc_event_t *event) {
1427 isc_socketevent_t *ev = (isc_socketevent_t *)event;
1428
1429 (ev->destroy)(event);
1430 }
1431
1432 static isc_socketevent_t *
allocate_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)1433 allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
1434 isc_taskaction_t action, void *arg) {
1435 isc_socketevent_t *ev;
1436
1437 ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
1438 action, arg, sizeof(*ev));
1439
1440 ev->result = ISC_R_UNSET;
1441 ISC_LINK_INIT(ev, ev_link);
1442 ev->region.base = NULL;
1443 ev->n = 0;
1444 ev->offset = 0;
1445 ev->attributes = 0;
1446 ev->destroy = ev->ev_destroy;
1447 ev->ev_destroy = destroy_socketevent;
1448 ev->dscp = 0;
1449
1450 return (ev);
1451 }
1452
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debug helper: print every field of a struct msghdr (destination
 * name, iovec array, and control buffer) to stdout.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) {
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base,
		       (long)msg->msg_iov[i].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
#endif /* if defined(ISC_SOCKET_DEBUG) */
1469
1470 #define DOIO_SUCCESS 0 /* i/o ok, event sent */
1471 #define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */
1472 #define DOIO_HARD 2 /* i/o error, event sent */
1473 #define DOIO_EOF 3 /* EOF, no event sent */
1474
/*
 * Perform the actual recvmsg() on 'sock' for the pending request 'dev'.
 *
 * Returns:
 *	DOIO_SUCCESS	enough data was read; dev->result is ISC_R_SUCCESS
 *	DOIO_SOFT	soft error, short read, or dropped packet; the
 *			operation should be retried, no event is sent
 *	DOIO_HARD	hard I/O error; dev->result holds the error
 *	DOIO_EOF	zero-length read on a stream (TCP/UNIX) socket
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later calls may clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * On a connected socket these errors are fatal for the request; on an
 * unconnected (wildcard) socket they are transient and the request is
 * retried.
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (recv_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (recv_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * Older operating systems may still return EPROTO in some
		 * situations, for example when receiving ICMP/ICMPv6 errors.
		 * A real life scenario is when ICMPv6 returns code 5 or 6.
		 * These codes are introduced in RFC 4443 from March 2006,
		 * and the document obsoletes RFC 1885. But unfortunately not
		 * all operating systems have caught up with the new standard
		 * (in 2020) and thus a generic protocol error is returned.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Anything else is a hard error for this request. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Source port 0 is never legitimate; drop silently. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp) {
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection. If we received MORE bytes than we should,
	 * this indicates an overflow situation. Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1636
1637 /*
1638 * Returns:
1639 * DOIO_SUCCESS The operation succeeded. dev->result contains
1640 * ISC_R_SUCCESS.
1641 *
1642 * DOIO_HARD A hard or unexpected I/O error was encountered.
1643 * dev->result contains the appropriate error.
1644 *
1645 * DOIO_SOFT A soft I/O error was encountered. No senddone
1646 * event was sent. The operation should be retried.
1647 *
1648 * No other return values are possible.
1649 */
1650 static int
doio_send(isc_socket_t * sock,isc_socketevent_t * dev)1651 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1652 int cc;
1653 struct iovec iov[MAXSCATTERGATHER_SEND];
1654 size_t write_count;
1655 struct msghdr msghdr;
1656 char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1657 int attempts = 0;
1658 int send_errno;
1659 char strbuf[ISC_STRERRORSIZE];
1660 char cmsgbuf[SENDCMSGBUFLEN] = { 0 };
1661
1662 build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1663
1664 resend:
1665 if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
1666 write_count > sock->manager->maxudp)
1667 {
1668 cc = write_count;
1669 } else {
1670 cc = sendmsg(sock->fd, &msghdr, 0);
1671 }
1672 send_errno = errno;
1673
1674 /*
1675 * Check for error or block condition.
1676 */
1677 if (cc < 0) {
1678 if (send_errno == EINTR && ++attempts < NRETRIES) {
1679 goto resend;
1680 }
1681
1682 if (SOFT_ERROR(send_errno)) {
1683 if (errno == EWOULDBLOCK || errno == EAGAIN) {
1684 dev->result = ISC_R_WOULDBLOCK;
1685 }
1686 return (DOIO_SOFT);
1687 }
1688
1689 #define SOFT_OR_HARD(_system, _isc) \
1690 if (send_errno == _system) { \
1691 if (sock->connected) { \
1692 dev->result = _isc; \
1693 inc_stats(sock->manager->stats, \
1694 sock->statsindex[STATID_SENDFAIL]); \
1695 return (DOIO_HARD); \
1696 } \
1697 return (DOIO_SOFT); \
1698 }
1699 #define ALWAYS_HARD(_system, _isc) \
1700 if (send_errno == _system) { \
1701 dev->result = _isc; \
1702 inc_stats(sock->manager->stats, \
1703 sock->statsindex[STATID_SENDFAIL]); \
1704 return (DOIO_HARD); \
1705 }
1706
1707 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1708 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1709 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1710 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1711 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1712 #ifdef EHOSTDOWN
1713 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1714 #endif /* ifdef EHOSTDOWN */
1715 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1716 SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
1717 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1718 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1719 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1720
1721 #undef SOFT_OR_HARD
1722 #undef ALWAYS_HARD
1723
1724 /*
1725 * The other error types depend on whether or not the
1726 * socket is UDP or TCP. If it is UDP, some errors
1727 * that we expect to be fatal under TCP are merely
1728 * annoying, and are really soft errors.
1729 *
1730 * However, these soft errors are still returned as
1731 * a status.
1732 */
1733 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1734 strerror_r(send_errno, strbuf, sizeof(strbuf));
1735 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1736 addrbuf, strbuf);
1737 dev->result = isc__errno2result(send_errno);
1738 inc_stats(sock->manager->stats,
1739 sock->statsindex[STATID_SENDFAIL]);
1740 return (DOIO_HARD);
1741 }
1742
1743 if (cc == 0) {
1744 inc_stats(sock->manager->stats,
1745 sock->statsindex[STATID_SENDFAIL]);
1746 UNEXPECTED_ERROR(__FILE__, __LINE__,
1747 "doio_send: send() returned 0");
1748 }
1749
1750 /*
1751 * If we write less than we expected, update counters, poke.
1752 */
1753 dev->n += cc;
1754 if ((size_t)cc != write_count) {
1755 return (DOIO_SOFT);
1756 }
1757
1758 /*
1759 * Exactly what we wanted to write. We're done with this
1760 * entry. Post its completion event.
1761 */
1762 dev->result = ISC_R_SUCCESS;
1763 return (DOIO_SUCCESS);
1764 }
1765
1766 /*
1767 * Kill.
1768 *
1769 * Caller must ensure that the socket is not locked and no external
1770 * references exist.
1771 */
/*
 * Hand 'fd' over to its watcher thread for closing (via the
 * CLOSE_PENDING state and a SELECT_POKE_CLOSE poke) and update the
 * socket/manager statistics.
 */
static void
socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) {
	int lockid = FDLOCK_ID(fd);
	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&thread->fdlock[lockid]);
	thread->fds[fd] = NULL;
	/* The watcher thread performs the actual close(). */
	thread->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&thread->fdlock[lockid]);
	select_poke(thread->manager, thread->threadid, fd, SELECT_POKE_CLOSE);

	inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);

	LOCK(&sock->lock);
	if (sock->active == 1) {
		dec_stats(thread->manager->stats,
			  sock->statsindex[STATID_ACTIVE]);
		sock->active = 0;
	}
	UNLOCK(&sock->lock);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&thread->manager->lock);
	if (thread->maxfd == fd) {
		int i;

		/* Scan downward for the next highest MANAGED fd. */
		thread->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&thread->fdlock[lockid]);
			if (thread->fdstate[i] == MANAGED) {
				thread->maxfd = i;
				UNLOCK(&thread->fdlock[lockid]);
				break;
			}
			UNLOCK(&thread->fdlock[lockid]);
		}
		/* Never let maxfd drop below the internal poke pipe. */
		if (thread->maxfd < thread->pipe_fds[0]) {
			thread->maxfd = thread->pipe_fds[0];
		}
	}

	UNLOCK(&thread->manager->lock);
#endif /* USE_SELECT */
}
1824
/*
 * Tear down a socket whose reference count has reached zero: hand its
 * descriptor (if any) to the watcher for closing, unlink it from the
 * manager's list, and release its memory.
 */
static void
destroy(isc_socket_t **sockp) {
	int fd = 0;
	isc_socket_t *sock = *sockp;
	isc_socketmgr_t *manager = sock->manager;
	isc__socketthread_t *thread = NULL;

	socket_log(sock, NULL, CREATION, "destroying");

	isc_refcount_destroy(&sock->references);

	LOCK(&sock->lock);
	/* All queued I/O must have been drained before destruction. */
	INSIST(ISC_LIST_EMPTY(sock->connect_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);

	if (sock->fd >= 0) {
		/* Detach the descriptor from the socket under the lock. */
		fd = sock->fd;
		thread = &manager->threads[sock->threadid];
		sock->fd = -1;
		sock->threadid = -1;
	}
	UNLOCK(&sock->lock);

	/*
	 * NOTE(review): 'fd' was initialized to 0, so a socket whose
	 * descriptor is literally 0 would never reach socketclose() --
	 * presumably fd 0 cannot occur here; verify.
	 */
	if (fd > 0) {
		socketclose(thread, sock, fd);
	}

	LOCK(&manager->lock);

	ISC_LIST_UNLINK(manager->socklist, sock, link);

	/* Wake a manager thread waiting in shutdown for the list to empty. */
	if (ISC_LIST_EMPTY(manager->socklist)) {
		SIGNAL(&manager->shutdown_ok);
	}

	/* can't unlock manager as its memory context is still used */
	free_socket(sockp);

	UNLOCK(&manager->lock);
}
1868
1869 static isc_result_t
allocate_socket(isc_socketmgr_t * manager,isc_sockettype_t type,isc_socket_t ** socketp)1870 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1871 isc_socket_t **socketp) {
1872 isc_socket_t *sock;
1873
1874 sock = isc_mem_get(manager->mctx, sizeof(*sock));
1875
1876 sock->magic = 0;
1877 isc_refcount_init(&sock->references, 0);
1878
1879 sock->manager = manager;
1880 sock->type = type;
1881 sock->fd = -1;
1882 sock->threadid = -1;
1883 sock->dscp = 0; /* TOS/TCLASS is zero until set. */
1884 sock->statsindex = NULL;
1885 sock->active = 0;
1886
1887 ISC_LINK_INIT(sock, link);
1888
1889 memset(sock->name, 0, sizeof(sock->name));
1890 sock->tag = NULL;
1891
1892 /*
1893 * Set up list of readers and writers to be initially empty.
1894 */
1895 ISC_LIST_INIT(sock->recv_list);
1896 ISC_LIST_INIT(sock->send_list);
1897 ISC_LIST_INIT(sock->accept_list);
1898 ISC_LIST_INIT(sock->connect_list);
1899
1900 sock->listener = 0;
1901 sock->connected = 0;
1902 sock->connecting = 0;
1903 sock->bound = 0;
1904 sock->pktdscp = 0;
1905
1906 /*
1907 * Initialize the lock.
1908 */
1909 isc_mutex_init(&sock->lock);
1910
1911 sock->magic = SOCKET_MAGIC;
1912 *socketp = sock;
1913
1914 return (ISC_R_SUCCESS);
1915 }
1916
1917 /*
1918 * This event requires that the various lists be empty, that the reference
1919 * count be 1, and that the magic number is valid. The other socket bits,
1920 * like the lock, must be initialized as well. The fd associated must be
1921 * marked as closed, by setting it to -1 on close, or this routine will
1922 * also close the socket.
1923 */
static void
free_socket(isc_socket_t **socketp) {
	isc_socket_t *sock = *socketp;
	*socketp = NULL;

	INSIST(VALID_SOCKET(sock));
	isc_refcount_destroy(&sock->references);
	LOCK(&sock->lock);
	/* Sanity: no pending work and no list membership remain. */
	INSIST(!sock->connecting);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->connect_list));
	INSIST(!ISC_LINK_LINKED(sock, link));
	UNLOCK(&sock->lock);

	/* Invalidate the magic number before releasing the memory. */
	sock->magic = 0;

	isc_mutex_destroy(&sock->lock);

	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
}
1946
#if defined(SET_RCVBUF)
static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
static int rcvbuf = ISC_RECV_BUFFER_SIZE;

/*
 * Probe the largest SO_RCVBUF value the kernel will accept, up to
 * ISC_RECV_BUFFER_SIZE, and leave the result in 'rcvbuf'.  Run once
 * via rcvbuf_once.  Uses a throwaway UDP socket; when setsockopt()
 * fails with ENOBUFS, a binary search between the kernel's current
 * value ('min') and the requested value narrows in on the maximum.
 */
static void
set_rcvbuf(void) {
	int fd;
	int max = rcvbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 support; try an IPv6 socket instead. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
	    min < rcvbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
			       sizeof(rcvbuf)) == -1)
		{
			if (errno == ENOBUFS && rcvbuf > min) {
				/* Too big: halve the search interval. */
				max = rcvbuf - 1;
				rcvbuf = (rcvbuf + min) / 2;
				goto again;
			} else {
				rcvbuf = min;
				goto cleanup;
			}
		} else {
			min = rcvbuf;
		}
		if (min != max) {
			rcvbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_RCVBUF) */
2004
#if defined(SET_SNDBUF)
static isc_once_t sndbuf_once = ISC_ONCE_INIT;
static int sndbuf = ISC_SEND_BUFFER_SIZE;

/*
 * Probe the largest SO_SNDBUF value the kernel will accept, up to
 * ISC_SEND_BUFFER_SIZE, and leave the result in 'sndbuf'.  Run once
 * via sndbuf_once.  Same binary-search strategy as set_rcvbuf().
 */
static void
set_sndbuf(void) {
	int fd;
	int max = sndbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 support; try an IPv6 socket instead. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
	    min < sndbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno == ENOBUFS && sndbuf > min) {
				/* Too big: halve the search interval. */
				max = sndbuf - 1;
				sndbuf = (sndbuf + min) / 2;
				goto again;
			} else {
				sndbuf = min;
				goto cleanup;
			}
		} else {
			min = sndbuf;
		}
		if (min != max) {
			sndbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_SNDBUF) */
2062
/*
 * Configure an IPv6 socket to use the minimum MTU (1280 octets),
 * via IPV6_USE_MIN_MTU and/or IPV6_MTU where available.  No-op for
 * non-IPv6 sockets; setsockopt() failures are ignored (best effort).
 */
static void
use_min_mtu(isc_socket_t *sock) {
#if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
	UNUSED(sock);
#endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
#ifdef IPV6_USE_MIN_MTU
	/* use minimum MTU */
	if (sock->pf == AF_INET6) {
		int on = 1;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				 (void *)&on, sizeof(on));
	}
#endif /* ifdef IPV6_USE_MIN_MTU */
#if defined(IPV6_MTU)
	/*
	 * Cap the socket's path MTU at 1280, the IPv6 minimum.
	 */
	if (sock->pf == AF_INET6) {
		int mtu = 1280;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
				 sizeof(mtu));
	}
#endif /* if defined(IPV6_MTU) */
}
2087
/*
 * Clamp the TCP maximum segment size of 'sock' to 'size' octets.
 * Best-effort (errors ignored); a no-op for non-TCP sockets or on
 * platforms without TCP_MAXSEG.
 */
static void
set_tcp_maxseg(isc_socket_t *sock, int size) {
#ifdef TCP_MAXSEG
	if (sock->type == isc_sockettype_tcp) {
		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
				 (void *)&size, sizeof(size));
	}
#endif /* ifdef TCP_MAXSEG */
}
2097
/*
 * Best-effort disabling of path-MTU discovery on 'sock': clear the
 * DONTFRAG flag (value 0 permits fragmentation) and, where available,
 * set the *_MTU_DISCOVER option to IP_PMTUDISC_OMIT.  All setsockopt()
 * errors are deliberately ignored.
 */
static void
set_ip_disable_pmtud(isc_socket_t *sock) {
	/*
	 * Disable Path MTU Discover on IP packets
	 */
	if (sock->pf == AF_INET6) {
#if defined(IPV6_DONTFRAG)
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
				 &(int){ 0 }, sizeof(int));
#endif
#if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
#endif
	} else if (sock->pf == AF_INET) {
#if defined(IP_DONTFRAG)
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, &(int){ 0 },
				 sizeof(int));
#endif
#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
#endif
	}
}
2123
/*
 * Create and configure the OS-level descriptor for 'sock' according to
 * sock->type and sock->pf: retry socket() on EINTR, dup the descriptor
 * out of the stdio range, make it non-blocking, then apply per-type
 * options (SIGPIPE suppression, timestamping, pktinfo, probed buffer
 * sizes, minimum MTU, TOS/TCLASS, PMTUD disabling).
 *
 * Returns ISC_R_SUCCESS with the descriptor in sock->fd; on failure
 * bumps the OPENFAIL statistics counter and returns an appropriate
 * isc_result_t.
 */
static isc_result_t
opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
	int on = 1;
#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
	socklen_t optlen;
	int size = 0;
#endif

again:
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
		break;
	case isc_sockettype_tcp:
		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
		break;
	case isc_sockettype_unix:
		sock->fd = socket(sock->pf, SOCK_STREAM, 0);
		break;
	case isc_sockettype_raw:
		errno = EPFNOSUPPORT;
		/*
		 * PF_ROUTE is a alias for PF_NETLINK on linux.
		 */
#if defined(PF_ROUTE)
		/*
		 * NOTE(review): relies on sock->fd being -1 on entry —
		 * presumably initialized by allocate_socket(); confirm.
		 */
		if (sock->fd == -1 && sock->pf == PF_ROUTE) {
#ifdef NETLINK_ROUTE
			sock->fd = socket(sock->pf, SOCK_RAW, NETLINK_ROUTE);
#else /* ifdef NETLINK_ROUTE */
			sock->fd = socket(sock->pf, SOCK_RAW, 0);
#endif /* ifdef NETLINK_ROUTE */
			if (sock->fd != -1) {
#ifdef NETLINK_ROUTE
				struct sockaddr_nl sa;
				int n;

				/*
				 * Do an implicit bind.
				 */
				memset(&sa, 0, sizeof(sa));
				sa.nl_family = AF_NETLINK;
				sa.nl_groups = RTMGRP_IPV4_IFADDR |
					       RTMGRP_IPV6_IFADDR;
				n = bind(sock->fd, (struct sockaddr *)&sa,
					 sizeof(sa));
				if (n < 0) {
					close(sock->fd);
					sock->fd = -1;
				}
#endif /* ifdef NETLINK_ROUTE */
				/*
				 * NOTE(review): 'bound' is set even when the
				 * bind above failed and fd was reset to -1;
				 * the -1 fd is caught below, but confirm this
				 * stale flag is harmless.
				 */
				sock->bound = 1;
			}
		}
#endif /* if defined(PF_ROUTE) */
		break;
	}
	/* Retry a bounded number of times if interrupted by a signal. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
		goto again;
	}

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved)
	{
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
		/* Preserve fcntl()'s errno across the close(). */
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "socket: file descriptor exceeds limit (%d/%u)",
			      sock->fd, manager->maxsocks);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strbuf);
			/* fallthrough */
		case ENOBUFS:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
					 err, strbuf);
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_UNEXPECTED);
		}
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (result);
	}

#ifdef SO_NOSIGPIPE
	/* Suppress SIGPIPE on writes to a closed peer (BSD-style). */
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
		       sizeof(on)) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
				 sock->fd, strbuf);
		/* Press on... */
	}
#endif /* ifdef SO_NOSIGPIPE */

	/*
	 * Use minimum mtu if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
	if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		/* Ask for kernel receive timestamps via control messages. */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
			       sizeof(on)) < 0 &&
		    errno != ENOPROTOOPT)
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) failed: "
					 "%s",
					 sock->fd, strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "failed: %s",
					 sock->fd, strbuf);
		}
#else /* ifdef IPV6_RECVPKTINFO */
		/* RFC 2292 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* defined(USE_CMSG) */

#if defined(SET_RCVBUF)
		/* Grow SO_RCVBUF toward the probed maximum (set_rcvbuf). */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < rcvbuf)
		{
			RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_RCVBUF, "
						 "%d) failed: %s",
						 sock->fd, rcvbuf, strbuf);
			}
		}
#endif /* if defined(SET_RCVBUF) */

#if defined(SET_SNDBUF)
		/* Grow SO_SNDBUF toward the probed maximum (set_sndbuf). */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < sndbuf)
		{
			RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
				       (void *)&sndbuf, sizeof(sndbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_SNDBUF, "
						 "%d) failed: %s",
						 sock->fd, sndbuf, strbuf);
			}
		}
#endif /* if defined(SET_SNDBUF) */
	}
#ifdef IPV6_RECVTCLASS
	if ((sock->pf == AF_INET6) &&
	    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVTCLASS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IPV6_RECVTCLASS */
#ifdef IP_RECVTOS
	if ((sock->pf == AF_INET) &&
	    (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IP_RECVTOS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IP_RECVTOS */
#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */

	set_ip_disable_pmtud(sock);

	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
	if (sock->active == 0) {
		inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}
2407
/*
 * Create a 'type' socket, managed by 'manager'. Events will be posted to
 * 'task' and when dispatched 'action' will be called with 'arg' as the arg
 * value. The new socket is returned in 'socketp'.
 *
 * The new socket gets one reference, a statistics index matching its
 * family/type, and is registered with a watcher thread chosen by
 * gen_threadid() and appended to the manager's socket list.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp) {
	isc_socket_t *sock = NULL;
	isc__socketthread_t *thread;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/* Pick the statistics counter set for this family and type. */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex = (pf == AF_INET) ? udp4statsindex
						   : udp6statsindex;
/* True when the platform can set DSCP per packet for this family. */
#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
		break;
	case isc_sockettype_tcp:
		sock->statsindex = (pf == AF_INET) ? tcp4statsindex
						   : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	case isc_sockettype_raw:
		sock->statsindex = rawstatsindex;
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	sock->pf = pf;

	result = opensocket(manager, sock);
	if (result != ISC_R_SUCCESS) {
		free_socket(&sock);
		return (result);
	}

	/* opensocket() success guarantees a usable descriptor. */
	if (sock->fd == -1) {
		abort();
	}
	sock->threadid = gen_threadid(sock);
	isc_refcount_increment0(&sock->references);
	thread = &manager->threads[sock->threadid];
	*socketp = sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the fd with its watcher thread's bookkeeping. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
	thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (thread->maxfd < sock->fd) {
		thread->maxfd = sock->fd;
	}
#endif /* ifdef USE_SELECT */
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION, "created");

	return (ISC_R_SUCCESS);
}
2498
2499 /*%
2500 * Create a new 'type' socket managed by 'manager'. Events
2501 * will be posted to 'task' and when dispatched 'action' will be
2502 * called with 'arg' as the arg value. The new socket is returned
2503 * in 'socketp'.
2504 */
2505 isc_result_t
isc_socket_create(isc_socketmgr_t * manager,int pf,isc_sockettype_t type,isc_socket_t ** socketp)2506 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2507 isc_socket_t **socketp) {
2508 return (socket_create(manager, pf, type, socketp));
2509 }
2510
/*
 * (Re)open the OS descriptor for an existing socket object that
 * currently has none (fd == -1, no watcher thread), then register the
 * new descriptor with a watcher thread, mirroring socket_create().
 */
isc_result_t
isc_socket_open(isc_socket_t *sock) {
	isc_result_t result;
	isc__socketthread_t *thread;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(isc_refcount_current(&sock->references) >= 1);
	REQUIRE(sock->fd == -1);
	REQUIRE(sock->threadid == -1);

	result = opensocket(sock->manager, sock);

	UNLOCK(&sock->lock);

	if (result != ISC_R_SUCCESS) {
		/*
		 * NOTE(review): written after UNLOCK — presumably safe
		 * because the caller holds the only usable reference;
		 * confirm no concurrent access is possible here.
		 */
		sock->fd = -1;
	} else {
		sock->threadid = gen_threadid(sock);
		thread = &sock->manager->threads[sock->threadid];
		int lockid = FDLOCK_ID(sock->fd);

		/* Register the fd with the chosen watcher thread. */
		LOCK(&thread->fdlock[lockid]);
		thread->fds[sock->fd] = sock;
		thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
		thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
		INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
		       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
		UNLOCK(&thread->fdlock[lockid]);

#ifdef USE_SELECT
		LOCK(&sock->manager->lock);
		if (thread->maxfd < sock->fd) {
			thread->maxfd = sock->fd;
		}
		UNLOCK(&sock->manager->lock);
#endif /* ifdef USE_SELECT */
	}

	return (result);
}
2558
2559 /*
2560 * Attach to a socket. Caller must explicitly detach when it is done.
2561 */
2562 void
isc_socket_attach(isc_socket_t * sock,isc_socket_t ** socketp)2563 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2564 REQUIRE(VALID_SOCKET(sock));
2565 REQUIRE(socketp != NULL && *socketp == NULL);
2566
2567 int old_refs = isc_refcount_increment(&sock->references);
2568 REQUIRE(old_refs > 0);
2569
2570 *socketp = sock;
2571 }
2572
2573 /*
2574 * Dereference a socket. If this is the last reference to it, clean things
2575 * up by destroying the socket.
2576 */
2577 void
isc_socket_detach(isc_socket_t ** socketp)2578 isc_socket_detach(isc_socket_t **socketp) {
2579 isc_socket_t *sock;
2580
2581 REQUIRE(socketp != NULL);
2582 sock = *socketp;
2583 REQUIRE(VALID_SOCKET(sock));
2584 if (isc_refcount_decrement(&sock->references) == 1) {
2585 destroy(&sock);
2586 }
2587
2588 *socketp = NULL;
2589 }
2590
/*
 * Close the descriptor of an open, idle socket (no queued I/O, not
 * connecting) and reset its per-connection state so the object can be
 * reused with isc_socket_open().  The descriptor itself is handed to
 * socketclose() after the socket lock is dropped.
 */
isc_result_t
isc_socket_close(isc_socket_t *sock) {
	int fd;
	isc_socketmgr_t *manager;
	isc__socketthread_t *thread;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);

	/* Refuse to close a socket with outstanding work queued. */
	INSIST(!sock->connecting);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->connect_list));

	/* Detach the fd from the socket before releasing the lock. */
	manager = sock->manager;
	thread = &manager->threads[sock->threadid];
	fd = sock->fd;
	sock->fd = -1;
	sock->threadid = -1;

	/* Reset per-connection state so the object can be reopened. */
	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;
	isc_sockaddr_any(&sock->peer_address);

	UNLOCK(&sock->lock);

	socketclose(thread, sock, fd);

	return (ISC_R_SUCCESS);
}
2629
2630 /*
2631 * Dequeue an item off the given socket's read queue, set the result code
2632 * in the done event to the one provided, and send it to the task it was
2633 * destined for.
2634 *
2635 * If the event to be sent is on a list, remove it before sending. If
2636 * asked to, send and detach from the socket as well.
2637 *
2638 * Caller must have the socket locked if the event is attached to the socket.
2639 */
2640 static void
send_recvdone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2641 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2642 isc_task_t *task;
2643
2644 task = (*dev)->ev_sender;
2645
2646 (*dev)->ev_sender = sock;
2647
2648 if (ISC_LINK_LINKED(*dev, ev_link)) {
2649 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2650 }
2651
2652 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2653 isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2654 sock->threadid);
2655 } else {
2656 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2657 }
2658 }
2659
2660 /*
2661 * See comments for send_recvdone_event() above.
2662 *
2663 * Caller must have the socket locked if the event is attached to the socket.
2664 */
2665 static void
send_senddone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2666 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2667 isc_task_t *task;
2668
2669 INSIST(dev != NULL && *dev != NULL);
2670
2671 task = (*dev)->ev_sender;
2672 (*dev)->ev_sender = sock;
2673
2674 if (ISC_LINK_LINKED(*dev, ev_link)) {
2675 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2676 }
2677
2678 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2679 isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2680 sock->threadid);
2681 } else {
2682 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2683 }
2684 }
2685
2686 /*
2687 * See comments for send_recvdone_event() above.
2688 *
2689 * Caller must have the socket locked if the event is attached to the socket.
2690 */
2691 static void
send_connectdone_event(isc_socket_t * sock,isc_socket_connev_t ** dev)2692 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) {
2693 isc_task_t *task;
2694
2695 INSIST(dev != NULL && *dev != NULL);
2696
2697 task = (*dev)->ev_sender;
2698 (*dev)->ev_sender = sock;
2699
2700 if (ISC_LINK_LINKED(*dev, ev_link)) {
2701 ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
2702 }
2703
2704 isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
2705 }
2706
/*
 * Call accept() on a socket, to get the new file descriptor. The listen
 * socket is used as a prototype to create a new isc_socket_t. The new
 * socket has one outstanding reference. The task receiving the event
 * will be detached from just after the event is delivered.
 *
 * On entry to this function, the event delivered is the internal
 * readable event, and the first item on the accept_list should be
 * the done event we want to send. If the list is empty, this is a no-op,
 * so just unlock and return.
 *
 * Called with sock->lock held; this function releases it on all paths.
 */
static void
internal_accept(isc_socket_t *sock) {
	isc_socketmgr_t *manager;
	isc__socketthread_t *thread, *nthread;
	isc_socket_newconnev_t *dev;
	isc_task_t *task;
	socklen_t addrlen;
	int fd;
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "accept";

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");

	manager = sock->manager;
	INSIST(VALID_MANAGER(manager));
	thread = &manager->threads[sock->threadid];

	INSIST(sock->listener);

	/*
	 * Get the first item off the accept list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->accept_list);
	if (dev == NULL) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
		UNLOCK(&sock->lock);
		return;
	}

	/*
	 * Try to accept the new connection. If the accept fails with
	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
	 * again. Also ignore ECONNRESET, which has been reported to
	 * be spuriously returned on Linux 2.2.19 although it is not
	 * a documented error for accept(). ECONNABORTED has been
	 * reported for Solaris 8. The rest are thrown in not because
	 * we have seen them but because they are ignored by other
	 * daemons such as BIND 8 and Apache.
	 */

	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
		    (void *)&addrlen);

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio to work in.
	 */
	if (fd >= 0 && fd < 20) {
		int newfd, tmp;
		newfd = fcntl(fd, F_DUPFD, 20);
		/* Preserve fcntl()'s errno across the close(). */
		tmp = errno;
		(void)close(fd);
		errno = tmp;
		fd = newfd;
		err = "accept/fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (fd < 0) {
		if (SOFT_ERROR(errno)) {
			goto soft_error;
		}
		switch (errno) {
		case ENFILE:
		case EMFILE:
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: too many open file descriptors",
				      err);
			goto soft_error;

		/* Transient network/resource errors: retry later. */
		case ENOBUFS:
		case ENOMEM:
		case ECONNRESET:
		case ECONNABORTED:
		case EHOSTUNREACH:
		case EHOSTDOWN:
		case ENETUNREACH:
		case ENETDOWN:
		case ECONNREFUSED:
#ifdef EPROTO
		case EPROTO:
#endif /* ifdef EPROTO */
#ifdef ENONET
		case ENONET:
#endif /* ifdef ENONET */
			goto soft_error;
		default:
			break;
		}
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: %s() failed: %s", err,
				 strbuf);
		fd = -1;
		result = ISC_R_UNEXPECTED;
	} else {
		/* Sanity-check what accept() handed back. */
		if (addrlen == 0U) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() failed to return "
					 "remote address");

			(void)close(fd);
			goto soft_error;
		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
			   sock->pf) {
			UNEXPECTED_ERROR(
				__FILE__, __LINE__,
				"internal_accept(): "
				"accept() returned peer address "
				"family %u (expected %u)",
				NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
				sock->pf);
			(void)close(fd);
			goto soft_error;
		} else if (fd >= (int)manager->maxsocks) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "accept: file descriptor exceeds limit "
				      "(%d/%u)",
				      fd, manager->maxsocks);
			(void)close(fd);
			goto soft_error;
		}
	}

	if (fd != -1) {
		NEWCONNSOCK(dev)->peer_address.length = addrlen;
		NEWCONNSOCK(dev)->pf = sock->pf;
	}

	/*
	 * Pull off the done event.
	 */
	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);

	/*
	 * Poke watcher if there are more pending accepts.
	 */
	if (ISC_LIST_EMPTY(sock->accept_list)) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	}

	if (fd != -1) {
		result = make_nonblock(fd);
		if (result != ISC_R_SUCCESS) {
			(void)close(fd);
			fd = -1;
		}
	}

	/*
	 * We need to unlock sock->lock now to be able to lock manager->lock
	 * without risking a deadlock with xmlstats.
	 */
	UNLOCK(&sock->lock);

	/*
	 * -1 means the new socket didn't happen.
	 */
	if (fd != -1) {
		int lockid = FDLOCK_ID(fd);

		NEWCONNSOCK(dev)->fd = fd;
		NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
		NEWCONNSOCK(dev)->bound = 1;
		NEWCONNSOCK(dev)->connected = 1;
		nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];

		/*
		 * We already hold a lock on one fdlock in accepting thread,
		 * we need to make sure that we don't double lock.
		 */
		bool same_bucket = (sock->threadid ==
				    NEWCONNSOCK(dev)->threadid) &&
				   (FDLOCK_ID(sock->fd) == lockid);

		/*
		 * Use minimum mtu if possible.
		 */
		use_min_mtu(NEWCONNSOCK(dev));
		set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);

		/*
		 * Ensure DSCP settings are inherited across accept.
		 */
		setdscp(NEWCONNSOCK(dev), sock->dscp);

		/*
		 * Save away the remote address
		 */
		dev->address = NEWCONNSOCK(dev)->peer_address;

		if (NEWCONNSOCK(dev)->active == 0) {
			inc_stats(manager->stats,
				  NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
			NEWCONNSOCK(dev)->active = 1;
		}

		/* Register the new fd with its watcher thread. */
		if (!same_bucket) {
			LOCK(&nthread->fdlock[lockid]);
		}
		nthread->fds[fd] = NEWCONNSOCK(dev);
		nthread->fdstate[fd] = MANAGED;
#if defined(USE_EPOLL)
		nthread->epoll_events[fd] = 0;
#endif /* if defined(USE_EPOLL) */
		if (!same_bucket) {
			UNLOCK(&nthread->fdlock[lockid]);
		}

		LOCK(&manager->lock);

#ifdef USE_SELECT
		if (nthread->maxfd < fd) {
			nthread->maxfd = fd;
		}
#endif /* ifdef USE_SELECT */

		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
			   "accepted connection, new socket %p",
			   dev->newsocket);

		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);

		UNLOCK(&manager->lock);

		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
	} else {
		/* No connection: tear down the pre-allocated socket. */
		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
		isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
		free_socket((isc_socket_t **)&dev->newsocket);
	}

	/*
	 * Fill in the done event details and send it off.
	 */
	dev->result = result;
	task = dev->ev_sender;
	dev->ev_sender = sock;

	isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
	return;

soft_error:
	/* Transient failure: keep listening and report the failure stat. */
	watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	UNLOCK(&sock->lock);

	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
	return;
}
2977
/*
 * Drain as many queued receive requests on 'sock' as the descriptor
 * will satisfy right now, dispatching a done event for each completed
 * request.  When the queue empties, stop watching the fd for reads.
 *
 * Called from process_fd() with sock->lock held.
 */
static void
internal_recv(isc_socket_t *sock) {
	isc_socketevent_t *dev;

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	dev = ISC_LIST_HEAD(sock->recv_list);
	if (dev == NULL) {
		goto finish;
	}

	socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
		   dev, dev->ev_sender);

	/*
	 * Try to do as much I/O as possible on this socket. There are no
	 * limits here, currently.
	 */
	while (dev != NULL) {
		switch (doio_recv(sock, dev)) {
		case DOIO_SOFT:
			/* Would block: leave the rest queued for later. */
			goto finish;

		case DOIO_EOF:
			/*
			 * read of 0 means the remote end was closed.
			 * Run through the event queue and dispatch all
			 * the events with an EOF result code.
			 */
			do {
				dev->result = ISC_R_EOF;
				send_recvdone_event(sock, &dev);
				dev = ISC_LIST_HEAD(sock->recv_list);
			} while (dev != NULL);
			goto finish;

		case DOIO_SUCCESS:
		case DOIO_HARD:
			/* Completed (or hard-failed): report and move on. */
			send_recvdone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->recv_list);
	}

finish:
	if (ISC_LIST_EMPTY(sock->recv_list)) {
		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
			   SELECT_POKE_READ);
	}
}
3030
/*
 * Drain as many queued send requests on 'sock' as the descriptor will
 * accept right now, dispatching a done event for each completed
 * request.  When the queue empties, stop watching the fd for writes.
 *
 * Called from process_fd() with sock->lock held.
 */
static void
internal_send(isc_socket_t *sock) {
	isc_socketevent_t *dev;

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	dev = ISC_LIST_HEAD(sock->send_list);
	if (dev == NULL) {
		goto finish;
	}
	socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev,
		   dev->ev_sender);

	/*
	 * Try to do as much I/O as possible on this socket. There are no
	 * limits here, currently.
	 */
	while (dev != NULL) {
		switch (doio_send(sock, dev)) {
		case DOIO_SOFT:
			/* Would block: leave the rest queued for later. */
			goto finish;

		case DOIO_HARD:
		case DOIO_SUCCESS:
			/* Completed (or hard-failed): report and move on. */
			send_senddone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->send_list);
	}

finish:
	if (ISC_LIST_EMPTY(sock->send_list)) {
		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
			   SELECT_POKE_WRITE);
	}
}
3069
/*
 * Process read/writes on each fd here. Avoid locking
 * and unlocking twice if both reads and writes are possible.
 *
 * Takes thread->fdlock[] then sock->lock; internal_accept() releases
 * sock->lock itself, all other paths release it here.
 */
static void
process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
	isc_socket_t *sock;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&thread->fdlock[lockid]);

		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		return;
	}

	sock = thread->fds[fd];
	if (sock == NULL) {
		/* fd no longer maps to a socket; spurious wakeup. */
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&sock->lock);

	if (sock->fd < 0) {
		/*
		 * Sock is being closed - the final external reference
		 * is gone but it was not yet removed from event loop
		 * and fdstate[]/fds[] as destroy() is waiting on
		 * thread->fdlock[lockid] or sock->lock that we're holding.
		 * Just release the locks and bail.
		 */
		UNLOCK(&sock->lock);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	REQUIRE(readable || writeable);
	if (writeable) {
		/* Writability signals connect completion on a
		 * connecting socket, otherwise pending sends. */
		if (sock->connecting) {
			internal_connect(sock);
		} else {
			internal_send(sock);
		}
	}

	if (readable) {
		if (sock->listener) {
			internal_accept(sock); /* unlocks sock */
		} else {
			internal_recv(sock);
			UNLOCK(&sock->lock);
		}
	} else {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&thread->fdlock[lockid]);

	/*
	 * Socket destruction might be pending, it will resume
	 * after releasing fdlock and sock->lock.
	 */
}
3139
/*
 * process_fds is different for different event loops
 * it takes the events from event loops and for each FD
 * launches process_fd
 */
#ifdef USE_KQUEUE
/*
 * kqueue(2) flavor: map each kevent to a process_fd() call; the watcher
 * control pipe is handled last.  Returns true when the control channel
 * says the watcher should stop.
 */
static bool
process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
	int i;
	bool readable, writable;
	bool done = false;
	bool have_ctlevent = false;
	if (nevents == thread->nevents) {
		/*
		 * This is not an error, but something unexpected. If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < thread->manager->maxsocks);
		/* Defer the control pipe until real sockets are done. */
		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		readable = (events[i].filter == EVFILT_READ);
		writable = (events[i].filter == EVFILT_WRITE);
		process_fd(thread, events[i].ident, readable, writable);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
3181 #elif defined(USE_EPOLL)
3182 static bool
process_fds(isc__socketthread_t * thread,struct epoll_event * events,int nevents)3183 process_fds(isc__socketthread_t *thread, struct epoll_event *events,
3184 int nevents) {
3185 int i;
3186 bool done = false;
3187 bool have_ctlevent = false;
3188
3189 if (nevents == thread->nevents) {
3190 thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3191 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3192 "maximum number of FD events (%d) received",
3193 nevents);
3194 }
3195
3196 for (i = 0; i < nevents; i++) {
3197 REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
3198 if (events[i].data.fd == thread->pipe_fds[0]) {
3199 have_ctlevent = true;
3200 continue;
3201 }
3202 if ((events[i].events & EPOLLERR) != 0 ||
3203 (events[i].events & EPOLLHUP) != 0) {
3204 /*
3205 * epoll does not set IN/OUT bits on an erroneous
3206 * condition, so we need to try both anyway. This is a
3207 * bit inefficient, but should be okay for such rare
3208 * events. Note also that the read or write attempt
3209 * won't block because we use non-blocking sockets.
3210 */
3211 int fd = events[i].data.fd;
3212 events[i].events |= thread->epoll_events[fd];
3213 }
3214 process_fd(thread, events[i].data.fd,
3215 (events[i].events & EPOLLIN) != 0,
3216 (events[i].events & EPOLLOUT) != 0);
3217 }
3218
3219 if (have_ctlevent) {
3220 done = process_ctlfd(thread);
3221 }
3222
3223 return (done);
3224 }
3225 #elif defined(USE_DEVPOLL)
3226 static bool
process_fds(isc__socketthread_t * thread,struct pollfd * events,int nevents)3227 process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
3228 int i;
3229 bool done = false;
3230 bool have_ctlevent = false;
3231
3232 if (nevents == thread->nevents) {
3233 thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3234 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3235 "maximum number of FD events (%d) received",
3236 nevents);
3237 }
3238
3239 for (i = 0; i < nevents; i++) {
3240 REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
3241 if (events[i].fd == thread->pipe_fds[0]) {
3242 have_ctlevent = true;
3243 continue;
3244 }
3245 process_fd(thread, events[i].fd,
3246 (events[i].events & POLLIN) != 0,
3247 (events[i].events & POLLOUT) != 0);
3248 }
3249
3250 if (have_ctlevent) {
3251 done = process_ctlfd(thread);
3252 }
3253
3254 return (done);
3255 }
3256 #elif defined(USE_SELECT)
3257 static void
process_fds(isc__socketthread_t * thread,int maxfd,fd_set * readfds,fd_set * writefds)3258 process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
3259 fd_set *writefds) {
3260 int i;
3261
3262 REQUIRE(maxfd <= (int)thread->manager->maxsocks);
3263
3264 for (i = 0; i < maxfd; i++) {
3265 if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
3266 continue;
3267 }
3268 process_fd(thread, i, FD_ISSET(i, readfds),
3269 FD_ISSET(i, writefds));
3270 }
3271 }
3272 #endif /* ifdef USE_KQUEUE */
3273
3274 static bool
process_ctlfd(isc__socketthread_t * thread)3275 process_ctlfd(isc__socketthread_t *thread) {
3276 int msg, fd;
3277
3278 for (;;) {
3279 select_readmsg(thread, &fd, &msg);
3280
3281 thread_log(thread, IOEVENT,
3282 "watcher got message %d for socket %d", msg, fd);
3283
3284 /*
3285 * Nothing to read?
3286 */
3287 if (msg == SELECT_POKE_NOTHING) {
3288 break;
3289 }
3290
3291 /*
3292 * Handle shutdown message. We really should
3293 * jump out of this loop right away, but
3294 * it doesn't matter if we have to do a little
3295 * more work first.
3296 */
3297 if (msg == SELECT_POKE_SHUTDOWN) {
3298 return (true);
3299 }
3300
3301 /*
3302 * This is a wakeup on a socket. Look
3303 * at the event queue for both read and write,
3304 * and decide if we need to watch on it now
3305 * or not.
3306 */
3307 wakeup_socket(thread, fd, msg);
3308 }
3309
3310 return (false);
3311 }
3312
/*
 * This is the thread that will loop forever, always in a select or poll
 * call (kevent/epoll_wait/ioctl(DP_POLL)/select depending on the build).
 *
 * When the poll returns something to do, do whatever's necessary and post
 * an event to the task that was requesting the action.
 *
 * Exits when process_fds()/process_ctlfd() reports that a
 * SELECT_POKE_SHUTDOWN message was received on the control pipe.
 */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc_socketmgr_t *manager = thread->manager;
	(void)manager; /* only referenced in the USE_SELECT build */
	bool done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here. This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	while (!done) {
		/* Inner loop: retry the poll call on soft errors (EINTR). */
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					/* Conservative fallback limit. */
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped. Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			/* Hard poll failure is unrecoverable: abort. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}
3476
/*
 * Record the number of file descriptors the manager should hold in
 * reserve (not hand out to new sockets).
 */
void
isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}
3483
/*
 * Set the manager's maximum UDP message size.  (How the limit is
 * enforced is handled in the I/O paths, not here.)
 */
void
isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
	REQUIRE(VALID_MANAGER(manager));

	manager->maxudp = maxudp;
}
3490
3491 /*
3492 * Setup socket thread, thread->manager and thread->threadid must be filled.
3493 */
3494
3495 static isc_result_t
setup_thread(isc__socketthread_t * thread)3496 setup_thread(isc__socketthread_t *thread) {
3497 isc_result_t result = ISC_R_SUCCESS;
3498 int i;
3499 char strbuf[ISC_STRERRORSIZE];
3500
3501 REQUIRE(thread != NULL);
3502 REQUIRE(VALID_MANAGER(thread->manager));
3503 REQUIRE(thread->threadid >= 0 &&
3504 thread->threadid < thread->manager->nthreads);
3505
3506 thread->fds =
3507 isc_mem_get(thread->manager->mctx,
3508 thread->manager->maxsocks * sizeof(isc_socket_t *));
3509
3510 memset(thread->fds, 0,
3511 thread->manager->maxsocks * sizeof(isc_socket_t *));
3512
3513 thread->fdstate = isc_mem_get(thread->manager->mctx,
3514 thread->manager->maxsocks * sizeof(int));
3515
3516 memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
3517
3518 thread->fdlock = isc_mem_get(thread->manager->mctx,
3519 FDLOCK_COUNT * sizeof(isc_mutex_t));
3520
3521 for (i = 0; i < FDLOCK_COUNT; i++) {
3522 isc_mutex_init(&thread->fdlock[i]);
3523 }
3524
3525 if (pipe(thread->pipe_fds) != 0) {
3526 strerror_r(errno, strbuf, sizeof(strbuf));
3527 UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
3528 strbuf);
3529 return (ISC_R_UNEXPECTED);
3530 }
3531 RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
3532
3533 #ifdef USE_KQUEUE
3534 thread->nevents = ISC_SOCKET_MAXEVENTS;
3535 thread->events = isc_mem_get(thread->manager->mctx,
3536 sizeof(struct kevent) * thread->nevents);
3537
3538 thread->kqueue_fd = kqueue();
3539 if (thread->kqueue_fd == -1) {
3540 result = isc__errno2result(errno);
3541 strerror_r(errno, strbuf, sizeof(strbuf));
3542 UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
3543 strbuf);
3544 isc_mem_put(thread->manager->mctx, thread->events,
3545 sizeof(struct kevent) * thread->nevents);
3546 return (result);
3547 }
3548
3549 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3550 if (result != ISC_R_SUCCESS) {
3551 close(thread->kqueue_fd);
3552 isc_mem_put(thread->manager->mctx, thread->events,
3553 sizeof(struct kevent) * thread->nevents);
3554 }
3555 return (result);
3556
3557 #elif defined(USE_EPOLL)
3558 thread->nevents = ISC_SOCKET_MAXEVENTS;
3559 thread->epoll_events =
3560 isc_mem_get(thread->manager->mctx,
3561 (thread->manager->maxsocks * sizeof(uint32_t)));
3562
3563 memset(thread->epoll_events, 0,
3564 thread->manager->maxsocks * sizeof(uint32_t));
3565
3566 thread->events =
3567 isc_mem_get(thread->manager->mctx,
3568 sizeof(struct epoll_event) * thread->nevents);
3569
3570 thread->epoll_fd = epoll_create(thread->nevents);
3571 if (thread->epoll_fd == -1) {
3572 result = isc__errno2result(errno);
3573 strerror_r(errno, strbuf, sizeof(strbuf));
3574 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
3575 strbuf);
3576 return (result);
3577 }
3578
3579 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3580 return (result);
3581
3582 #elif defined(USE_DEVPOLL)
3583 thread->nevents = ISC_SOCKET_MAXEVENTS;
3584 result = isc_resource_getcurlimit(isc_resource_openfiles,
3585 &thread->open_max);
3586 if (result != ISC_R_SUCCESS) {
3587 thread->open_max = 64;
3588 }
3589 thread->calls = 0;
3590 thread->events = isc_mem_get(thread->manager->mctx,
3591 sizeof(struct pollfd) * thread->nevents);
3592
3593 /*
3594 * Note: fdpollinfo should be able to support all possible FDs, so
3595 * it must have maxsocks entries (not nevents).
3596 */
3597 thread->fdpollinfo =
3598 isc_mem_get(thread->manager->mctx,
3599 sizeof(pollinfo_t) * thread->manager->maxsocks);
3600 memset(thread->fdpollinfo, 0,
3601 sizeof(pollinfo_t) * thread->manager->maxsocks);
3602 thread->devpoll_fd = open("/dev/poll", O_RDWR);
3603 if (thread->devpoll_fd == -1) {
3604 result = isc__errno2result(errno);
3605 strerror_r(errno, strbuf, sizeof(strbuf));
3606 UNEXPECTED_ERROR(__FILE__, __LINE__,
3607 "open(/dev/poll) failed: %s", strbuf);
3608 isc_mem_put(thread->manager->mctx, thread->events,
3609 sizeof(struct pollfd) * thread->nevents);
3610 isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3611 sizeof(pollinfo_t) * thread->manager->maxsocks);
3612 return (result);
3613 }
3614 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3615 if (result != ISC_R_SUCCESS) {
3616 close(thread->devpoll_fd);
3617 isc_mem_put(thread->manager->mctx, thread->events,
3618 sizeof(struct pollfd) * thread->nevents);
3619 isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3620 sizeof(pollinfo_t) * thread->manager->maxsocks);
3621 return (result);
3622 }
3623
3624 return (ISC_R_SUCCESS);
3625 #elif defined(USE_SELECT)
3626 UNUSED(result);
3627
3628 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3629 /*
3630 * Note: this code should also cover the case of MAXSOCKETS <=
3631 * FD_SETSIZE, but we separate the cases to avoid possible portability
3632 * issues regarding howmany() and the actual representation of fd_set.
3633 */
3634 thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3635 sizeof(fd_mask);
3636 #else /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3637 thread->fd_bufsize = sizeof(fd_set);
3638 #endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3639
3640 thread->read_fds = isc_mem_get(thread->manager->mctx,
3641 thread->fd_bufsize);
3642 thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
3643 thread->fd_bufsize);
3644 thread->write_fds = isc_mem_get(thread->manager->mctx,
3645 thread->fd_bufsize);
3646 thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
3647 thread->fd_bufsize);
3648 memset(thread->read_fds, 0, thread->fd_bufsize);
3649 memset(thread->write_fds, 0, thread->fd_bufsize);
3650
3651 (void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3652 thread->maxfd = thread->pipe_fds[0];
3653
3654 return (ISC_R_SUCCESS);
3655 #endif /* USE_KQUEUE */
3656 }
3657
/*
 * Tear down the per-thread state created by setup_thread(): unwatch the
 * control pipe, close the backend event descriptor, release the event
 * buffers, close any sockets whose close was still pending, and destroy
 * the fd lock array.
 */
static void
cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
	isc_result_t result;
	int i;

	result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		/*
		 * NOTE(review): this message names epoll_ctl() even on
		 * kqueue/devpoll/select builds; the failure is logged and
		 * otherwise ignored since we are shutting down anyway.
		 */
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
	}
#ifdef USE_KQUEUE
	close(thread->kqueue_fd);
	isc_mem_put(mctx, thread->events,
		    sizeof(struct kevent) * thread->nevents);
#elif defined(USE_EPOLL)
	close(thread->epoll_fd);

	isc_mem_put(mctx, thread->events,
		    sizeof(struct epoll_event) * thread->nevents);
#elif defined(USE_DEVPOLL)
	close(thread->devpoll_fd);
	isc_mem_put(mctx, thread->events,
		    sizeof(struct pollfd) * thread->nevents);
	isc_mem_put(mctx, thread->fdpollinfo,
		    sizeof(pollinfo_t) * thread->manager->maxsocks);
#elif defined(USE_SELECT)
	if (thread->read_fds != NULL) {
		isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
	}
	if (thread->read_fds_copy != NULL) {
		isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
	}
	if (thread->write_fds != NULL) {
		isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
	}
	if (thread->write_fds_copy != NULL) {
		isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
	}
#endif /* USE_KQUEUE */
	/* Close any fds whose close was deferred until thread shutdown. */
	for (i = 0; i < (int)thread->manager->maxsocks; i++) {
		if (thread->fdstate[i] == CLOSE_PENDING) {
			/* no need to lock */
			(void)close(i);
		}
	}

#if defined(USE_EPOLL)
	isc_mem_put(thread->manager->mctx, thread->epoll_events,
		    thread->manager->maxsocks * sizeof(uint32_t));
#endif /* if defined(USE_EPOLL) */
	isc_mem_put(thread->manager->mctx, thread->fds,
		    thread->manager->maxsocks * sizeof(isc_socket_t *));
	isc_mem_put(thread->manager->mctx, thread->fdstate,
		    thread->manager->maxsocks * sizeof(int));

	for (i = 0; i < FDLOCK_COUNT; i++) {
		isc_mutex_destroy(&thread->fdlock[i]);
	}
	isc_mem_put(thread->manager->mctx, thread->fdlock,
		    FDLOCK_COUNT * sizeof(isc_mutex_t));
}
3718
3719 isc_result_t
isc__socketmgr_create(isc_mem_t * mctx,isc_socketmgr_t ** managerp,unsigned int maxsocks,int nthreads)3720 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3721 unsigned int maxsocks, int nthreads) {
3722 int i;
3723 isc_socketmgr_t *manager;
3724
3725 REQUIRE(managerp != NULL && *managerp == NULL);
3726
3727 if (maxsocks == 0) {
3728 maxsocks = ISC_SOCKET_MAXSOCKETS;
3729 }
3730
3731 manager = isc_mem_get(mctx, sizeof(*manager));
3732
3733 /* zero-clear so that necessary cleanup on failure will be easy */
3734 memset(manager, 0, sizeof(*manager));
3735 manager->maxsocks = maxsocks;
3736 manager->reserved = 0;
3737 manager->maxudp = 0;
3738 manager->nthreads = nthreads;
3739 manager->stats = NULL;
3740
3741 manager->magic = SOCKET_MANAGER_MAGIC;
3742 manager->mctx = NULL;
3743 ISC_LIST_INIT(manager->socklist);
3744 isc_mutex_init(&manager->lock);
3745 isc_condition_init(&manager->shutdown_ok);
3746
3747 /*
3748 * Start up the select/poll thread.
3749 */
3750 manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
3751 manager->nthreads);
3752 isc_mem_attach(mctx, &manager->mctx);
3753
3754 for (i = 0; i < manager->nthreads; i++) {
3755 manager->threads[i].manager = manager;
3756 manager->threads[i].threadid = i;
3757 setup_thread(&manager->threads[i]);
3758 isc_thread_create(netthread, &manager->threads[i],
3759 &manager->threads[i].thread);
3760 char tname[1024];
3761 sprintf(tname, "isc-socket-%d", i);
3762 isc_thread_setname(manager->threads[i].thread, tname);
3763 }
3764
3765 *managerp = manager;
3766
3767 return (ISC_R_SUCCESS);
3768 }
3769
/*
 * Report the maximum number of sockets this manager supports in
 * *nsockp.  Always succeeds.
 */
isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	*nsockp = manager->maxsocks;

	return (ISC_R_SUCCESS);
}
3779
/*
 * Attach 'stats' as the manager's statistics counter set.  Must be
 * called at most once, before any sockets exist, and the counter set
 * must contain exactly isc_sockstatscounter_max counters.
 */
void
isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	isc_stats_attach(stats, &manager->stats);
}
3789
/*
 * Destroy a socket manager: block until all managed sockets have been
 * released, tell every watcher thread to shut down, join them, and free
 * all manager resources.  *managerp is set to NULL on return.
 */
void
isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc_socketmgr_t *manager;

	REQUIRE(managerp != NULL);
	manager = *managerp;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.  Each socket destruction
	 * signals shutdown_ok, so we re-check the list after every wakeup.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "sockets exist");
		WAIT(&manager->shutdown_ok, &manager->lock);
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread. Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
	}

	/*
	 * Wait for thread to exit.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		isc_thread_join(manager->threads[i].thread, NULL);
		cleanup_thread(manager->mctx, &manager->threads[i]);
	}
	/*
	 * Clean up.
	 */
	isc_mem_put(manager->mctx, manager->threads,
		    sizeof(isc__socketthread_t) * manager->nthreads);
	(void)isc_condition_destroy(&manager->shutdown_ok);

	if (manager->stats != NULL) {
		isc_stats_detach(&manager->stats);
	}
	isc_mutex_destroy(&manager->lock);
	/* Invalidate the magic before the memory is returned. */
	manager->magic = 0;
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));

	*managerp = NULL;
}
3846
/*
 * Common receive path for isc_socket_recv*().
 *
 * For UDP sockets (and TCP sockets with an empty receive queue) the read
 * is attempted immediately via doio_recv(); if it would block (DOIO_SOFT)
 * or the queue is non-empty, 'dev' is enqueued on sock->recv_list and the
 * watcher thread is poked to watch the fd for readability.
 *
 * When the read completes here and ISC_SOCKFLAG_IMMEDIATE is NOT set,
 * the completion event is posted to 'task' via send_recvdone_event();
 * with IMMEDIATE set, the result is left in 'dev' for the caller.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_INPROGRESS when IMMEDIATE was
 * requested but the operation had to be queued.
 */
static isc_result_t
socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		/* UDP: no queue ordering to preserve; try right away. */
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->recv_list)) {
			io_state = doio_recv(sock, dev);
		} else {
			/* Earlier reads are queued; keep FIFO order. */
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}

		/*
		 * Enqueue the request. If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			/* Post the completion event to the task now. */
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
3923
3924 isc_result_t
isc_socket_recv(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,void * arg)3925 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3926 isc_task_t *task, isc_taskaction_t action, void *arg) {
3927 isc_socketevent_t *dev;
3928 isc_socketmgr_t *manager;
3929
3930 REQUIRE(VALID_SOCKET(sock));
3931 REQUIRE(action != NULL);
3932
3933 manager = sock->manager;
3934 REQUIRE(VALID_MANAGER(manager));
3935
3936 INSIST(sock->bound);
3937
3938 dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
3939 action, arg);
3940 if (dev == NULL) {
3941 return (ISC_R_NOMEMORY);
3942 }
3943
3944 return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
3945 }
3946
3947 isc_result_t
isc_socket_recv2(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_socketevent_t * event,unsigned int flags)3948 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3949 isc_task_t *task, isc_socketevent_t *event,
3950 unsigned int flags) {
3951 event->ev_sender = sock;
3952 event->result = ISC_R_UNSET;
3953 event->region = *region;
3954 event->n = 0;
3955 event->offset = 0;
3956 event->attributes = 0;
3957
3958 /*
3959 * UDP sockets are always partial read.
3960 */
3961 if (sock->type == isc_sockettype_udp) {
3962 event->minimum = 1;
3963 } else {
3964 if (minimum == 0) {
3965 event->minimum = region->length;
3966 } else {
3967 event->minimum = minimum;
3968 }
3969 }
3970
3971 return (socket_recv(sock, event, task, flags));
3972 }
3973
/*
 * Common send path for isc_socket_send*()/sendto*().
 *
 * Copies the destination and optional IPv6 pktinfo into 'dev', then
 * attempts the write immediately for UDP sockets (and for TCP sockets
 * with an empty send queue).  If the write would block (DOIO_SOFT) the
 * request is queued on sock->send_list and the watcher is poked to watch
 * the fd for writability — unless ISC_SOCKFLAG_NORETRY asks for a single
 * attempt only.
 *
 * When the write completes here and ISC_SOCKFLAG_IMMEDIATE is NOT set,
 * the completion event is posted to 'task' via send_senddone_event();
 * with IMMEDIATE set, the result is left in 'dev' for the caller.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_INPROGRESS when IMMEDIATE was
 * requested but the operation had to be queued.
 */
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp) {
		/* UDP: no queue ordering to preserve; try right away. */
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			/* Earlier writes are queued; keep FIFO order. */
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request. If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/* FALLTHROUGH */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			/* Post the completion event to the task now. */
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4075
/*
 * Start an asynchronous send of 'region' on a connected socket;
 * completion is delivered to 'task' via 'action'.  Thin wrapper around
 * isc_socket_sendto() with no explicit destination or pktinfo.
 */
isc_result_t
isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		isc_taskaction_t action, void *arg) {
	/*
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
}
4084
4085 isc_result_t
isc_socket_sendto(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo)4086 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4087 isc_taskaction_t action, void *arg,
4088 const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
4089 isc_socketevent_t *dev;
4090 isc_socketmgr_t *manager;
4091
4092 REQUIRE(VALID_SOCKET(sock));
4093 REQUIRE(region != NULL);
4094 REQUIRE(task != NULL);
4095 REQUIRE(action != NULL);
4096
4097 manager = sock->manager;
4098 REQUIRE(VALID_MANAGER(manager));
4099
4100 INSIST(sock->bound);
4101
4102 dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
4103 action, arg);
4104 if (dev == NULL) {
4105 return (ISC_R_NOMEMORY);
4106 }
4107
4108 dev->region = *region;
4109
4110 return (socket_send(sock, dev, task, address, pktinfo, 0));
4111 }
4112
/*
 * Restart a send using a caller-supplied (reusable) socket event.
 * Only ISC_SOCKFLAG_IMMEDIATE and ISC_SOCKFLAG_NORETRY are accepted in
 * 'flags'; NORETRY is valid for UDP sockets only.
 */
isc_result_t
isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		   isc_socketevent_t *event, unsigned int flags) {
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
		0);
	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
		REQUIRE(sock->type == isc_sockettype_udp);
	}
	/* Reinitialize the event's bookkeeping fields for this operation. */
	event->ev_sender = sock;
	event->result = ISC_R_UNSET;
	event->region = *region;
	event->n = 0;
	event->offset = 0;
	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;

	return (socket_send(sock, event, task, address, pktinfo, flags));
}
4132
/*
 * Clean up a (possibly stale) UNIX-domain socket file at sockaddr's path.
 *
 * With active == true the path is ours and is simply unlink()ed.
 * With active == false the path is probed first: stat() it, verify it
 * is a socket (or FIFO), and try connect()ing; only when the connect
 * fails with ECONNREFUSED/ECONNRESET (no live listener) is the file
 * removed.  All failures are logged and otherwise ignored.
 */
void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		/* Nothing to do for non-UNIX-domain addresses. */
		return;
	}

	/*
	 * Portability shims: define S_ISSOCK/S_ISFIFO from the raw mode
	 * bits where <sys/stat.h> does not provide them.
	 */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:
			if (active) { /* We exited cleanly last time */
				break;
			}
			/* FALLTHROUGH */
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
	} else {
		/* Refuse to touch anything that isn't a socket or FIFO. */
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
	}

	if (active) {
		/* We own the path; remove it unconditionally. */
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/* Probe for a live listener before removing a foreign path. */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			/* No live listener: the path is stale; remove it. */
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
	close(s);
}
4248
4249 isc_result_t
isc_socket_permunix(const isc_sockaddr_t * sockaddr,uint32_t perm,uint32_t owner,uint32_t group)4250 isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
4251 uint32_t owner, uint32_t group) {
4252 isc_result_t result = ISC_R_SUCCESS;
4253 char strbuf[ISC_STRERRORSIZE];
4254 char path[sizeof(sockaddr->type.sunix.sun_path)];
4255 #ifdef NEED_SECURE_DIRECTORY
4256 char *slash;
4257 #endif /* ifdef NEED_SECURE_DIRECTORY */
4258
4259 REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4260 INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4261 strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
4262
4263 #ifdef NEED_SECURE_DIRECTORY
4264 slash = strrchr(path, '/');
4265 if (slash != NULL) {
4266 if (slash != path) {
4267 *slash = '\0';
4268 } else {
4269 strlcpy(path, "/", sizeof(path));
4270 }
4271 } else {
4272 strlcpy(path, ".", sizeof(path));
4273 }
4274 #endif /* ifdef NEED_SECURE_DIRECTORY */
4275
4276 if (chmod(path, perm) < 0) {
4277 strerror_r(errno, strbuf, sizeof(strbuf));
4278 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4279 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4280 "isc_socket_permunix: chmod(%s, %d): %s", path,
4281 perm, strbuf);
4282 result = ISC_R_FAILURE;
4283 }
4284 if (chown(path, owner, group) < 0) {
4285 strerror_r(errno, strbuf, sizeof(strbuf));
4286 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4287 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4288 "isc_socket_permunix: chown(%s, %d, %d): %s",
4289 path, owner, group, strbuf);
4290 result = ISC_R_FAILURE;
4291 }
4292 return (result);
4293 }
4294
/*
 * Bind 'sock' to 'sockaddr'.
 *
 * For non-AF_UNIX sockets being bound to a specific (non-zero) port
 * with ISC_SOCKET_REUSEADDRESS set, SO_REUSEADDR is enabled first,
 * along with the platform's load-balancing port-reuse option where one
 * exists (SO_REUSEPORT_LB on FreeBSD, SO_REUSEPORT on Linux).
 * setsockopt() failures there are logged but not fatal.
 *
 * Returns ISC_R_FAMILYMISMATCH if the address family does not match
 * the socket's, or an ISC result translated from the bind(2) errno.
 */
isc_result_t
isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
		isc_socket_options_t options) {
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	INSIST(!sock->bound);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	/* Path-based sockets take no port options; bind directly. */
	if (sock->pf == AF_UNIX) {
		goto bind_socket;
	}
#endif /* ifdef AF_UNIX */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
	{
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
			       (void *)&on, sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#elif defined(__linux__) && defined(SO_REUSEPORT)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#endif /* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
		/* Press on... */
	}
#ifdef AF_UNIX
bind_socket:
#endif /* ifdef AF_UNIX */
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		/* Translate the bind(2) errno into an ISC result. */
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4375
4376 /*
4377 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it. Until then, this is broken and needs to be
4379 * disabled by default. See RT22589 for details.
4380 */
4381 #undef ENABLE_ACCEPTFILTER
4382
/*
 * Install a TCP accept filter named 'filter' (e.g. "dataready") on a
 * listening socket via SO_ACCEPTFILTER.  The feature is compiled out
 * unless ENABLE_ACCEPTFILTER is defined (disabled by default, see
 * RT22589 above), in which case ISC_R_NOTIMPLEMENTED is returned.
 */
isc_result_t
isc_socket_filter(isc_socket_t *sock, const char *filter) {
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	UNUSED(sock);
	UNUSED(filter);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	bzero(&afa, sizeof(afa));
	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
		       sizeof(afa)) == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION,
			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	return (ISC_R_NOTIMPLEMENTED);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
}
4410
4411 /*
4412 * Try enabling TCP Fast Open for a given socket if the OS supports it.
4413 */
/*
 * Try to enable TCP Fast Open on 'sock' with a TFO queue length derived
 * from 'backlog'.  Failures are logged but never fatal: TFO is treated
 * strictly as an optimization.  Compiled to a no-op unless both
 * ENABLE_TCP_FASTOPEN and TCP_FASTOPEN are defined.
 */
static void
set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
	char strbuf[ISC_STRERRORSIZE];

	/*
	 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while
	 * also shipping a default kernel without TFO support, so we
	 * special-case it by performing an additional runtime check for TFO
	 * support using sysctl to prevent setsockopt() errors from being
	 * logged.
	 */
#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
	unsigned int enabled;
	size_t enabledlen = sizeof(enabled);
	static bool tfo_notice_logged = false;

	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
		/*
		 * This kernel does not support TCP Fast Open.  There is
		 * nothing more we can do.
		 */
		return;
	} else if (enabled == 0) {
		/*
		 * This kernel does support TCP Fast Open, but it is disabled
		 * by sysctl.  Notify the user, but do not nag.
		 */
		if (!tfo_notice_logged) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
				      "TCP_FASTOPEN support is disabled by "
				      "sysctl (" SYSCTL_TFO " = 0)");
			tfo_notice_logged = true;
		}
		return;
	}
#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */

#ifdef __APPLE__
	/* NOTE(review): macOS appears to treat this value as a boolean
	 * enable flag rather than a queue length — confirm before changing. */
	backlog = 1;
#else /* ifdef __APPLE__ */
	/* Use half the listen backlog for the TFO queue, minimum 1. */
	backlog = backlog / 2;
	if (backlog == 0) {
		backlog = 1;
	}
#endif /* ifdef __APPLE__ */
	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
		       sizeof(backlog)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
				 sock->fd, strbuf);
		/* TCP_FASTOPEN is experimental so ignore failures */
	}
#else /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
	UNUSED(sock);
	UNUSED(backlog);
#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
}
4475
4476 /*
4477 * Set up to listen on a given socket. We do this by creating an internal
4478 * event that will be dispatched when the socket has read activity. The
4479 * watcher will send the internal event to the task when there is a new
4480 * connection.
4481 *
4482 * Unlike in read, we don't preallocate a done event here. Every time there
4483 * is a new connection we'll have to allocate a new one anyway, so we might
4484 * as well keep things simple rather than having to track them.
4485 */
4486 isc_result_t
isc_socket_listen(isc_socket_t * sock,unsigned int backlog)4487 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4488 char strbuf[ISC_STRERRORSIZE];
4489
4490 REQUIRE(VALID_SOCKET(sock));
4491
4492 LOCK(&sock->lock);
4493
4494 REQUIRE(!sock->listener);
4495 REQUIRE(sock->bound);
4496 REQUIRE(sock->type == isc_sockettype_tcp ||
4497 sock->type == isc_sockettype_unix);
4498
4499 if (backlog == 0) {
4500 backlog = SOMAXCONN;
4501 }
4502
4503 if (listen(sock->fd, (int)backlog) < 0) {
4504 UNLOCK(&sock->lock);
4505 strerror_r(errno, strbuf, sizeof(strbuf));
4506
4507 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4508
4509 return (ISC_R_UNEXPECTED);
4510 }
4511
4512 set_tcp_fastopen(sock, backlog);
4513
4514 sock->listener = 1;
4515
4516 UNLOCK(&sock->lock);
4517 return (ISC_R_SUCCESS);
4518 }
4519
4520 /*
4521 * This should try to do aggressive accept() XXXMLG
4522 */
/*
 * Queue an asynchronous accept on listening socket 'sock'.  When a
 * connection arrives, a ISC_SOCKEVENT_NEWCONN event carrying the new
 * socket is posted to 'task', invoking 'action' with 'arg'.  The new
 * socket object is pre-allocated here so the completion path cannot
 * fail on allocation.
 */
isc_result_t
isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
		  void *arg) {
	isc_socket_newconnev_t *dev;
	isc_socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc_socket_t *nsock;
	isc_result_t result;
	bool do_poke = false;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)isc_event_allocate(
		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/* Pre-allocate the socket that will represent the new connection. */
	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		/* The task is shutting down: undo everything and bail. */
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	isc_refcount_increment0(&nsock->references);
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	do_poke = ISC_LIST_EMPTY(sock->accept_list);
	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
	if (do_poke) {
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_ACCEPT);
	}
	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4589
/*
 * Start (or join) an asynchronous connect of 'sock' to 'addr'.  The
 * completion is reported as a ISC_SOCKEVENT_CONNECT event posted to
 * 'task' (invoking 'action' with 'arg').
 *
 * The connect(2) is attempted immediately: if it completes (or is
 * already connected, or a connect to the same peer is in flight) the
 * done event is sent or queued right away; if it would block, the
 * event is queued and the watcher thread is poked to wait for
 * writability.  Hard errors are translated to ISC result codes and
 * delivered via the event, except unexpected ones which are returned
 * directly as ISC_R_UNEXPECTED.
 */
isc_result_t
isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, void *arg) {
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc_socketmgr_t *manager;
	int cc;
	char strbuf[ISC_STRERRORSIZE];
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(addr != NULL);

	if (isc_sockaddr_ismulticast(addr)) {
		return (ISC_R_MULTICAST);
	}

	LOCK(&sock->lock);

	dev = (isc_socket_connev_t *)isc_event_allocate(
		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/* A connect to the same peer is already in flight: just queue. */
	if (sock->connecting) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		goto queue;
	}

	/* Already connected to this peer: report success immediately. */
	if (sock->connected) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		return (ISC_R_SUCCESS);
	}

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * The socket is nonblocking and the connection cannot be
		 * completed immediately.  It is possible to select(2) or
		 * poll(2) for completion by selecting the socket for writing.
		 * After select(2) indicates writability, use getsockopt(2) to
		 * read the SO_ERROR option at level SOL_SOCKET to determine
		 * whether connect() completed successfully (SO_ERROR is zero)
		 * or unsuccessfully (SO_ERROR is one of the usual error codes
		 * listed here, explaining the reason for the failure).
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			cc = 0;
			goto success;
		}
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			goto queue;
		}

		/* Map well-known connect(2) errors to ISC results. */
		switch (errno) {
#define ERROR_MATCH(a, b)                                                      \
	case a:                                                                \
		dev->result = b;                                               \
		goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		/* Unrecognized errno: log it and fail the call directly. */
		sock->connected = 0;

		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
				 addrbuf, errno, strbuf);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		/* Known failure: deliver the result via the done event. */
		sock->connected = 0;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);

		return (ISC_R_SUCCESS);
	}

queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
	if (do_poke && !sock->connecting) {
		sock->connecting = 1;
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_CONNECT);
	}

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4748
4749 /*
4750 * Called when a socket with a pending connect() finishes.
4751 */
/*
 * Completion handler: called by the watcher when a socket with a
 * pending connect() becomes writable.  Reads SO_ERROR to learn the
 * outcome, translates it to an ISC result, and delivers a done event
 * to every waiter on the connect list.  Soft errors (EAGAIN,
 * EINPROGRESS) re-arm the connecting state and return without
 * unwatching.  The caller is expected to hold the socket lock.
 */
static void
internal_connect(isc_socket_t *sock) {
	isc_socket_connev_t *dev;
	int cc;
	isc_result_t result;
	socklen_t optlen;
	char strbuf[ISC_STRERRORSIZE];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	/*
	 * Get the first item off the connect list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->connect_list);
	if (dev == NULL) {
		INSIST(!sock->connecting);
		goto finish;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.
	 */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
		       (void *)&optlen) != 0)
	{
		/* getsockopt() itself failed; use its errno. */
		cc = errno;
	} else {
		/* Propagate the connect outcome through errno. */
		errno = cc;
	}

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			sock->connecting = 1;
			return;
		}

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);

		/*
		 * Translate other errors into ISC_R_* flavors.
		 */
		switch (errno) {
#define ERROR_MATCH(a, b)                                                      \
	case a:                                                                \
		result = b;                                                    \
		break;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		default:
			result = ISC_R_UNEXPECTED;
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_connect: connect(%s) %s",
					 peerbuf, strbuf);
		}
	} else {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);
		result = ISC_R_SUCCESS;
		sock->connected = 1;
		sock->bound = 1;
	}

	/* Deliver the same result to every queued connect waiter. */
	do {
		dev->result = result;
		send_connectdone_event(sock, &dev);
		dev = ISC_LIST_HEAD(sock->connect_list);
	} while (dev != NULL);

finish:
	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
		   SELECT_POKE_CONNECT);
}
4852
4853 isc_result_t
isc_socket_getpeername(isc_socket_t * sock,isc_sockaddr_t * addressp)4854 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4855 isc_result_t result;
4856
4857 REQUIRE(VALID_SOCKET(sock));
4858 REQUIRE(addressp != NULL);
4859
4860 LOCK(&sock->lock);
4861
4862 if (sock->connected) {
4863 *addressp = sock->peer_address;
4864 result = ISC_R_SUCCESS;
4865 } else {
4866 result = ISC_R_NOTCONNECTED;
4867 }
4868
4869 UNLOCK(&sock->lock);
4870
4871 return (result);
4872 }
4873
4874 isc_result_t
isc_socket_getsockname(isc_socket_t * sock,isc_sockaddr_t * addressp)4875 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4876 socklen_t len;
4877 isc_result_t result;
4878 char strbuf[ISC_STRERRORSIZE];
4879
4880 REQUIRE(VALID_SOCKET(sock));
4881 REQUIRE(addressp != NULL);
4882
4883 LOCK(&sock->lock);
4884
4885 if (!sock->bound) {
4886 result = ISC_R_NOTBOUND;
4887 goto out;
4888 }
4889
4890 result = ISC_R_SUCCESS;
4891
4892 len = sizeof(addressp->type);
4893 if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
4894 strerror_r(errno, strbuf, sizeof(strbuf));
4895 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
4896 result = ISC_R_UNEXPECTED;
4897 goto out;
4898 }
4899 addressp->length = (unsigned int)len;
4900
4901 out:
4902 UNLOCK(&sock->lock);
4903
4904 return (result);
4905 }
4906
4907 /*
4908 * Run through the list of events on this socket, and cancel the ones
4909 * queued for task "task" of type "how". "how" is a bitmask.
4910 */
/*
 * Cancel pending operations on 'sock'.  'how' is a bitmask of
 * ISC_SOCKCANCEL_{RECV,SEND,ACCEPT,CONNECT}; 'task' of NULL cancels
 * the matching operations for every task, otherwise only those queued
 * by 'task'.  Each canceled request has its done event delivered with
 * result ISC_R_CANCELED.
 */
void
isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0) {
		return;
	}

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
	    !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			/* Save the link before the event is consumed. */
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
	    !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
	    !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				/*
				 * Release the pre-allocated socket that
				 * would have carried the new connection.
				 */
				isc_refcount_decrementz(
					&NEWCONNSOCK(dev)->references);
				free_socket((isc_socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendtoanddetach(&current_task,
							 ISC_EVENT_PTR(&dev),
							 sock->threadid);
			}

			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
	    !ISC_LIST_EMPTY(sock->connect_list))
	{
		isc_socket_connev_t *dev;
		isc_socket_connev_t *next;
		isc_task_t *current_task;

		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = ISC_LIST_HEAD(sock->connect_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_connectdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	UNLOCK(&sock->lock);
}
5031
/*
 * Return the type (udp/tcp/unix) of 'sock'.
 */
isc_sockettype_t
isc_socket_gettype(isc_socket_t *sock) {
	REQUIRE(VALID_SOCKET(sock));

	return (sock->type);
}
5038
/*
 * Enable or disable IPV6_V6ONLY on an AF_INET6 socket according to
 * 'yes'.  A no-op on non-IPv6 sockets or platforms without
 * IPV6_V6ONLY; setsockopt() failures are logged but not fatal.
 */
void
isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
#if defined(IPV6_V6ONLY)
	int onoff = yes ? 1 : 0;
#else  /* if defined(IPV6_V6ONLY) */
	UNUSED(yes);
	UNUSED(sock);
#endif /* if defined(IPV6_V6ONLY) */

	REQUIRE(VALID_SOCKET(sock));

#ifdef IPV6_V6ONLY
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
	}
#endif /* ifdef IPV6_V6ONLY */
}
5065
5066 static void
setdscp(isc_socket_t * sock,isc_dscp_t dscp)5067 setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
5068 #if defined(IP_TOS) || defined(IPV6_TCLASS)
5069 int value = dscp << 2;
5070 #endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
5071
5072 sock->dscp = dscp;
5073
5074 #ifdef IP_TOS
5075 if (sock->pf == AF_INET) {
5076 if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
5077 sizeof(value)) < 0) {
5078 char strbuf[ISC_STRERRORSIZE];
5079 strerror_r(errno, strbuf, sizeof(strbuf));
5080 UNEXPECTED_ERROR(__FILE__, __LINE__,
5081 "setsockopt(%d, IP_TOS, %.02x) "
5082 "failed: %s",
5083 sock->fd, value >> 2, strbuf);
5084 }
5085 }
5086 #endif /* ifdef IP_TOS */
5087 #ifdef IPV6_TCLASS
5088 if (sock->pf == AF_INET6) {
5089 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
5090 (void *)&value, sizeof(value)) < 0)
5091 {
5092 char strbuf[ISC_STRERRORSIZE];
5093 strerror_r(errno, strbuf, sizeof(strbuf));
5094 UNEXPECTED_ERROR(__FILE__, __LINE__,
5095 "setsockopt(%d, IPV6_TCLASS, %.02x) "
5096 "failed: %s",
5097 sock->fd, dscp >> 2, strbuf);
5098 }
5099 }
5100 #endif /* ifdef IPV6_TCLASS */
5101 }
5102
/*
 * Set the DSCP code point for 'sock'.  'dscp' must be below 0x40
 * (six bits); a negative value means "leave unset" and is ignored.
 * Once a process-wide value has been recorded in isc_dscp_check_value,
 * every subsequent call must pass the same value.
 */
void
isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40);

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

	setdscp(sock, dscp);
}
5123
/*
 * Public wrapper around allocate_socketevent(): allocate a socket
 * event of type 'eventtype' with sender 'sender', to be dispatched as
 * 'action' with 'arg'.
 */
isc_socketevent_t *
isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
		       isc_taskaction_t action, void *arg) {
	return (allocate_socketevent(mctx, sender, eventtype, action, arg));
}
5129
/*
 * Name 'sock' (the name is truncated to fit sock->name) and attach an
 * opaque 'tag', under the socket lock.
 */
void
isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
	/*
	 * Name 'sock'.
	 */

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	strlcpy(sock->name, name, sizeof(sock->name));
	sock->tag = tag;
	UNLOCK(&sock->lock);
}
5143
/*
 * Return the name set by isc_socket_setname() (unlocked read).
 */
const char *
isc_socket_getname(isc_socket_t *sock) {
	return (sock->name);
}
5148
/*
 * Return the opaque tag set by isc_socket_setname() (unlocked read).
 */
void *
isc_socket_gettag(isc_socket_t *sock) {
	return (sock->tag);
}
5153
5154 int
isc_socket_getfd(isc_socket_t * sock)5155 isc_socket_getfd(isc_socket_t *sock) {
5156 return ((short)sock->fd);
5157 }
5158
/* One-shot probe state: whether this platform usefully supports port reuse. */
static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
static bool hasreuseport = false;
5161
/*
 * Probe (once) whether load-balancing port reuse is usable: create a
 * throwaway UDP socket and try to set SO_REUSEADDR plus the platform's
 * port-reuse option.  Sets the file-scope 'hasreuseport' flag on
 * success; any failure leaves it false.
 */
static void
init_hasreuseport(void) {
/*
 * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
 * We only want to use it on Linux, if it's available.
 */
#if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
	int sock, yes = 1;
	/* Prefer an IPv4 probe socket; fall back to IPv6-only hosts. */
	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock < 0) {
		sock = socket(AF_INET6, SOCK_DGRAM, 0);
		if (sock < 0) {
			return;
		}
	}
	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
		       sizeof(yes)) < 0) {
		close(sock);
		return;
		/* FreeBSD uses the load-balancing variant of the option. */
#if defined(__FreeBSD_kernel__)
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
			      sizeof(yes)) < 0)
#else  /* if defined(__FreeBSD_kernel__) */
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
			      sizeof(yes)) < 0)
#endif /* if defined(__FreeBSD_kernel__) */
	{
		close(sock);
		return;
	}
	hasreuseport = true;
	close(sock);
#endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
}
5198
/*
 * Return whether load-balancing port reuse is available, probing
 * (exactly once, thread-safely) on first call.
 */
bool
isc_socket_hasreuseport(void) {
	RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
		      ISC_R_SUCCESS);
	return (hasreuseport);
}
5205
5206 #if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
5207 static const char *
_socktype(isc_sockettype_t type)5208 _socktype(isc_sockettype_t type) {
5209 switch (type) {
5210 case isc_sockettype_udp:
5211 return ("udp");
5212 case isc_sockettype_tcp:
5213 return ("tcp");
5214 case isc_sockettype_unix:
5215 return ("unix");
5216 default:
5217 return ("not-initialized");
5218 }
5219 }
5220 #endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
5221
5222 #ifdef HAVE_LIBXML2
/*
 * Evaluate an xmlTextWriter call and jump to the local 'error' label on
 * failure (negative return).  Requires an int 'xmlrc' and an 'error'
 * label in the enclosing scope.
 */
#define TRY0(a)                     \
	do {                        \
		xmlrc = (a);        \
		if (xmlrc < 0)      \
			goto error; \
	} while (0)
5229 int
isc_socketmgr_renderxml(isc_socketmgr_t * mgr,void * writer0)5230 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
5231 isc_socket_t *sock = NULL;
5232 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5233 isc_sockaddr_t addr;
5234 socklen_t len;
5235 int xmlrc;
5236 xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;
5237
5238 LOCK(&mgr->lock);
5239
5240 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
5241 sock = ISC_LIST_HEAD(mgr->socklist);
5242 while (sock != NULL) {
5243 LOCK(&sock->lock);
5244 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));
5245
5246 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
5247 TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
5248 TRY0(xmlTextWriterEndElement(writer));
5249
5250 if (sock->name[0] != 0) {
5251 TRY0(xmlTextWriterStartElement(writer,
5252 ISC_XMLCHAR "name"));
5253 TRY0(xmlTextWriterWriteFormatString(writer, "%s",
5254 sock->name));
5255 TRY0(xmlTextWriterEndElement(writer)); /* name */
5256 }
5257
5258 TRY0(xmlTextWriterStartElement(writer,
5259 ISC_XMLCHAR "references"));
5260 TRY0(xmlTextWriterWriteFormatString(
5261 writer, "%d",
5262 (int)isc_refcount_current(&sock->references)));
5263 TRY0(xmlTextWriterEndElement(writer));
5264
5265 TRY0(xmlTextWriterWriteElement(
5266 writer, ISC_XMLCHAR "type",
5267 ISC_XMLCHAR _socktype(sock->type)));
5268
5269 if (sock->connected) {
5270 isc_sockaddr_format(&sock->peer_address, peerbuf,
5271 sizeof(peerbuf));
5272 TRY0(xmlTextWriterWriteElement(
5273 writer, ISC_XMLCHAR "peer-address",
5274 ISC_XMLCHAR peerbuf));
5275 }
5276
5277 len = sizeof(addr);
5278 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5279 isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5280 TRY0(xmlTextWriterWriteElement(
5281 writer, ISC_XMLCHAR "local-address",
5282 ISC_XMLCHAR peerbuf));
5283 }
5284
5285 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
5286 if (sock->listener) {
5287 TRY0(xmlTextWriterWriteElement(writer,
5288 ISC_XMLCHAR "state",
5289 ISC_XMLCHAR "listener"));
5290 }
5291 if (sock->connected) {
5292 TRY0(xmlTextWriterWriteElement(
5293 writer, ISC_XMLCHAR "state",
5294 ISC_XMLCHAR "connected"));
5295 }
5296 if (sock->connecting) {
5297 TRY0(xmlTextWriterWriteElement(
5298 writer, ISC_XMLCHAR "state",
5299 ISC_XMLCHAR "connecting"));
5300 }
5301 if (sock->bound) {
5302 TRY0(xmlTextWriterWriteElement(writer,
5303 ISC_XMLCHAR "state",
5304 ISC_XMLCHAR "bound"));
5305 }
5306
5307 TRY0(xmlTextWriterEndElement(writer)); /* states */
5308
5309 TRY0(xmlTextWriterEndElement(writer)); /* socket */
5310
5311 UNLOCK(&sock->lock);
5312 sock = ISC_LIST_NEXT(sock, link);
5313 }
5314 TRY0(xmlTextWriterEndElement(writer)); /* sockets */
5315
5316 error:
5317 if (sock != NULL) {
5318 UNLOCK(&sock->lock);
5319 }
5320
5321 UNLOCK(&mgr->lock);
5322
5323 return (xmlrc);
5324 }
5325 #endif /* HAVE_LIBXML2 */
5326
#ifdef HAVE_JSON_C
/*
 * Jump to the function's error label with ISC_R_NOMEMORY when a
 * json-c allocation returns NULL.
 */
#define CHECKMEM(m)                              \
	do {                                     \
		if (m == NULL) {                 \
			result = ISC_R_NOMEMORY; \
			goto error;              \
		}                                \
	} while (0)

/*
 * Render the socket manager's socket list as a JSON array stored
 * under the "sockets" key of 'stats0' (a json_object): one object
 * per socket carrying its id, optional name, reference count, type,
 * peer/local addresses and state flags.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_NOMEMORY on allocation failure.
 */
isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
	isc_result_t result = ISC_R_SUCCESS;
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	json_object *obj, *array = json_object_new_array();
	json_object *stats = (json_object *)stats0;

	/*
	 * Check the array before taking any lock: the error label
	 * unconditionally unlocks mgr->lock, so jumping there from
	 * here would release a mutex that was never acquired
	 * (undefined behavior).
	 */
	if (array == NULL) {
		return (ISC_R_NOMEMORY);
	}

	LOCK(&mgr->lock);

	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		json_object *states, *entry;
		char buf[255];

		/*
		 * Take the socket lock before allocating 'entry' so
		 * that any CHECKMEM jump to the error label finds
		 * sock->lock actually held; the old ordering could
		 * reach the error path's UNLOCK(&sock->lock) without
		 * holding the lock.
		 */
		LOCK(&sock->lock);

		entry = json_object_new_object();
		CHECKMEM(entry);
		/* 'array' owns 'entry' from here on. */
		json_object_array_add(array, entry);

		snprintf(buf, sizeof(buf), "%p", sock);
		obj = json_object_new_string(buf);
		CHECKMEM(obj);
		json_object_object_add(entry, "id", obj);

		if (sock->name[0] != 0) {
			obj = json_object_new_string(sock->name);
			CHECKMEM(obj);
			json_object_object_add(entry, "name", obj);
		}

		obj = json_object_new_int(
			(int)isc_refcount_current(&sock->references));
		CHECKMEM(obj);
		json_object_object_add(entry, "references", obj);

		obj = json_object_new_string(_socktype(sock->type));
		CHECKMEM(obj);
		json_object_object_add(entry, "type", obj);

		/* peer_address is only meaningful once connected. */
		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "peer-address", obj);
		}

		/* Best effort: skip local-address if getsockname fails. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "local-address", obj);
		}

		states = json_object_new_array();
		CHECKMEM(states);
		json_object_object_add(entry, "states", states);

		if (sock->listener) {
			obj = json_object_new_string("listener");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connected) {
			obj = json_object_new_string("connected");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connecting) {
			obj = json_object_new_string("connecting");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->bound) {
			obj = json_object_new_string("bound");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}

	/* Transfer ownership of 'array' to 'stats'. */
	json_object_object_add(stats, "sockets", array);
	array = NULL;
	result = ISC_R_SUCCESS;

error:
	if (array != NULL) {
		json_object_put(array);
	}

	/* On failure mid-iteration, 'sock' is non-NULL and locked. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (result);
}
#endif /* HAVE_JSON_C */
5446