1 /*
2 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3 *
4 * This Source Code Form is subject to the terms of the Mozilla Public
5 * License, v. 2.0. If a copy of the MPL was not distributed with this
6 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 *
8 * See the COPYRIGHT file distributed with this work for additional
9 * information regarding copyright ownership.
10 */
11
12 /*! \file */
13
14 #include <inttypes.h>
15 #include <stdbool.h>
16 #include <sys/param.h>
17 #include <sys/socket.h>
18 #include <sys/stat.h>
19 #include <sys/types.h>
20 #if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
21 #include <sys/sysctl.h>
22 #endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
23 #include <sys/time.h>
24 #include <sys/uio.h>
25
26 #if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
27 #include <linux/netlink.h>
28 #include <linux/rtnetlink.h>
29 #endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
30 */
31
32 #include <errno.h>
33 #include <fcntl.h>
34 #include <stddef.h>
35 #include <stdlib.h>
36 #include <unistd.h>
37
38 #include <isc/app.h>
39 #include <isc/buffer.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/mutex.h>
46 #include <isc/net.h>
47 #include <isc/once.h>
48 #include <isc/platform.h>
49 #include <isc/print.h>
50 #include <isc/refcount.h>
51 #include <isc/region.h>
52 #include <isc/resource.h>
53 #include <isc/socket.h>
54 #include <isc/stats.h>
55 #include <isc/strerr.h>
56 #include <isc/string.h>
57 #include <isc/task.h>
58 #include <isc/thread.h>
59 #include <isc/util.h>
60
61 #ifdef ISC_PLATFORM_HAVESYSUNH
62 #include <sys/un.h>
63 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
64 #ifdef HAVE_KQUEUE
65 #include <sys/event.h>
66 #endif /* ifdef HAVE_KQUEUE */
67 #ifdef HAVE_EPOLL_CREATE1
68 #include <sys/epoll.h>
69 #endif /* ifdef HAVE_EPOLL_CREATE1 */
70 #if defined(HAVE_SYS_DEVPOLL_H)
71 #include <sys/devpoll.h>
72 #elif defined(HAVE_DEVPOLL_H)
73 #include <devpoll.h>
74 #endif /* if defined(HAVE_SYS_DEVPOLL_H) */
75
76 #include <netinet/tcp.h>
77
78 #include "errno2result.h"
79
80 #ifdef ENABLE_TCP_FASTOPEN
81 #include <netinet/tcp.h>
82 #endif /* ifdef ENABLE_TCP_FASTOPEN */
83
84 #ifdef HAVE_JSON_C
85 #include <json_object.h>
86 #endif /* HAVE_JSON_C */
87
88 #ifdef HAVE_LIBXML2
89 #include <libxml/xmlwriter.h>
90 #define ISC_XMLCHAR (const xmlChar *)
91 #endif /* HAVE_LIBXML2 */
92
93 /*%
94 * Choose the most preferable multiplex method.
95 */
96 #if defined(HAVE_KQUEUE)
97 #define USE_KQUEUE
98 #elif defined(HAVE_EPOLL_CREATE1)
99 #define USE_EPOLL
100 #elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
101 #define USE_DEVPOLL
102 typedef struct {
103 unsigned int want_read : 1, want_write : 1;
104 } pollinfo_t;
105 #else /* if defined(HAVE_KQUEUE) */
106 #define USE_SELECT
107 #endif /* HAVE_KQUEUE */
108
109 /*
110 * Set by the -T dscp option on the command line. If set to a value
111 * other than -1, we check to make sure DSCP values match it, and
112 * assert if not.
113 */
114 int isc_dscp_check_value = -1;
115
116 /*%
117 * Maximum number of allowable open sockets. This is also the maximum
118 * allowable socket file descriptor.
119 *
120 * Care should be taken before modifying this value for select():
121 * The API standard doesn't ensure select() accept more than (the system default
122 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
123 * the vast majority of cases. This constant should therefore be increased only
124 * when absolutely necessary and possible, i.e., the server is exhausting all
125 * available file descriptors (up to FD_SETSIZE) and the select() function
126 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
127 * always by true, but we keep using some of them to ensure as much
128 * portability as possible). Note also that overall server performance
129 * may be rather worsened with a larger value of this constant due to
130 * inherent scalability problems of select().
131 *
132 * As a special note, this value shouldn't have to be touched if
133 * this is a build for an authoritative only DNS server.
134 */
135 #ifndef ISC_SOCKET_MAXSOCKETS
136 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
137 #ifdef TUNE_LARGE
138 #define ISC_SOCKET_MAXSOCKETS 21000
139 #else /* ifdef TUNE_LARGE */
140 #define ISC_SOCKET_MAXSOCKETS 4096
141 #endif /* TUNE_LARGE */
142 #elif defined(USE_SELECT)
143 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
144 #endif /* USE_KQUEUE... */
145 #endif /* ISC_SOCKET_MAXSOCKETS */
146
147 #ifdef USE_SELECT
148 /*%
149 * Mac OS X needs a special definition to support larger values in select().
150 * We always define this because a larger value can be specified run-time.
151 */
152 #ifdef __APPLE__
153 #define _DARWIN_UNLIMITED_SELECT
154 #endif /* __APPLE__ */
155 #endif /* USE_SELECT */
156
157 #ifdef ISC_SOCKET_USE_POLLWATCH
158 /*%
159 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
160 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
161 * some of the specified FD. The idea is based on the observation that it's
162 * likely for a busy server to keep receiving packets. It specifically works
163 * as follows: the socket watcher is first initialized with the state of
164 * "poll_idle". While it's in the idle state it keeps sleeping until a socket
165 * event occurs. When it wakes up for a socket I/O event, it moves to the
166 * poll_active state, and sets the poll timeout to a short period
167 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the
168 * watcher goes to the poll_checking state with the same timeout period.
169 * In this state, the watcher tries to detect whether this is a break
170 * during intermittent events or the kernel bug is triggered. If the next
171 * polling reports an event within the short period, the previous timeout is
172 * likely to be a kernel bug, and so the watcher goes back to the active state.
173 * Otherwise, it moves to the idle state again.
174 *
175 * It's not clear whether this is a thread-related bug, but since we've only
176 * seen this with threads, this workaround is used only when enabling threads.
177 */
178
179 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
180
181 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
182 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
183 #endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
184 #endif /* ISC_SOCKET_USE_POLLWATCH */
185
186 /*%
187 * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
188 */
189 #define FDLOCK_BITS 10
190 #define FDLOCK_COUNT (1 << FDLOCK_BITS)
191 #define FDLOCK_ID(fd) \
192 (((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
193 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
194
195 /*%
196 * Maximum number of events communicated with the kernel. There should normally
197 * be no need for having a large number.
198 */
199 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
200 #ifndef ISC_SOCKET_MAXEVENTS
201 #ifdef TUNE_LARGE
202 #define ISC_SOCKET_MAXEVENTS 2048
203 #else /* ifdef TUNE_LARGE */
204 #define ISC_SOCKET_MAXEVENTS 64
205 #endif /* TUNE_LARGE */
206 #endif /* ifndef ISC_SOCKET_MAXEVENTS */
207 #endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
208 * */
209
210 /*%
211 * Some systems define the socket length argument as an int, some as size_t,
212 * some as socklen_t. This is here so it can be easily changed if needed.
213 */
214 #ifndef socklen_t
215 #define socklen_t unsigned int
216 #endif /* ifndef socklen_t */
217
218 /*%
219 * Define what the possible "soft" errors can be. These are non-fatal returns
220 * of various network related functions, like recv() and so on.
221 *
222 * For some reason, BSDI (and perhaps others) will sometimes return <0
223 * from recv() but will have errno==0. This is broken, but we have to
224 * work around it here.
225 */
226 #define SOFT_ERROR(e) \
227 ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
228 (e) == EINTR || (e) == 0)
229
230 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
231
232 /*!<
233 * DLVL(90) -- Function entry/exit and other tracing.
234 * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
235 * DLVL(60) -- Socket data send/receive
236 * DLVL(50) -- Event tracing, including receiving/sending completion events.
237 * DLVL(20) -- Socket creation/destruction.
238 */
239 #define TRACE_LEVEL 90
240 #define CORRECTNESS_LEVEL 70
241 #define IOEVENT_LEVEL 60
242 #define EVENT_LEVEL 50
243 #define CREATION_LEVEL 20
244
245 #define TRACE DLVL(TRACE_LEVEL)
246 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
247 #define IOEVENT DLVL(IOEVENT_LEVEL)
248 #define EVENT DLVL(EVENT_LEVEL)
249 #define CREATION DLVL(CREATION_LEVEL)
250
251 typedef isc_event_t intev_t;
252
253 #define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
254 #define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
255
256 /*!
257 * IPv6 control information. If the socket is an IPv6 socket we want
258 * to collect the destination address and interface so the client can
259 * set them on outgoing packets.
260 */
261 #ifndef USE_CMSG
262 #define USE_CMSG 1
263 #endif /* ifndef USE_CMSG */
264
265 /*%
266 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have
267 * a setsockopt() like interface to request timestamps, and if the OS
268 * doesn't do it for us, call gettimeofday() on every UDP receive?
269 */
270 #ifdef SO_TIMESTAMP
271 #ifndef USE_CMSG
272 #define USE_CMSG 1
273 #endif /* ifndef USE_CMSG */
274 #endif /* ifdef SO_TIMESTAMP */
275
276 #if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
277 #define SET_RCVBUF
278 #endif
279
280 #if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
281 #define SET_SNDBUF
282 #endif
283
284 /*%
285 * Instead of calculating the cmsgbuf lengths every time we take
286 * a rule of thumb approach - sizes are taken from x86_64 linux,
287 * multiplied by 2, everything should fit. Those sizes are not
288 * large enough to cause any concern.
289 */
290 #if defined(USE_CMSG)
291 #define CMSG_SP_IN6PKT 40
292 #else /* if defined(USE_CMSG) */
293 #define CMSG_SP_IN6PKT 0
294 #endif /* if defined(USE_CMSG) */
295
296 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
297 #define CMSG_SP_TIMESTAMP 32
298 #else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
299 #define CMSG_SP_TIMESTAMP 0
300 #endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
301
302 #if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
303 #define CMSG_SP_TCTOS 24
304 #else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
305 #define CMSG_SP_TCTOS 0
306 #endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
307
308 #define CMSG_SP_INT 24
309
310 /* Align cmsg buffers to be safe on SPARC etc. */
311 #define RECVCMSGBUFLEN \
312 ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
313 1, \
314 sizeof(void *))
315 #define SENDCMSGBUFLEN \
316 ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
317 sizeof(void *))
318
319 /*%
320 * The number of times a send operation is repeated if the result is EINTR.
321 */
322 #define NRETRIES 10
323
typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;
typedef struct isc__socketthread isc__socketthread_t;

/* Cast the new socket carried by an accept-completion event. */
#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

/*%
 * Internal representation of a socket; the public isc_socket_t is the
 * embedded 'common' member.  Reference counted via 'references'.
 */
struct isc__socket {
	/* Not locked. */
	isc_socket_t common;	   /* public interface; must be first */
	isc__socketmgr_t *manager; /* owning socket manager */
	isc_mutex_t lock;
	isc_sockettype_t type;
	const isc_statscounter_t *statsindex; /* per-type stats table
					       * (udp4statsindex etc.) */
	isc_refcount_t references;

	/* Locked by socket lock. */
	ISC_LINK(isc__socket_t) link; /* entry in manager->socklist */
	int fd;			      /* underlying file descriptor */
	int pf;			      /* protocol family */
	int threadid; /* servicing watcher thread; see gen_threadid() */
	char name[16];
	void *tag;

	/* Queues of pending I/O operations, one event per operation. */
	ISC_LIST(isc_socketevent_t) send_list;
	ISC_LIST(isc_socketevent_t) recv_list;
	ISC_LIST(isc_socket_newconnev_t) accept_list;
	ISC_LIST(isc_socket_connev_t) connect_list;

	isc_sockaddr_t peer_address; /* remote address */

	unsigned int listener : 1,	       /* listener socket */
		connected : 1, connecting : 1, /* connect pending */
		bound : 1,		       /* bound to local addr */
		dupped : 1, active : 1,	       /* currently active */
		pktdscp : 1;		       /* per packet dscp */

#ifdef ISC_PLATFORM_RECVOVERFLOW
	unsigned char overflow; /* used for MSG_TRUNC fake */
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	unsigned int dscp;
};
367
#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

/*%
 * Socket manager: owns all managed sockets and the per-thread watcher
 * state.  The public isc_socketmgr_t is the embedded 'common' member.
 */
struct isc__socketmgr {
	/* Not locked. */
	isc_socketmgr_t common; /* public interface; must be first */
	isc_mem_t *mctx;	/* memory context */
	isc_mutex_t lock;
	isc_stats_t *stats; /* socket statistics; may be NULL (see
			     * inc_stats()/dec_stats()) */
	int nthreads;	    /* number of watcher threads */
	isc__socketthread_t *threads; /* per-thread watcher state */
	unsigned int maxsocks; /* upper bound on fd values we manage */
	/* Locked by manager lock. */
	ISC_LIST(isc__socket_t) socklist; /* all managed sockets */
	int reserved; /* unlocked */
	isc_condition_t shutdown_ok;
	size_t maxudp; /* presumably max UDP message size -- TODO confirm */
};
386
/*%
 * Per-watcher-thread state: the wakeup pipe, the per-fd bookkeeping,
 * and the backend-specific event structures for whichever multiplex
 * method (kqueue/epoll//dev/poll/select) was chosen at build time.
 */
struct isc__socketthread {
	isc__socketmgr_t *manager; /* back-pointer to owning manager */
	int threadid;		   /* index of this thread */
	isc_thread_t thread;
	int pipe_fds[2]; /* wakeup pipe: [0] read end (select_readmsg),
			  * [1] write end (select_poke) */
	isc_mutex_t *fdlock; /* per-fd lock buckets, indexed by
			      * FDLOCK_ID(fd) */
	/* Locked by fdlock. */
	isc__socket_t **fds; /* fd -> socket map */
	int *fdstate;	     /* fd -> CLOSED/MANAGED/CLOSE_PENDING */
#ifdef USE_KQUEUE
	int kqueue_fd;	/* kqueue(2) descriptor */
	int nevents;	/* size of 'events' array */
	struct kevent *events;
#endif /* USE_KQUEUE */
#ifdef USE_EPOLL
	int epoll_fd;	/* epoll instance descriptor */
	int nevents;	/* size of 'events' array */
	struct epoll_event *events;
	uint32_t *epoll_events; /* fd -> currently registered EPOLL* mask */
#endif /* USE_EPOLL */
#ifdef USE_DEVPOLL
	int devpoll_fd;	/* /dev/poll descriptor */
	isc_resourcevalue_t open_max;
	unsigned int calls;
	int nevents;
	struct pollfd *events;
	pollinfo_t *fdpollinfo; /* fd -> which events we asked for; needed
				 * to re-arm after POLLREMOVE */
#endif /* USE_DEVPOLL */
#ifdef USE_SELECT
	int fd_bufsize; /* allocated size of the fd_set buffers */
	fd_set *read_fds;
	fd_set *read_fds_copy;
	fd_set *write_fds;
	fd_set *write_fds_copy;
	int maxfd; /* highest fd to pass to select() */
#endif /* USE_SELECT */
};
424
425 #define CLOSED 0 /* this one must be zero */
426 #define MANAGED 1
427 #define CLOSE_PENDING 2
428
429 /*
430 * send() and recv() iovec counts
431 */
432 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
433 #ifdef ISC_PLATFORM_RECVOVERFLOW
434 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
435 #else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
436 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
437 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
438
439 static isc_result_t
440 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
441 isc_socket_t **socketp, isc_socket_t *dup_socket);
442 static void
443 send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
444 static void
445 send_senddone_event(isc__socket_t *, isc_socketevent_t **);
446 static void
447 send_connectdone_event(isc__socket_t *, isc_socket_connev_t **);
448 static void
449 free_socket(isc__socket_t **);
450 static isc_result_t
451 allocate_socket(isc__socketmgr_t *, isc_sockettype_t, isc__socket_t **);
452 static void
453 destroy(isc__socket_t **);
454 static void
455 internal_accept(isc__socket_t *);
456 static void
457 internal_connect(isc__socket_t *);
458 static void
459 internal_recv(isc__socket_t *);
460 static void
461 internal_send(isc__socket_t *);
462 static void
463 process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
464 static void
465 build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *, struct msghdr *,
466 struct iovec *, size_t *);
467 static void
468 build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *, struct msghdr *,
469 struct iovec *, size_t *);
470 static bool
471 process_ctlfd(isc__socketthread_t *thread);
472 static void
473 setdscp(isc__socket_t *sock, isc_dscp_t dscp);
474
475 #define SELECT_POKE_SHUTDOWN (-1)
476 #define SELECT_POKE_NOTHING (-2)
477 #define SELECT_POKE_READ (-3)
478 #define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */
479 #define SELECT_POKE_WRITE (-4)
480 #define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */
481 #define SELECT_POKE_CLOSE (-5)
482
/*%
 * Shortcut index arrays to get access to statistics counters.
 *
 * Each *statsindex[] table below maps the STATID_* values to the
 * statistics counter for one socket type.  A -1 entry means the
 * statistic does not apply to that socket type (e.g. accept on UDP);
 * inc_stats()/dec_stats() REQUIRE that such entries are never used.
 */
enum { STATID_OPEN = 0,
       STATID_OPENFAIL = 1,
       STATID_CLOSE = 2,
       STATID_BINDFAIL = 3,
       STATID_CONNECTFAIL = 4,
       STATID_CONNECT = 5,
       STATID_ACCEPTFAIL = 6,
       STATID_ACCEPT = 7,
       STATID_SENDFAIL = 8,
       STATID_RECVFAIL = 9,
       STATID_ACTIVE = 10 };
/* UDP over IPv4 */
static const isc_statscounter_t udp4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1, /* STATID_ACCEPTFAIL: no accept on UDP */
	-1, /* STATID_ACCEPT */
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
};
/* UDP over IPv6 */
static const isc_statscounter_t udp6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1, /* STATID_ACCEPTFAIL: no accept on UDP */
	-1, /* STATID_ACCEPT */
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
};
/* TCP over IPv4 */
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open, isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close, isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail, isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail, isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
};
/* TCP over IPv6 */
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open, isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close, isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail, isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail, isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
};
/* Unix-domain sockets */
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen, isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose, isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail, isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail, isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
};
/* Raw sockets: only open/close/recv/active are tracked. */
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1, /* STATID_BINDFAIL */
	-1, /* STATID_CONNECTFAIL */
	-1, /* STATID_CONNECT */
	-1, /* STATID_ACCEPTFAIL */
	-1, /* STATID_ACCEPT */
	-1, /* STATID_SENDFAIL */
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
560
561 static int
562 gen_threadid(isc__socket_t *sock);
563
564 static int
gen_threadid(isc__socket_t * sock)565 gen_threadid(isc__socket_t *sock) {
566 return (sock->fd % sock->manager->nthreads);
567 }
568
569 static void
570 manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
571 isc_logmodule_t *module, int level, const char *fmt, ...)
572 ISC_FORMAT_PRINTF(5, 6);
573 static void
manager_log(isc__socketmgr_t * sockmgr,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)574 manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
575 isc_logmodule_t *module, int level, const char *fmt, ...) {
576 char msgbuf[2048];
577 va_list ap;
578
579 if (!isc_log_wouldlog(isc_lctx, level)) {
580 return;
581 }
582
583 va_start(ap, fmt);
584 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
585 va_end(ap);
586
587 isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
588 sockmgr, msgbuf);
589 }
590
591 static void
592 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
593 isc_logmodule_t *module, int level, const char *fmt, ...)
594 ISC_FORMAT_PRINTF(5, 6);
595 static void
thread_log(isc__socketthread_t * thread,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)596 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
597 isc_logmodule_t *module, int level, const char *fmt, ...) {
598 char msgbuf[2048];
599 va_list ap;
600
601 if (!isc_log_wouldlog(isc_lctx, level)) {
602 return;
603 }
604
605 va_start(ap, fmt);
606 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
607 va_end(ap);
608
609 isc_log_write(isc_lctx, category, module, level,
610 "sockmgr %p thread %d: %s", thread->manager,
611 thread->threadid, msgbuf);
612 }
613
614 static void
615 socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
616 isc_logcategory_t *category, isc_logmodule_t *module, int level,
617 const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
618 static void
socket_log(isc__socket_t * sock,const isc_sockaddr_t * address,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)619 socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
620 isc_logcategory_t *category, isc_logmodule_t *module, int level,
621 const char *fmt, ...) {
622 char msgbuf[2048];
623 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
624 va_list ap;
625
626 if (!isc_log_wouldlog(isc_lctx, level)) {
627 return;
628 }
629
630 va_start(ap, fmt);
631 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
632 va_end(ap);
633
634 if (address == NULL) {
635 isc_log_write(isc_lctx, category, module, level,
636 "socket %p: %s", sock, msgbuf);
637 } else {
638 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
639 isc_log_write(isc_lctx, category, module, level,
640 "socket %p %s: %s", sock, peerbuf, msgbuf);
641 }
642 }
643
644 /*%
645 * Increment socket-related statistics counters.
646 */
647 static inline void
inc_stats(isc_stats_t * stats,isc_statscounter_t counterid)648 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
649 REQUIRE(counterid != -1);
650
651 if (stats != NULL) {
652 isc_stats_increment(stats, counterid);
653 }
654 }
655
656 /*%
657 * Decrement socket-related statistics counters.
658 */
659 static inline void
dec_stats(isc_stats_t * stats,isc_statscounter_t counterid)660 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
661 REQUIRE(counterid != -1);
662
663 if (stats != NULL) {
664 isc_stats_decrement(stats, counterid);
665 }
666 }
667
668 static inline isc_result_t
watch_fd(isc__socketthread_t * thread,int fd,int msg)669 watch_fd(isc__socketthread_t *thread, int fd, int msg) {
670 isc_result_t result = ISC_R_SUCCESS;
671
672 #ifdef USE_KQUEUE
673 struct kevent evchange;
674
675 memset(&evchange, 0, sizeof(evchange));
676 if (msg == SELECT_POKE_READ) {
677 evchange.filter = EVFILT_READ;
678 } else {
679 evchange.filter = EVFILT_WRITE;
680 }
681 evchange.flags = EV_ADD;
682 evchange.ident = fd;
683 if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
684 result = isc__errno2result(errno);
685 }
686
687 return (result);
688 #elif defined(USE_EPOLL)
689 struct epoll_event event;
690 uint32_t oldevents;
691 int ret;
692 int op;
693
694 oldevents = thread->epoll_events[fd];
695 if (msg == SELECT_POKE_READ) {
696 thread->epoll_events[fd] |= EPOLLIN;
697 } else {
698 thread->epoll_events[fd] |= EPOLLOUT;
699 }
700
701 event.events = thread->epoll_events[fd];
702 memset(&event.data, 0, sizeof(event.data));
703 event.data.fd = fd;
704
705 op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
706 ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
707 if (ret == -1) {
708 if (errno == EEXIST) {
709 UNEXPECTED_ERROR(__FILE__, __LINE__,
710 "epoll_ctl(ADD/MOD) returned "
711 "EEXIST for fd %d",
712 fd);
713 }
714 result = isc__errno2result(errno);
715 }
716
717 return (result);
718 #elif defined(USE_DEVPOLL)
719 struct pollfd pfd;
720 int lockid = FDLOCK_ID(fd);
721
722 memset(&pfd, 0, sizeof(pfd));
723 if (msg == SELECT_POKE_READ) {
724 pfd.events = POLLIN;
725 } else {
726 pfd.events = POLLOUT;
727 }
728 pfd.fd = fd;
729 pfd.revents = 0;
730 if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
731 result = isc__errno2result(errno);
732 } else {
733 if (msg == SELECT_POKE_READ) {
734 thread->fdpollinfo[fd].want_read = 1;
735 } else {
736 thread->fdpollinfo[fd].want_write = 1;
737 }
738 }
739
740 return (result);
741 #elif defined(USE_SELECT)
742 LOCK(&thread->manager->lock);
743 if (msg == SELECT_POKE_READ) {
744 FD_SET(fd, thread->read_fds);
745 }
746 if (msg == SELECT_POKE_WRITE) {
747 FD_SET(fd, thread->write_fds);
748 }
749 UNLOCK(&thread->manager->lock);
750
751 return (result);
752 #endif /* ifdef USE_KQUEUE */
753 }
754
755 static inline isc_result_t
unwatch_fd(isc__socketthread_t * thread,int fd,int msg)756 unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
757 isc_result_t result = ISC_R_SUCCESS;
758
759 #ifdef USE_KQUEUE
760 struct kevent evchange;
761
762 memset(&evchange, 0, sizeof(evchange));
763 if (msg == SELECT_POKE_READ) {
764 evchange.filter = EVFILT_READ;
765 } else {
766 evchange.filter = EVFILT_WRITE;
767 }
768 evchange.flags = EV_DELETE;
769 evchange.ident = fd;
770 if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
771 result = isc__errno2result(errno);
772 }
773
774 return (result);
775 #elif defined(USE_EPOLL)
776 struct epoll_event event;
777 int ret;
778 int op;
779
780 if (msg == SELECT_POKE_READ) {
781 thread->epoll_events[fd] &= ~(EPOLLIN);
782 } else {
783 thread->epoll_events[fd] &= ~(EPOLLOUT);
784 }
785
786 event.events = thread->epoll_events[fd];
787 memset(&event.data, 0, sizeof(event.data));
788 event.data.fd = fd;
789
790 op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
791 ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
792 if (ret == -1 && errno != ENOENT) {
793 char strbuf[ISC_STRERRORSIZE];
794 strerror_r(errno, strbuf, sizeof(strbuf));
795 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
796 fd, strbuf);
797 result = ISC_R_UNEXPECTED;
798 }
799 return (result);
800 #elif defined(USE_DEVPOLL)
801 struct pollfd pfds[2];
802 size_t writelen = sizeof(pfds[0]);
803 int lockid = FDLOCK_ID(fd);
804
805 memset(pfds, 0, sizeof(pfds));
806 pfds[0].events = POLLREMOVE;
807 pfds[0].fd = fd;
808
809 /*
810 * Canceling read or write polling via /dev/poll is tricky. Since it
811 * only provides a way of canceling per FD, we may need to re-poll the
812 * socket for the other operation.
813 */
814 if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
815 pfds[1].events = POLLOUT;
816 pfds[1].fd = fd;
817 writelen += sizeof(pfds[1]);
818 }
819 if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
820 pfds[1].events = POLLIN;
821 pfds[1].fd = fd;
822 writelen += sizeof(pfds[1]);
823 }
824
825 if (write(thread->devpoll_fd, pfds, writelen) == -1) {
826 result = isc__errno2result(errno);
827 } else {
828 if (msg == SELECT_POKE_READ) {
829 thread->fdpollinfo[fd].want_read = 0;
830 } else {
831 thread->fdpollinfo[fd].want_write = 0;
832 }
833 }
834
835 return (result);
836 #elif defined(USE_SELECT)
837 LOCK(&thread->manager->lock);
838 if (msg == SELECT_POKE_READ) {
839 FD_CLR(fd, thread->read_fds);
840 } else if (msg == SELECT_POKE_WRITE) {
841 FD_CLR(fd, thread->write_fds);
842 }
843 UNLOCK(&thread->manager->lock);
844
845 return (result);
846 #endif /* ifdef USE_KQUEUE */
847 }
848
849 /*
850 * A poke message was received, perform a proper watch/unwatch
851 * on a fd provided
852 */
853 static void
wakeup_socket(isc__socketthread_t * thread,int fd,int msg)854 wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
855 isc_result_t result;
856 int lockid = FDLOCK_ID(fd);
857
858 /*
859 * This is a wakeup on a socket. If the socket is not in the
860 * process of being closed, start watching it for either reads
861 * or writes.
862 */
863
864 INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);
865
866 if (msg == SELECT_POKE_CLOSE) {
867 LOCK(&thread->fdlock[lockid]);
868 INSIST(thread->fdstate[fd] == CLOSE_PENDING);
869 thread->fdstate[fd] = CLOSED;
870 (void)unwatch_fd(thread, fd, SELECT_POKE_READ);
871 (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
872 (void)close(fd);
873 UNLOCK(&thread->fdlock[lockid]);
874 return;
875 }
876
877 LOCK(&thread->fdlock[lockid]);
878 if (thread->fdstate[fd] == CLOSE_PENDING) {
879 /*
880 * We accept (and ignore) any error from unwatch_fd() as we are
881 * closing the socket, hoping it doesn't leave dangling state in
882 * the kernel.
883 * Note that unwatch_fd() must be called after releasing the
884 * fdlock; otherwise it could cause deadlock due to a lock order
885 * reversal.
886 */
887 (void)unwatch_fd(thread, fd, SELECT_POKE_READ);
888 (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
889 UNLOCK(&thread->fdlock[lockid]);
890 return;
891 }
892 if (thread->fdstate[fd] != MANAGED) {
893 UNLOCK(&thread->fdlock[lockid]);
894 return;
895 }
896 UNLOCK(&thread->fdlock[lockid]);
897
898 /*
899 * Set requested bit.
900 */
901 result = watch_fd(thread, fd, msg);
902 if (result != ISC_R_SUCCESS) {
903 /*
904 * XXXJT: what should we do? Ignoring the failure of watching
905 * a socket will make the application dysfunctional, but there
906 * seems to be no reasonable recovery process.
907 */
908 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
909 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
910 "failed to start watching FD (%d): %s", fd,
911 isc_result_totext(result));
912 }
913 }
914
915 /*
916 * Poke the select loop when there is something for us to do.
917 * The write is required (by POSIX) to complete. That is, we
918 * will not get partial writes.
919 */
920 static void
select_poke(isc__socketmgr_t * mgr,int threadid,int fd,int msg)921 select_poke(isc__socketmgr_t *mgr, int threadid, int fd, int msg) {
922 int cc;
923 int buf[2];
924 char strbuf[ISC_STRERRORSIZE];
925
926 buf[0] = fd;
927 buf[1] = msg;
928
929 do {
930 cc = write(mgr->threads[threadid].pipe_fds[1], buf,
931 sizeof(buf));
932 #ifdef ENOSR
933 /*
934 * Treat ENOSR as EAGAIN but loop slowly as it is
935 * unlikely to clear fast.
936 */
937 if (cc < 0 && errno == ENOSR) {
938 sleep(1);
939 errno = EAGAIN;
940 }
941 #endif /* ifdef ENOSR */
942 } while (cc < 0 && SOFT_ERROR(errno));
943
944 if (cc < 0) {
945 strerror_r(errno, strbuf, sizeof(strbuf));
946 FATAL_ERROR(__FILE__, __LINE__,
947 "write() failed during watcher poke: %s", strbuf);
948 }
949
950 INSIST(cc == sizeof(buf));
951 }
952
953 /*
954 * Read a message on the internal fd.
955 */
956 static void
select_readmsg(isc__socketthread_t * thread,int * fd,int * msg)957 select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
958 int buf[2];
959 int cc;
960 char strbuf[ISC_STRERRORSIZE];
961
962 cc = read(thread->pipe_fds[0], buf, sizeof(buf));
963 if (cc < 0) {
964 *msg = SELECT_POKE_NOTHING;
965 *fd = -1; /* Silence compiler. */
966 if (SOFT_ERROR(errno)) {
967 return;
968 }
969
970 strerror_r(errno, strbuf, sizeof(strbuf));
971 FATAL_ERROR(__FILE__, __LINE__,
972 "read() failed during watcher poke: %s", strbuf);
973 }
974 INSIST(cc == sizeof(buf));
975
976 *fd = buf[0];
977 *msg = buf[1];
978 }
979
/*
 * Make a fd non-blocking, using either the FIONBIO ioctl or fcntl()
 * with PORT_NONBLOCK (presumably the platform's O_NONBLOCK equivalent
 * -- defined elsewhere) depending on the build configuration.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_UNEXPECTED (after logging) if the
 * system call fails.
 */
static isc_result_t
make_nonblock(int fd) {
	int ret;
	char strbuf[ISC_STRERRORSIZE];
#ifdef USE_FIONBIO_IOCTL
	int on = 1;
#else /* ifdef USE_FIONBIO_IOCTL */
	int flags;
#endif /* ifdef USE_FIONBIO_IOCTL */

#ifdef USE_FIONBIO_IOCTL
	ret = ioctl(fd, FIONBIO, (char *)&on);
#else /* ifdef USE_FIONBIO_IOCTL */
	/* Preserve the existing flags and add the non-blocking bit. */
	flags = fcntl(fd, F_GETFL, 0);
	flags |= PORT_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
#endif /* ifdef USE_FIONBIO_IOCTL */

	if (ret == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		/* The format string matches whichever call was made above. */
		UNEXPECTED_ERROR(__FILE__, __LINE__,
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else /* ifdef USE_FIONBIO_IOCTL */
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif /* ifdef USE_FIONBIO_IOCTL */
				 strbuf);

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}
1016
1017 #ifdef USE_CMSG
1018 /*
1019 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1020 * In order to ensure as much portability as possible, we provide wrapper
1021 * functions of these macros.
1022 * Note that cmsg_space() could run slow on OSes that do not have
1023 * CMSG_SPACE.
1024 */
/*
 * Portable CMSG_LEN(): the length of a cmsghdr (header plus any
 * alignment padding) carrying 'len' bytes of ancillary data.  On
 * platforms lacking CMSG_LEN, the header size is derived from the
 * offset CMSG_DATA yields when applied to a NULL cmsghdr pointer.
 */
static inline socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else /* ifdef CMSG_LEN */
	socklen_t hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif /* ifdef CMSG_LEN */
}
1040
/*
 * Portable CMSG_SPACE(): the number of control-buffer bytes consumed
 * by a cmsg carrying 'len' bytes of data, including trailing padding
 * before the next header.  On platforms lacking CMSG_SPACE this is
 * probed at runtime by laying a dummy cmsghdr in a scratch buffer and
 * asking CMSG_NXTHDR where the next header would begin; returns 0 if
 * the scratch buffer is too small for CMSG_NXTHDR to answer.
 */
static inline socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else /* ifdef CMSG_SPACE */
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	/* The distance to the next header is exactly the space used. */
	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL) {
		return ((char *)cmsgp - (char *)msg.msg_control);
	} else {
		return (0);
	}
#endif /* ifdef CMSG_SPACE */
}
1069 #endif /* USE_CMSG */
1070
1071 /*
1072 * Process control messages received on a socket.
1073 */
1074 static void
process_cmsg(isc__socket_t * sock,struct msghdr * msg,isc_socketevent_t * dev)1075 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1076 #ifdef USE_CMSG
1077 struct cmsghdr *cmsgp;
1078 struct in6_pktinfo *pktinfop;
1079 #ifdef SO_TIMESTAMP
1080 void *timevalp;
1081 #endif /* ifdef SO_TIMESTAMP */
1082 #endif /* ifdef USE_CMSG */
1083
1084 /*
1085 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1086 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1087 * They are all here, outside of the CPP tests, because it is
1088 * more consistent with the usual ISC coding style.
1089 */
1090 UNUSED(sock);
1091 UNUSED(msg);
1092 UNUSED(dev);
1093
1094 #ifdef MSG_TRUNC
1095 if ((msg->msg_flags & MSG_TRUNC) != 0) {
1096 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1097 }
1098 #endif /* ifdef MSG_TRUNC */
1099
1100 #ifdef MSG_CTRUNC
1101 if ((msg->msg_flags & MSG_CTRUNC) != 0) {
1102 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1103 }
1104 #endif /* ifdef MSG_CTRUNC */
1105
1106 #ifndef USE_CMSG
1107 return;
1108 #else /* ifndef USE_CMSG */
1109 if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
1110 return;
1111 }
1112
1113 #ifdef SO_TIMESTAMP
1114 timevalp = NULL;
1115 #endif /* ifdef SO_TIMESTAMP */
1116 pktinfop = NULL;
1117
1118 cmsgp = CMSG_FIRSTHDR(msg);
1119 while (cmsgp != NULL) {
1120 socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
1121
1122 if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
1123 cmsgp->cmsg_type == IPV6_PKTINFO) {
1124 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1125 memmove(&dev->pktinfo, pktinfop,
1126 sizeof(struct in6_pktinfo));
1127 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1128 socket_log(sock, NULL, TRACE,
1129 "interface received on ifindex %u",
1130 dev->pktinfo.ipi6_ifindex);
1131 if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
1132 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1133 }
1134 goto next;
1135 }
1136
1137 #ifdef SO_TIMESTAMP
1138 if (cmsgp->cmsg_level == SOL_SOCKET &&
1139 cmsgp->cmsg_type == SCM_TIMESTAMP) {
1140 struct timeval tv;
1141 timevalp = CMSG_DATA(cmsgp);
1142 memmove(&tv, timevalp, sizeof(tv));
1143 dev->timestamp.seconds = tv.tv_sec;
1144 dev->timestamp.nanoseconds = tv.tv_usec * 1000;
1145 dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1146 goto next;
1147 }
1148 #endif /* ifdef SO_TIMESTAMP */
1149
1150 #ifdef IPV6_TCLASS
1151 if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
1152 cmsgp->cmsg_type == IPV6_TCLASS) {
1153 dev->dscp = *(int *)CMSG_DATA(cmsgp);
1154 dev->dscp >>= 2;
1155 dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
1156 goto next;
1157 }
1158 #endif /* ifdef IPV6_TCLASS */
1159
1160 #ifdef IP_TOS
1161 if (cmsgp->cmsg_level == IPPROTO_IP &&
1162 (cmsgp->cmsg_type == IP_TOS
1163 #ifdef IP_RECVTOS
1164 || cmsgp->cmsg_type == IP_RECVTOS
1165 #endif /* ifdef IP_RECVTOS */
1166 ))
1167 {
1168 dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
1169 dev->dscp >>= 2;
1170 dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
1171 goto next;
1172 }
1173 #endif /* ifdef IP_TOS */
1174 next:
1175 cmsgp = CMSG_NXTHDR(msg, cmsgp);
1176 }
1177 #endif /* USE_CMSG */
1178 }
1179
1180 /*
1181 * Construct an iov array and attach it to the msghdr passed in. This is
1182 * the SEND constructor, which will use the used region of the buffer
1183 * (if using a buffer list) or will use the internal region (if a single
1184 * buffer I/O is requested).
1185 *
1186 * Nothing can be NULL, and the done event must list at least one buffer
1187 * on the buffer linked list for this function to be meaningful.
1188 *
1189 * If write_countp != NULL, *write_countp will hold the number of bytes
1190 * this transaction can send.
1191 */
1192 static void
build_msghdr_send(isc__socket_t * sock,char * cmsgbuf,isc_socketevent_t * dev,struct msghdr * msg,struct iovec * iov,size_t * write_countp)1193 build_msghdr_send(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
1194 struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
1195 unsigned int iovcount;
1196 size_t write_count;
1197 struct cmsghdr *cmsgp;
1198
1199 memset(msg, 0, sizeof(*msg));
1200
1201 if (!sock->connected) {
1202 msg->msg_name = (void *)&dev->address.type.sa;
1203 msg->msg_namelen = dev->address.length;
1204 } else {
1205 msg->msg_name = NULL;
1206 msg->msg_namelen = 0;
1207 }
1208
1209 write_count = dev->region.length - dev->n;
1210 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1211 iov[0].iov_len = write_count;
1212 iovcount = 1;
1213
1214 msg->msg_iov = iov;
1215 msg->msg_iovlen = iovcount;
1216 msg->msg_control = NULL;
1217 msg->msg_controllen = 0;
1218 msg->msg_flags = 0;
1219 #if defined(USE_CMSG)
1220
1221 if ((sock->type == isc_sockettype_udp) &&
1222 ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
1223 {
1224 struct in6_pktinfo *pktinfop;
1225
1226 socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
1227 dev->pktinfo.ipi6_ifindex);
1228
1229 msg->msg_control = (void *)cmsgbuf;
1230 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1231 INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1232
1233 cmsgp = (struct cmsghdr *)cmsgbuf;
1234 cmsgp->cmsg_level = IPPROTO_IPV6;
1235 cmsgp->cmsg_type = IPV6_PKTINFO;
1236 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1237 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1238 memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1239 }
1240
1241 #if defined(IPV6_USE_MIN_MTU)
1242 if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
1243 ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
1244 {
1245 int use_min_mtu = 1; /* -1, 0, 1 */
1246
1247 cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
1248 msg->msg_control = (void *)cmsgbuf;
1249 msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1250 INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1251
1252 cmsgp->cmsg_level = IPPROTO_IPV6;
1253 cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1254 cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1255 memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1256 }
1257 #endif /* if defined(IPV6_USE_MIN_MTU) */
1258
1259 if (isc_dscp_check_value > -1) {
1260 if (sock->type == isc_sockettype_udp) {
1261 INSIST((int)dev->dscp == isc_dscp_check_value);
1262 } else if (sock->type == isc_sockettype_tcp) {
1263 INSIST((int)sock->dscp == isc_dscp_check_value);
1264 }
1265 }
1266
1267 #if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
1268 if ((sock->type == isc_sockettype_udp) &&
1269 ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
1270 {
1271 int dscp = (dev->dscp << 2) & 0xff;
1272
1273 INSIST(dev->dscp < 0x40);
1274
1275 #ifdef IP_TOS
1276 if (sock->pf == AF_INET && sock->pktdscp) {
1277 cmsgp = (struct cmsghdr *)(cmsgbuf +
1278 msg->msg_controllen);
1279 msg->msg_control = (void *)cmsgbuf;
1280 msg->msg_controllen += cmsg_space(sizeof(dscp));
1281 INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1282
1283 cmsgp->cmsg_level = IPPROTO_IP;
1284 cmsgp->cmsg_type = IP_TOS;
1285 cmsgp->cmsg_len = cmsg_len(sizeof(char));
1286 *(unsigned char *)CMSG_DATA(cmsgp) = dscp;
1287 } else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
1288 if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
1289 (void *)&dscp, sizeof(int)) < 0) {
1290 char strbuf[ISC_STRERRORSIZE];
1291 strerror_r(errno, strbuf, sizeof(strbuf));
1292 UNEXPECTED_ERROR(__FILE__, __LINE__,
1293 "setsockopt(%d, IP_TOS, %.02x)"
1294 " failed: %s",
1295 sock->fd, dscp >> 2, strbuf);
1296 } else {
1297 sock->dscp = dscp;
1298 }
1299 }
1300 #endif /* ifdef IP_TOS */
1301 #if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
1302 if (sock->pf == AF_INET6 && sock->pktdscp) {
1303 cmsgp = (struct cmsghdr *)(cmsgbuf +
1304 msg->msg_controllen);
1305 msg->msg_control = (void *)cmsgbuf;
1306 msg->msg_controllen += cmsg_space(sizeof(dscp));
1307 INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1308
1309 cmsgp->cmsg_level = IPPROTO_IPV6;
1310 cmsgp->cmsg_type = IPV6_TCLASS;
1311 cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
1312 memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
1313 } else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
1314 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
1315 (void *)&dscp, sizeof(int)) < 0)
1316 {
1317 char strbuf[ISC_STRERRORSIZE];
1318 strerror_r(errno, strbuf, sizeof(strbuf));
1319 UNEXPECTED_ERROR(__FILE__, __LINE__,
1320 "setsockopt(%d, IPV6_TCLASS, "
1321 "%.02x) failed: %s",
1322 sock->fd, dscp >> 2, strbuf);
1323 } else {
1324 sock->dscp = dscp;
1325 }
1326 }
1327 #endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
1328 if (msg->msg_controllen != 0 &&
1329 msg->msg_controllen < SENDCMSGBUFLEN) {
1330 memset(cmsgbuf + msg->msg_controllen, 0,
1331 SENDCMSGBUFLEN - msg->msg_controllen);
1332 }
1333 }
1334 #endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
1335 * defined(IPV6_TCLASS)) \
1336 * */
1337 #endif /* USE_CMSG */
1338
1339 if (write_countp != NULL) {
1340 *write_countp = write_count;
1341 }
1342 }
1343
1344 /*
1345 * Construct an iov array and attach it to the msghdr passed in. This is
1346 * the RECV constructor, which will use the available region of the buffer
1347 * (if using a buffer list) or will use the internal region (if a single
1348 * buffer I/O is requested).
1349 *
1350 * Nothing can be NULL, and the done event must list at least one buffer
1351 * on the buffer linked list for this function to be meaningful.
1352 *
1353 * If read_countp != NULL, *read_countp will hold the number of bytes
1354 * this transaction can receive.
1355 */
1356 static void
build_msghdr_recv(isc__socket_t * sock,char * cmsgbuf,isc_socketevent_t * dev,struct msghdr * msg,struct iovec * iov,size_t * read_countp)1357 build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
1358 struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
1359 unsigned int iovcount;
1360 size_t read_count;
1361
1362 memset(msg, 0, sizeof(struct msghdr));
1363
1364 if (sock->type == isc_sockettype_udp) {
1365 memset(&dev->address, 0, sizeof(dev->address));
1366 msg->msg_name = (void *)&dev->address.type.sa;
1367 msg->msg_namelen = sizeof(dev->address.type);
1368 } else { /* TCP */
1369 msg->msg_name = NULL;
1370 msg->msg_namelen = 0;
1371 dev->address = sock->peer_address;
1372 }
1373
1374 read_count = dev->region.length - dev->n;
1375 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1376 iov[0].iov_len = read_count;
1377 iovcount = 1;
1378
1379 /*
1380 * If needed, set up to receive that one extra byte.
1381 */
1382 #ifdef ISC_PLATFORM_RECVOVERFLOW
1383 if (sock->type == isc_sockettype_udp) {
1384 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1385 iov[iovcount].iov_base = (void *)(&sock->overflow);
1386 iov[iovcount].iov_len = 1;
1387 iovcount++;
1388 }
1389 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
1390
1391 msg->msg_iov = iov;
1392 msg->msg_iovlen = iovcount;
1393
1394 #if defined(USE_CMSG)
1395 msg->msg_control = cmsgbuf;
1396 msg->msg_controllen = RECVCMSGBUFLEN;
1397 #else /* if defined(USE_CMSG) */
1398 msg->msg_control = NULL;
1399 msg->msg_controllen = 0;
1400 #endif /* USE_CMSG */
1401 msg->msg_flags = 0;
1402
1403 if (read_countp != NULL) {
1404 *read_countp = read_count;
1405 }
1406 }
1407
1408 static void
set_dev_address(const isc_sockaddr_t * address,isc__socket_t * sock,isc_socketevent_t * dev)1409 set_dev_address(const isc_sockaddr_t *address, isc__socket_t *sock,
1410 isc_socketevent_t *dev) {
1411 if (sock->type == isc_sockettype_udp) {
1412 if (address != NULL) {
1413 dev->address = *address;
1414 } else {
1415 dev->address = sock->peer_address;
1416 }
1417 } else if (sock->type == isc_sockettype_tcp) {
1418 INSIST(address == NULL);
1419 dev->address = sock->peer_address;
1420 }
1421 }
1422
1423 static void
destroy_socketevent(isc_event_t * event)1424 destroy_socketevent(isc_event_t *event) {
1425 isc_socketevent_t *ev = (isc_socketevent_t *)event;
1426
1427 (ev->destroy)(event);
1428 }
1429
1430 static isc_socketevent_t *
allocate_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)1431 allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
1432 isc_taskaction_t action, void *arg) {
1433 isc_socketevent_t *ev;
1434
1435 ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
1436 action, arg, sizeof(*ev));
1437
1438 ev->result = ISC_R_UNSET;
1439 ISC_LINK_INIT(ev, ev_link);
1440 ev->region.base = NULL;
1441 ev->n = 0;
1442 ev->offset = 0;
1443 ev->attributes = 0;
1444 ev->destroy = ev->ev_destroy;
1445 ev->ev_destroy = destroy_socketevent;
1446 ev->dscp = 0;
1447
1448 return (ev);
1449 }
1450
1451 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debug helper: print the layout of a msghdr (name, iovecs, control
 * buffer) to stdout.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) {
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base,
		       (long)msg->msg_iov[i].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
1466 #endif /* if defined(ISC_SOCKET_DEBUG) */
1467
1468 #define DOIO_SUCCESS 0 /* i/o ok, event sent */
1469 #define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */
1470 #define DOIO_HARD 2 /* i/o error, event sent */
1471 #define DOIO_EOF 3 /* EOF, no event sent */
1472
/*
 * Perform one recvmsg() on 'sock', accounting the result in 'dev'.
 *
 * Returns:
 *	DOIO_SUCCESS	read completed (or acceptable partial);
 *			dev->result is ISC_R_SUCCESS
 *	DOIO_SOFT	retryable condition (soft errno, short read below
 *			dev->minimum, dropped packet); no event posted
 *	DOIO_HARD	hard I/O error; dev->result holds the error
 *	DOIO_EOF	zero-length read on a stream socket
 */
static int
doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Snapshot errno before any later call can clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * Map errno values to return codes.  SOFT_OR_HARD is hard only for a
 * connected socket; an unconnected UDP socket can see ICMP-derived
 * errors for unrelated peers, which must not kill the pending read.
 * (ALWAYS_HARD is defined for symmetry with doio_send but unused here.)
 */
#define SOFT_OR_HARD(_system, _isc)                                            \
	if (recv_errno == _system) {                                           \
		if (sock->connected) {                                         \
			dev->result = _isc;                                    \
			inc_stats(sock->manager->stats,                        \
				  sock->statsindex[STATID_RECVFAIL]);          \
			return (DOIO_HARD);                                    \
		}                                                              \
		return (DOIO_SOFT);                                            \
	}
#define ALWAYS_HARD(_system, _isc)                                             \
	if (recv_errno == _system) {                                           \
		dev->result = _isc;                                            \
		inc_stats(sock->manager->stats,                                \
			  sock->statsindex[STATID_RECVFAIL]);                  \
		return (DOIO_HARD);                                            \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Source port 0 is bogus; drop the datagram silently. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp) {
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection. If we received MORE bytes than we should,
	 * this indicates an overflow situation. Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1624
1625 /*
1626 * Returns:
1627 * DOIO_SUCCESS The operation succeeded. dev->result contains
1628 * ISC_R_SUCCESS.
1629 *
1630 * DOIO_HARD A hard or unexpected I/O error was encountered.
1631 * dev->result contains the appropriate error.
1632 *
1633 * DOIO_SOFT A soft I/O error was encountered. No senddone
1634 * event was sent. The operation should be retried.
1635 *
1636 * No other return values are possible.
1637 */
1638 static int
doio_send(isc__socket_t * sock,isc_socketevent_t * dev)1639 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1640 int cc;
1641 struct iovec iov[MAXSCATTERGATHER_SEND];
1642 size_t write_count;
1643 struct msghdr msghdr;
1644 char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1645 int attempts = 0;
1646 int send_errno;
1647 char strbuf[ISC_STRERRORSIZE];
1648 char cmsgbuf[SENDCMSGBUFLEN] = { 0 };
1649
1650 build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1651
1652 resend:
1653 if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
1654 write_count > sock->manager->maxudp)
1655 {
1656 cc = write_count;
1657 } else {
1658 cc = sendmsg(sock->fd, &msghdr, 0);
1659 }
1660 send_errno = errno;
1661
1662 /*
1663 * Check for error or block condition.
1664 */
1665 if (cc < 0) {
1666 if (send_errno == EINTR && ++attempts < NRETRIES) {
1667 goto resend;
1668 }
1669
1670 if (SOFT_ERROR(send_errno)) {
1671 if (errno == EWOULDBLOCK || errno == EAGAIN) {
1672 dev->result = ISC_R_WOULDBLOCK;
1673 }
1674 return (DOIO_SOFT);
1675 }
1676
1677 #define SOFT_OR_HARD(_system, _isc) \
1678 if (send_errno == _system) { \
1679 if (sock->connected) { \
1680 dev->result = _isc; \
1681 inc_stats(sock->manager->stats, \
1682 sock->statsindex[STATID_SENDFAIL]); \
1683 return (DOIO_HARD); \
1684 } \
1685 return (DOIO_SOFT); \
1686 }
1687 #define ALWAYS_HARD(_system, _isc) \
1688 if (send_errno == _system) { \
1689 dev->result = _isc; \
1690 inc_stats(sock->manager->stats, \
1691 sock->statsindex[STATID_SENDFAIL]); \
1692 return (DOIO_HARD); \
1693 }
1694
1695 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1696 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1697 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1698 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1699 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1700 #ifdef EHOSTDOWN
1701 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1702 #endif /* ifdef EHOSTDOWN */
1703 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1704 SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
1705 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1706 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1707 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1708
1709 #undef SOFT_OR_HARD
1710 #undef ALWAYS_HARD
1711
1712 /*
1713 * The other error types depend on whether or not the
1714 * socket is UDP or TCP. If it is UDP, some errors
1715 * that we expect to be fatal under TCP are merely
1716 * annoying, and are really soft errors.
1717 *
1718 * However, these soft errors are still returned as
1719 * a status.
1720 */
1721 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1722 strerror_r(send_errno, strbuf, sizeof(strbuf));
1723 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1724 addrbuf, strbuf);
1725 dev->result = isc__errno2result(send_errno);
1726 inc_stats(sock->manager->stats,
1727 sock->statsindex[STATID_SENDFAIL]);
1728 return (DOIO_HARD);
1729 }
1730
1731 if (cc == 0) {
1732 inc_stats(sock->manager->stats,
1733 sock->statsindex[STATID_SENDFAIL]);
1734 UNEXPECTED_ERROR(__FILE__, __LINE__,
1735 "doio_send: send() returned 0");
1736 }
1737
1738 /*
1739 * If we write less than we expected, update counters, poke.
1740 */
1741 dev->n += cc;
1742 if ((size_t)cc != write_count) {
1743 return (DOIO_SOFT);
1744 }
1745
1746 /*
1747 * Exactly what we wanted to write. We're done with this
1748 * entry. Post its completion event.
1749 */
1750 dev->result = ISC_R_SUCCESS;
1751 return (DOIO_SUCCESS);
1752 }
1753
1754 /*
1755 * Kill.
1756 *
1757 * Caller must ensure that the socket is not locked and no external
1758 * references exist.
1759 */
1760 static void
socketclose(isc__socketthread_t * thread,isc__socket_t * sock,int fd)1761 socketclose(isc__socketthread_t *thread, isc__socket_t *sock, int fd) {
1762 int lockid = FDLOCK_ID(fd);
1763 /*
1764 * No one has this socket open, so the watcher doesn't have to be
1765 * poked, and the socket doesn't have to be locked.
1766 */
1767 LOCK(&thread->fdlock[lockid]);
1768 thread->fds[fd] = NULL;
1769 thread->fdstate[fd] = CLOSE_PENDING;
1770 UNLOCK(&thread->fdlock[lockid]);
1771 select_poke(thread->manager, thread->threadid, fd, SELECT_POKE_CLOSE);
1772
1773 inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);
1774
1775 LOCK(&sock->lock);
1776 if (sock->active == 1) {
1777 dec_stats(thread->manager->stats,
1778 sock->statsindex[STATID_ACTIVE]);
1779 sock->active = 0;
1780 }
1781 UNLOCK(&sock->lock);
1782
1783 /*
1784 * update manager->maxfd here (XXX: this should be implemented more
1785 * efficiently)
1786 */
1787 #ifdef USE_SELECT
1788 LOCK(&thread->manager->lock);
1789 if (thread->maxfd == fd) {
1790 int i;
1791
1792 thread->maxfd = 0;
1793 for (i = fd - 1; i >= 0; i--) {
1794 lockid = FDLOCK_ID(i);
1795
1796 LOCK(&thread->fdlock[lockid]);
1797 if (thread->fdstate[i] == MANAGED) {
1798 thread->maxfd = i;
1799 UNLOCK(&thread->fdlock[lockid]);
1800 break;
1801 }
1802 UNLOCK(&thread->fdlock[lockid]);
1803 }
1804 if (thread->maxfd < thread->pipe_fds[0]) {
1805 thread->maxfd = thread->pipe_fds[0];
1806 }
1807 }
1808
1809 UNLOCK(&thread->manager->lock);
1810 #endif /* USE_SELECT */
1811 }
1812
1813 static void
destroy(isc__socket_t ** sockp)1814 destroy(isc__socket_t **sockp) {
1815 int fd = 0;
1816 isc__socket_t *sock = *sockp;
1817 isc__socketmgr_t *manager = sock->manager;
1818 isc__socketthread_t *thread = NULL;
1819
1820 socket_log(sock, NULL, CREATION, "destroying");
1821
1822 isc_refcount_destroy(&sock->references);
1823
1824 LOCK(&sock->lock);
1825 INSIST(ISC_LIST_EMPTY(sock->connect_list));
1826 INSIST(ISC_LIST_EMPTY(sock->accept_list));
1827 INSIST(ISC_LIST_EMPTY(sock->recv_list));
1828 INSIST(ISC_LIST_EMPTY(sock->send_list));
1829 INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1830
1831 if (sock->fd >= 0) {
1832 fd = sock->fd;
1833 thread = &manager->threads[sock->threadid];
1834 sock->fd = -1;
1835 sock->threadid = -1;
1836 }
1837 UNLOCK(&sock->lock);
1838
1839 if (fd > 0) {
1840 socketclose(thread, sock, fd);
1841 }
1842
1843 LOCK(&manager->lock);
1844
1845 ISC_LIST_UNLINK(manager->socklist, sock, link);
1846
1847 if (ISC_LIST_EMPTY(manager->socklist)) {
1848 SIGNAL(&manager->shutdown_ok);
1849 }
1850
1851 /* can't unlock manager as its memory context is still used */
1852 free_socket(sockp);
1853
1854 UNLOCK(&manager->lock);
1855 }
1856
1857 static isc_result_t
allocate_socket(isc__socketmgr_t * manager,isc_sockettype_t type,isc__socket_t ** socketp)1858 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1859 isc__socket_t **socketp) {
1860 isc__socket_t *sock;
1861
1862 sock = isc_mem_get(manager->mctx, sizeof(*sock));
1863
1864 sock->common.magic = 0;
1865 sock->common.impmagic = 0;
1866 isc_refcount_init(&sock->references, 0);
1867
1868 sock->manager = manager;
1869 sock->type = type;
1870 sock->fd = -1;
1871 sock->threadid = -1;
1872 sock->dscp = 0; /* TOS/TCLASS is zero until set. */
1873 sock->dupped = 0;
1874 sock->statsindex = NULL;
1875 sock->active = 0;
1876
1877 ISC_LINK_INIT(sock, link);
1878
1879 memset(sock->name, 0, sizeof(sock->name));
1880 sock->tag = NULL;
1881
1882 /*
1883 * Set up list of readers and writers to be initially empty.
1884 */
1885 ISC_LIST_INIT(sock->recv_list);
1886 ISC_LIST_INIT(sock->send_list);
1887 ISC_LIST_INIT(sock->accept_list);
1888 ISC_LIST_INIT(sock->connect_list);
1889
1890 sock->listener = 0;
1891 sock->connected = 0;
1892 sock->connecting = 0;
1893 sock->bound = 0;
1894 sock->pktdscp = 0;
1895
1896 /*
1897 * Initialize the lock.
1898 */
1899 isc_mutex_init(&sock->lock);
1900
1901 sock->common.magic = ISCAPI_SOCKET_MAGIC;
1902 sock->common.impmagic = SOCKET_MAGIC;
1903 *socketp = sock;
1904
1905 return (ISC_R_SUCCESS);
1906 }
1907
1908 /*
1909 * This event requires that the various lists be empty, that the reference
1910 * count be 1, and that the magic number is valid. The other socket bits,
1911 * like the lock, must be initialized as well. The fd associated must be
1912 * marked as closed, by setting it to -1 on close, or this routine will
1913 * also close the socket.
1914 */
1915 static void
free_socket(isc__socket_t ** socketp)1916 free_socket(isc__socket_t **socketp) {
1917 isc__socket_t *sock = *socketp;
1918 *socketp = NULL;
1919
1920 INSIST(VALID_SOCKET(sock));
1921 isc_refcount_destroy(&sock->references);
1922 LOCK(&sock->lock);
1923 INSIST(!sock->connecting);
1924 INSIST(ISC_LIST_EMPTY(sock->recv_list));
1925 INSIST(ISC_LIST_EMPTY(sock->send_list));
1926 INSIST(ISC_LIST_EMPTY(sock->accept_list));
1927 INSIST(ISC_LIST_EMPTY(sock->connect_list));
1928 INSIST(!ISC_LINK_LINKED(sock, link));
1929 UNLOCK(&sock->lock);
1930
1931 sock->common.magic = 0;
1932 sock->common.impmagic = 0;
1933
1934 isc_mutex_destroy(&sock->lock);
1935
1936 isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1937 }
1938
1939 #if defined(SET_RCVBUF)
/* Run-once guard and the SO_RCVBUF size we will try to apply. */
static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
static int rcvbuf = ISC_RECV_BUFFER_SIZE;

/*
 * Probe, on a throwaway UDP socket, the largest SO_RCVBUF value up to
 * ISC_RECV_BUFFER_SIZE that the kernel will accept, and leave it in
 * 'rcvbuf' for later sockets to use.  Invoked via rcvbuf_once.  The
 * 'again' loop binary-searches between the kernel's current value
 * (min) and our target (max) when setsockopt() rejects the full size.
 *
 * NOTE(review): unlike set_sndbuf(), the AF_INET6 fallback here is not
 * guarded by ISC_PLATFORM_HAVEIPV6 — confirm whether that guard was
 * dropped intentionally.
 */
static void
set_rcvbuf(void) {
	int fd;
	int max = rcvbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	/* Only grow the buffer; if the kernel default is bigger, keep it. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
	    min < rcvbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
			       sizeof(rcvbuf)) == -1)
		{
			if (errno == ENOBUFS && rcvbuf > min) {
				/* Too big: halve the search interval. */
				max = rcvbuf - 1;
				rcvbuf = (rcvbuf + min) / 2;
				goto again;
			} else {
				rcvbuf = min;
				goto cleanup;
			}
		} else {
			min = rcvbuf;
		}
		if (min != max) {
			/* Accepted: try pushing toward the upper bound. */
			rcvbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_RCVBUF) */
1996
1997 #if defined(SET_SNDBUF)
/* Run-once guard and the SO_SNDBUF size we will try to apply. */
static isc_once_t sndbuf_once = ISC_ONCE_INIT;
static int sndbuf = ISC_SEND_BUFFER_SIZE;

/*
 * Probe, on a throwaway UDP socket, the largest SO_SNDBUF value up to
 * ISC_SEND_BUFFER_SIZE that the kernel will accept, and leave it in
 * 'sndbuf' for later sockets to use.  Invoked via sndbuf_once.  The
 * 'again' loop binary-searches between the kernel's current value
 * (min) and our target (max) when setsockopt() rejects the full size.
 */
static void
set_sndbuf(void) {
	int fd;
	int max = sndbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
#if defined(ISC_PLATFORM_HAVEIPV6)
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
#endif /* if defined(ISC_PLATFORM_HAVEIPV6) */
	if (fd == -1) {
		return;
	}

	/* Only grow the buffer; if the kernel default is bigger, keep it. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
	    min < sndbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno == ENOBUFS && sndbuf > min) {
				/* Too big: halve the search interval. */
				max = sndbuf - 1;
				sndbuf = (sndbuf + min) / 2;
				goto again;
			} else {
				sndbuf = min;
				goto cleanup;
			}
		} else {
			min = sndbuf;
		}
		if (min != max) {
			/* Accepted: try pushing toward the upper bound. */
			sndbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_SNDBUF) */
2056
/*
 * Constrain an IPv6 socket to the minimum IPv6 MTU (1280), via
 * IPV6_USE_MIN_MTU and/or an explicit IPV6_MTU of 1280, whichever the
 * platform provides.  Failures are deliberately ignored (best effort).
 * No-op for non-IPv6 sockets or platforms with neither option.
 */
static void
use_min_mtu(isc__socket_t *sock) {
#if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
	UNUSED(sock);
#endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
#ifdef IPV6_USE_MIN_MTU
	/* use minimum MTU */
	if (sock->pf == AF_INET6) {
		int on = 1;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				 (void *)&on, sizeof(on));
	}
#endif /* ifdef IPV6_USE_MIN_MTU */
#if defined(IPV6_MTU)
	/*
	 * Pin the path MTU to 1280, the IPv6 minimum, on platforms
	 * offering the IPV6_MTU option.
	 */
	if (sock->pf == AF_INET6) {
		int mtu = 1280;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
				 sizeof(mtu));
	}
#endif /* if defined(IPV6_MTU) */
}
2081
2082 static void
set_tcp_maxseg(isc__socket_t * sock,int size)2083 set_tcp_maxseg(isc__socket_t *sock, int size) {
2084 #ifdef TCP_MAXSEG
2085 if (sock->type == isc_sockettype_tcp) {
2086 (void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
2087 (void *)&size, sizeof(size));
2088 }
2089 #endif /* ifdef TCP_MAXSEG */
2090 }
2091
2092 static isc_result_t
opensocket(isc__socketmgr_t * manager,isc__socket_t * sock,isc__socket_t * dup_socket)2093 opensocket(isc__socketmgr_t *manager, isc__socket_t *sock,
2094 isc__socket_t *dup_socket) {
2095 isc_result_t result;
2096 char strbuf[ISC_STRERRORSIZE];
2097 const char *err = "socket";
2098 int tries = 0;
2099 #if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
2100 int on = 1;
2101 #endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
2102 #if defined(SET_RCVBUF) || defined(SET_SNDBUF)
2103 socklen_t optlen;
2104 int size = 0;
2105 #endif
2106
2107 again:
2108 if (dup_socket == NULL) {
2109 switch (sock->type) {
2110 case isc_sockettype_udp:
2111 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2112 break;
2113 case isc_sockettype_tcp:
2114 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2115 break;
2116 case isc_sockettype_unix:
2117 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2118 break;
2119 case isc_sockettype_raw:
2120 errno = EPFNOSUPPORT;
2121 /*
2122 * PF_ROUTE is a alias for PF_NETLINK on linux.
2123 */
2124 #if defined(PF_ROUTE)
2125 if (sock->fd == -1 && sock->pf == PF_ROUTE) {
2126 #ifdef NETLINK_ROUTE
2127 sock->fd = socket(sock->pf, SOCK_RAW,
2128 NETLINK_ROUTE);
2129 #else /* ifdef NETLINK_ROUTE */
2130 sock->fd = socket(sock->pf, SOCK_RAW, 0);
2131 #endif /* ifdef NETLINK_ROUTE */
2132 if (sock->fd != -1) {
2133 #ifdef NETLINK_ROUTE
2134 struct sockaddr_nl sa;
2135 int n;
2136
2137 /*
2138 * Do an implicit bind.
2139 */
2140 memset(&sa, 0, sizeof(sa));
2141 sa.nl_family = AF_NETLINK;
2142 sa.nl_groups = RTMGRP_IPV4_IFADDR |
2143 RTMGRP_IPV6_IFADDR;
2144 n = bind(sock->fd,
2145 (struct sockaddr *)&sa,
2146 sizeof(sa));
2147 if (n < 0) {
2148 close(sock->fd);
2149 sock->fd = -1;
2150 }
2151 #endif /* ifdef NETLINK_ROUTE */
2152 sock->bound = 1;
2153 }
2154 }
2155 #endif /* if defined(PF_ROUTE) */
2156 break;
2157 }
2158 } else {
2159 sock->fd = dup(dup_socket->fd);
2160 sock->dupped = 1;
2161 sock->bound = dup_socket->bound;
2162 }
2163 if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
2164 goto again;
2165 }
2166
2167 #ifdef F_DUPFD
2168 /*
2169 * Leave a space for stdio and TCP to work in.
2170 */
2171 if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2172 sock->fd >= 0 && sock->fd < manager->reserved)
2173 {
2174 int newfd, tmp;
2175 newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
2176 tmp = errno;
2177 (void)close(sock->fd);
2178 errno = tmp;
2179 sock->fd = newfd;
2180 err = "isc_socket_create: fcntl/reserved";
2181 } else if (sock->fd >= 0 && sock->fd < 20) {
2182 int newfd, tmp;
2183 newfd = fcntl(sock->fd, F_DUPFD, 20);
2184 tmp = errno;
2185 (void)close(sock->fd);
2186 errno = tmp;
2187 sock->fd = newfd;
2188 err = "isc_socket_create: fcntl";
2189 }
2190 #endif /* ifdef F_DUPFD */
2191
2192 if (sock->fd >= (int)manager->maxsocks) {
2193 (void)close(sock->fd);
2194 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2195 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2196 "socket: file descriptor exceeds limit (%d/%u)",
2197 sock->fd, manager->maxsocks);
2198 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2199 return (ISC_R_NORESOURCES);
2200 }
2201
2202 if (sock->fd < 0) {
2203 switch (errno) {
2204 case EMFILE:
2205 case ENFILE:
2206 strerror_r(errno, strbuf, sizeof(strbuf));
2207 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2208 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2209 "%s: %s", err, strbuf);
2210 /* fallthrough */
2211 case ENOBUFS:
2212 inc_stats(manager->stats,
2213 sock->statsindex[STATID_OPENFAIL]);
2214 return (ISC_R_NORESOURCES);
2215
2216 case EPROTONOSUPPORT:
2217 case EPFNOSUPPORT:
2218 case EAFNOSUPPORT:
2219 /*
2220 * Linux 2.2 (and maybe others) return EINVAL instead of
2221 * EAFNOSUPPORT.
2222 */
2223 case EINVAL:
2224 inc_stats(manager->stats,
2225 sock->statsindex[STATID_OPENFAIL]);
2226 return (ISC_R_FAMILYNOSUPPORT);
2227
2228 default:
2229 strerror_r(errno, strbuf, sizeof(strbuf));
2230 UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
2231 err, strbuf);
2232 inc_stats(manager->stats,
2233 sock->statsindex[STATID_OPENFAIL]);
2234 return (ISC_R_UNEXPECTED);
2235 }
2236 }
2237
2238 if (dup_socket != NULL) {
2239 goto setup_done;
2240 }
2241
2242 result = make_nonblock(sock->fd);
2243 if (result != ISC_R_SUCCESS) {
2244 (void)close(sock->fd);
2245 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2246 return (result);
2247 }
2248
2249 #ifdef SO_NOSIGPIPE
2250 if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
2251 sizeof(on)) < 0) {
2252 strerror_r(errno, strbuf, sizeof(strbuf));
2253 UNEXPECTED_ERROR(__FILE__, __LINE__,
2254 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
2255 sock->fd, strbuf);
2256 /* Press on... */
2257 }
2258 #endif /* ifdef SO_NOSIGPIPE */
2259
2260 /*
2261 * Use minimum mtu if possible.
2262 */
2263 if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
2264 use_min_mtu(sock);
2265 set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
2266 }
2267
2268 #if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
2269 if (sock->type == isc_sockettype_udp) {
2270 #if defined(USE_CMSG)
2271 #if defined(SO_TIMESTAMP)
2272 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
2273 sizeof(on)) < 0 &&
2274 errno != ENOPROTOOPT)
2275 {
2276 strerror_r(errno, strbuf, sizeof(strbuf));
2277 UNEXPECTED_ERROR(__FILE__, __LINE__,
2278 "setsockopt(%d, SO_TIMESTAMP) failed: "
2279 "%s",
2280 sock->fd, strbuf);
2281 /* Press on... */
2282 }
2283 #endif /* SO_TIMESTAMP */
2284
2285 #ifdef IPV6_RECVPKTINFO
2286 /* RFC 3542 */
2287 if ((sock->pf == AF_INET6) &&
2288 (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2289 (void *)&on, sizeof(on)) < 0))
2290 {
2291 strerror_r(errno, strbuf, sizeof(strbuf));
2292 UNEXPECTED_ERROR(__FILE__, __LINE__,
2293 "setsockopt(%d, IPV6_RECVPKTINFO) "
2294 "failed: %s",
2295 sock->fd, strbuf);
2296 }
2297 #else /* ifdef IPV6_RECVPKTINFO */
2298 /* RFC 2292 */
2299 if ((sock->pf == AF_INET6) &&
2300 (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2301 (void *)&on, sizeof(on)) < 0))
2302 {
2303 strerror_r(errno, strbuf, sizeof(strbuf));
2304 UNEXPECTED_ERROR(__FILE__, __LINE__,
2305 "setsockopt(%d, IPV6_PKTINFO) failed: "
2306 "%s",
2307 sock->fd, strbuf);
2308 }
2309 #endif /* IPV6_RECVPKTINFO */
2310 #if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
2311 /*
2312 * Turn off Path MTU discovery on IPv6/UDP sockets.
2313 */
2314 if (sock->pf == AF_INET6) {
2315 int action = IPV6_PMTUDISC_DONT;
2316 (void)setsockopt(sock->fd, IPPROTO_IPV6,
2317 IPV6_MTU_DISCOVER, &action,
2318 sizeof(action));
2319 }
2320 #endif /* if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT) */
2321 #endif /* defined(USE_CMSG) */
2322
2323 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2324 /*
2325 * Turn off Path MTU discovery on IPv4/UDP sockets.
2326 * Prefer IP_PMTUDISC_OMIT over IP_PMTUDISC_DONT
2327 * if it available.
2328 */
2329 if (sock->pf == AF_INET) {
2330 int action;
2331 #if defined(IP_PMTUDISC_OMIT)
2332 action = IP_PMTUDISC_OMIT;
2333 if (setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2334 &action, sizeof(action)) < 0)
2335 {
2336 #endif /* if defined(IP_PMTUDISC_OMIT) */
2337 action = IP_PMTUDISC_DONT;
2338 (void)setsockopt(sock->fd, IPPROTO_IP,
2339 IP_MTU_DISCOVER, &action,
2340 sizeof(action));
2341 #if defined(IP_PMTUDISC_OMIT)
2342 }
2343 #endif /* if defined(IP_PMTUDISC_OMIT) */
2344 }
2345 #endif /* if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT) */
2346 #if defined(IP_DONTFRAG)
2347 /*
2348 * Turn off Path MTU discovery on IPv4/UDP sockets.
2349 */
2350 if (sock->pf == AF_INET) {
2351 int off = 0;
2352 (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2353 &off, sizeof(off));
2354 }
2355 #endif /* if defined(IP_DONTFRAG) */
2356
2357 #if defined(SET_RCVBUF)
2358 optlen = sizeof(size);
2359 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
2360 &optlen) == 0 &&
2361 size < rcvbuf)
2362 {
2363 RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
2364 ISC_R_SUCCESS);
2365 if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2366 (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
2367 {
2368 strerror_r(errno, strbuf, sizeof(strbuf));
2369 UNEXPECTED_ERROR(__FILE__, __LINE__,
2370 "setsockopt(%d, SO_RCVBUF, "
2371 "%d) failed: %s",
2372 sock->fd, rcvbuf, strbuf);
2373 }
2374 }
2375 #endif /* if defined(SET_RCVBUF) */
2376
2377 #if defined(SET_SNDBUF)
2378 optlen = sizeof(size);
2379 if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
2380 &optlen) == 0 &&
2381 size < sndbuf)
2382 {
2383 RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
2384 ISC_R_SUCCESS);
2385 if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
2386 (void *)&sndbuf, sizeof(sndbuf)) == -1)
2387 {
2388 strerror_r(errno, strbuf, sizeof(strbuf));
2389 UNEXPECTED_ERROR(__FILE__, __LINE__,
2390 "setsockopt(%d, SO_SNDBUF, "
2391 "%d) failed: %s",
2392 sock->fd, sndbuf, strbuf);
2393 }
2394 }
2395 #endif /* if defined(SO_SNDBUF) */
2396 }
2397 #ifdef IPV6_RECVTCLASS
2398 if ((sock->pf == AF_INET6) &&
2399 (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
2400 sizeof(on)) < 0))
2401 {
2402 strerror_r(errno, strbuf, sizeof(strbuf));
2403 UNEXPECTED_ERROR(__FILE__, __LINE__,
2404 "setsockopt(%d, IPV6_RECVTCLASS) "
2405 "failed: %s",
2406 sock->fd, strbuf);
2407 }
2408 #endif /* ifdef IPV6_RECVTCLASS */
2409 #ifdef IP_RECVTOS
2410 if ((sock->pf == AF_INET) &&
2411 (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
2412 sizeof(on)) < 0))
2413 {
2414 strerror_r(errno, strbuf, sizeof(strbuf));
2415 UNEXPECTED_ERROR(__FILE__, __LINE__,
2416 "setsockopt(%d, IP_RECVTOS) "
2417 "failed: %s",
2418 sock->fd, strbuf);
2419 }
2420 #endif /* ifdef IP_RECVTOS */
2421 #endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */
2422
2423 setup_done:
2424 inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2425 if (sock->active == 0) {
2426 inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
2427 sock->active = 1;
2428 }
2429
2430 return (ISC_R_SUCCESS);
2431 }
2432
2433 /*
2434 * Create a 'type' socket or duplicate an existing socket, managed
2435 * by 'manager'. Events will be posted to 'task' and when dispatched
2436 * 'action' will be called with 'arg' as the arg value. The new
2437 * socket is returned in 'socketp'.
2438 */
2439 static isc_result_t
socket_create(isc_socketmgr_t * manager0,int pf,isc_sockettype_t type,isc_socket_t ** socketp,isc_socket_t * dup_socket)2440 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2441 isc_socket_t **socketp, isc_socket_t *dup_socket) {
2442 isc__socket_t *sock = NULL;
2443 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2444 isc__socketthread_t *thread;
2445 isc_result_t result;
2446 int lockid;
2447
2448 REQUIRE(VALID_MANAGER(manager));
2449 REQUIRE(socketp != NULL && *socketp == NULL);
2450
2451 result = allocate_socket(manager, type, &sock);
2452 if (result != ISC_R_SUCCESS) {
2453 return (result);
2454 }
2455
2456 switch (sock->type) {
2457 case isc_sockettype_udp:
2458 sock->statsindex = (pf == AF_INET) ? udp4statsindex
2459 : udp6statsindex;
2460 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
2461 sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
2462 break;
2463 case isc_sockettype_tcp:
2464 sock->statsindex = (pf == AF_INET) ? tcp4statsindex
2465 : tcp6statsindex;
2466 break;
2467 case isc_sockettype_unix:
2468 sock->statsindex = unixstatsindex;
2469 break;
2470 case isc_sockettype_raw:
2471 sock->statsindex = rawstatsindex;
2472 break;
2473 default:
2474 INSIST(0);
2475 ISC_UNREACHABLE();
2476 }
2477
2478 sock->pf = pf;
2479
2480 result = opensocket(manager, sock, (isc__socket_t *)dup_socket);
2481 if (result != ISC_R_SUCCESS) {
2482 free_socket(&sock);
2483 return (result);
2484 }
2485
2486 if (sock->fd == -1) {
2487 abort();
2488 }
2489 sock->threadid = gen_threadid(sock);
2490 isc_refcount_increment0(&sock->references);
2491 thread = &manager->threads[sock->threadid];
2492 *socketp = (isc_socket_t *)sock;
2493
2494 /*
2495 * Note we don't have to lock the socket like we normally would because
2496 * there are no external references to it yet.
2497 */
2498
2499 lockid = FDLOCK_ID(sock->fd);
2500 LOCK(&thread->fdlock[lockid]);
2501 thread->fds[sock->fd] = sock;
2502 thread->fdstate[sock->fd] = MANAGED;
2503 #if defined(USE_EPOLL)
2504 thread->epoll_events[sock->fd] = 0;
2505 #endif /* if defined(USE_EPOLL) */
2506 #ifdef USE_DEVPOLL
2507 INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
2508 thread->fdpollinfo[sock->fd].want_write == 0);
2509 #endif /* ifdef USE_DEVPOLL */
2510 UNLOCK(&thread->fdlock[lockid]);
2511
2512 LOCK(&manager->lock);
2513 ISC_LIST_APPEND(manager->socklist, sock, link);
2514 #ifdef USE_SELECT
2515 if (thread->maxfd < sock->fd) {
2516 thread->maxfd = sock->fd;
2517 }
2518 #endif /* ifdef USE_SELECT */
2519 UNLOCK(&manager->lock);
2520
2521 socket_log(sock, NULL, CREATION,
2522 dup_socket != NULL ? "dupped" : "created");
2523
2524 return (ISC_R_SUCCESS);
2525 }
2526
2527 /*%
2528 * Create a new 'type' socket managed by 'manager'. Events
2529 * will be posted to 'task' and when dispatched 'action' will be
2530 * called with 'arg' as the arg value. The new socket is returned
2531 * in 'socketp'.
2532 */
2533 isc_result_t
isc_socket_create(isc_socketmgr_t * manager0,int pf,isc_sockettype_t type,isc_socket_t ** socketp)2534 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2535 isc_socket_t **socketp) {
2536 return (socket_create(manager0, pf, type, socketp, NULL));
2537 }
2538
2539 /*%
2540 * Duplicate an existing socket. The new socket is returned
2541 * in 'socketp'.
2542 */
2543 isc_result_t
isc_socket_dup(isc_socket_t * sock0,isc_socket_t ** socketp)2544 isc_socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) {
2545 isc__socket_t *sock = (isc__socket_t *)sock0;
2546
2547 REQUIRE(VALID_SOCKET(sock));
2548 REQUIRE(socketp != NULL && *socketp == NULL);
2549
2550 return (socket_create((isc_socketmgr_t *)sock->manager, sock->pf,
2551 sock->type, socketp, sock0));
2552 }
2553
2554 isc_result_t
isc_socket_open(isc_socket_t * sock0)2555 isc_socket_open(isc_socket_t *sock0) {
2556 isc_result_t result;
2557 isc__socket_t *sock = (isc__socket_t *)sock0;
2558 isc__socketthread_t *thread;
2559
2560 REQUIRE(VALID_SOCKET(sock));
2561
2562 LOCK(&sock->lock);
2563
2564 REQUIRE(isc_refcount_current(&sock->references) >= 1);
2565 REQUIRE(sock->fd == -1);
2566 REQUIRE(sock->threadid == -1);
2567
2568 result = opensocket(sock->manager, sock, NULL);
2569
2570 UNLOCK(&sock->lock);
2571
2572 if (result != ISC_R_SUCCESS) {
2573 sock->fd = -1;
2574 } else {
2575 sock->threadid = gen_threadid(sock);
2576 thread = &sock->manager->threads[sock->threadid];
2577 int lockid = FDLOCK_ID(sock->fd);
2578
2579 LOCK(&thread->fdlock[lockid]);
2580 thread->fds[sock->fd] = sock;
2581 thread->fdstate[sock->fd] = MANAGED;
2582 #if defined(USE_EPOLL)
2583 thread->epoll_events[sock->fd] = 0;
2584 #endif /* if defined(USE_EPOLL) */
2585 #ifdef USE_DEVPOLL
2586 INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
2587 thread->fdpollinfo[sock->fd].want_write == 0);
2588 #endif /* ifdef USE_DEVPOLL */
2589 UNLOCK(&thread->fdlock[lockid]);
2590
2591 #ifdef USE_SELECT
2592 LOCK(&sock->manager->lock);
2593 if (thread->maxfd < sock->fd) {
2594 thread->maxfd = sock->fd;
2595 }
2596 UNLOCK(&sock->manager->lock);
2597 #endif /* ifdef USE_SELECT */
2598 }
2599
2600 return (result);
2601 }
2602
2603 /*
2604 * Attach to a socket. Caller must explicitly detach when it is done.
2605 */
2606 void
isc_socket_attach(isc_socket_t * sock0,isc_socket_t ** socketp)2607 isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
2608 isc__socket_t *sock = (isc__socket_t *)sock0;
2609
2610 REQUIRE(VALID_SOCKET(sock));
2611 REQUIRE(socketp != NULL && *socketp == NULL);
2612
2613 int old_refs = isc_refcount_increment(&sock->references);
2614 REQUIRE(old_refs > 0);
2615
2616 *socketp = (isc_socket_t *)sock;
2617 }
2618
2619 /*
2620 * Dereference a socket. If this is the last reference to it, clean things
2621 * up by destroying the socket.
2622 */
2623 void
isc_socket_detach(isc_socket_t ** socketp)2624 isc_socket_detach(isc_socket_t **socketp) {
2625 isc__socket_t *sock;
2626
2627 REQUIRE(socketp != NULL);
2628 sock = (isc__socket_t *)*socketp;
2629 REQUIRE(VALID_SOCKET(sock));
2630 if (isc_refcount_decrement(&sock->references) == 1) {
2631 destroy(&sock);
2632 }
2633
2634 *socketp = NULL;
2635 }
2636
2637 isc_result_t
isc_socket_close(isc_socket_t * sock0)2638 isc_socket_close(isc_socket_t *sock0) {
2639 isc__socket_t *sock = (isc__socket_t *)sock0;
2640 int fd;
2641 isc__socketmgr_t *manager;
2642 isc__socketthread_t *thread;
2643 fflush(stdout);
2644 REQUIRE(VALID_SOCKET(sock));
2645
2646 LOCK(&sock->lock);
2647
2648 REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2649
2650 INSIST(!sock->connecting);
2651 INSIST(ISC_LIST_EMPTY(sock->recv_list));
2652 INSIST(ISC_LIST_EMPTY(sock->send_list));
2653 INSIST(ISC_LIST_EMPTY(sock->accept_list));
2654 INSIST(ISC_LIST_EMPTY(sock->connect_list));
2655
2656 manager = sock->manager;
2657 thread = &manager->threads[sock->threadid];
2658 fd = sock->fd;
2659 sock->fd = -1;
2660 sock->threadid = -1;
2661
2662 sock->dupped = 0;
2663 memset(sock->name, 0, sizeof(sock->name));
2664 sock->tag = NULL;
2665 sock->listener = 0;
2666 sock->connected = 0;
2667 sock->connecting = 0;
2668 sock->bound = 0;
2669 isc_sockaddr_any(&sock->peer_address);
2670
2671 UNLOCK(&sock->lock);
2672
2673 socketclose(thread, sock, fd);
2674
2675 return (ISC_R_SUCCESS);
2676 }
2677
2678 /*
2679 * Dequeue an item off the given socket's read queue, set the result code
2680 * in the done event to the one provided, and send it to the task it was
2681 * destined for.
2682 *
2683 * If the event to be sent is on a list, remove it before sending. If
2684 * asked to, send and detach from the socket as well.
2685 *
2686 * Caller must have the socket locked if the event is attached to the socket.
2687 */
2688 static void
send_recvdone_event(isc__socket_t * sock,isc_socketevent_t ** dev)2689 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
2690 isc_task_t *task;
2691
2692 task = (*dev)->ev_sender;
2693
2694 (*dev)->ev_sender = sock;
2695
2696 if (ISC_LINK_LINKED(*dev, ev_link)) {
2697 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2698 }
2699
2700 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2701 isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2702 sock->threadid);
2703 } else {
2704 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2705 }
2706 }
2707
2708 /*
2709 * See comments for send_recvdone_event() above.
2710 *
2711 * Caller must have the socket locked if the event is attached to the socket.
2712 */
2713 static void
send_senddone_event(isc__socket_t * sock,isc_socketevent_t ** dev)2714 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
2715 isc_task_t *task;
2716
2717 INSIST(dev != NULL && *dev != NULL);
2718
2719 task = (*dev)->ev_sender;
2720 (*dev)->ev_sender = sock;
2721
2722 if (ISC_LINK_LINKED(*dev, ev_link)) {
2723 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2724 }
2725
2726 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2727 isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2728 sock->threadid);
2729 } else {
2730 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2731 }
2732 }
2733
2734 /*
2735 * See comments for send_recvdone_event() above.
2736 *
2737 * Caller must have the socket locked if the event is attached to the socket.
2738 */
2739 static void
send_connectdone_event(isc__socket_t * sock,isc_socket_connev_t ** dev)2740 send_connectdone_event(isc__socket_t *sock, isc_socket_connev_t **dev) {
2741 isc_task_t *task;
2742
2743 INSIST(dev != NULL && *dev != NULL);
2744
2745 task = (*dev)->ev_sender;
2746 (*dev)->ev_sender = sock;
2747
2748 if (ISC_LINK_LINKED(*dev, ev_link)) {
2749 ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
2750 }
2751
2752 isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
2753 }
2754
2755 /*
2756 * Call accept() on a socket, to get the new file descriptor. The listen
2757 * socket is used as a prototype to create a new isc_socket_t. The new
2758 * socket has one outstanding reference. The task receiving the event
2759 * will be detached from just after the event is delivered.
2760 *
2761 * On entry to this function, the event delivered is the internal
2762 * readable event, and the first item on the accept_list should be
2763 * the done event we want to send. If the list is empty, this is a no-op,
2764 * so just unlock and return.
2765 */
2766 static void
internal_accept(isc__socket_t * sock)2767 internal_accept(isc__socket_t *sock) {
2768 isc__socketmgr_t *manager;
2769 isc__socketthread_t *thread, *nthread;
2770 isc_socket_newconnev_t *dev;
2771 isc_task_t *task;
2772 socklen_t addrlen;
2773 int fd;
2774 isc_result_t result = ISC_R_SUCCESS;
2775 char strbuf[ISC_STRERRORSIZE];
2776 const char *err = "accept";
2777
2778 INSIST(VALID_SOCKET(sock));
2779 REQUIRE(sock->fd >= 0);
2780
2781 socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");
2782
2783 manager = sock->manager;
2784 INSIST(VALID_MANAGER(manager));
2785 thread = &manager->threads[sock->threadid];
2786
2787 INSIST(sock->listener);
2788
2789 /*
2790 * Get the first item off the accept list.
2791 * If it is empty, unlock the socket and return.
2792 */
2793 dev = ISC_LIST_HEAD(sock->accept_list);
2794 if (dev == NULL) {
2795 unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
2796 UNLOCK(&sock->lock);
2797 return;
2798 }
2799
2800 /*
2801 * Try to accept the new connection. If the accept fails with
2802 * EAGAIN or EINTR, simply poke the watcher to watch this socket
2803 * again. Also ignore ECONNRESET, which has been reported to
2804 * be spuriously returned on Linux 2.2.19 although it is not
2805 * a documented error for accept(). ECONNABORTED has been
2806 * reported for Solaris 8. The rest are thrown in not because
2807 * we have seen them but because they are ignored by other
2808 * daemons such as BIND 8 and Apache.
2809 */
2810
2811 addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
2812 memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
2813 fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
2814 (void *)&addrlen);
2815
2816 #ifdef F_DUPFD
2817 /*
2818 * Leave a space for stdio to work in.
2819 */
2820 if (fd >= 0 && fd < 20) {
2821 int newfd, tmp;
2822 newfd = fcntl(fd, F_DUPFD, 20);
2823 tmp = errno;
2824 (void)close(fd);
2825 errno = tmp;
2826 fd = newfd;
2827 err = "accept/fcntl";
2828 }
2829 #endif /* ifdef F_DUPFD */
2830
2831 if (fd < 0) {
2832 if (SOFT_ERROR(errno)) {
2833 goto soft_error;
2834 }
2835 switch (errno) {
2836 case ENFILE:
2837 case EMFILE:
2838 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2839 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2840 "%s: too many open file descriptors",
2841 err);
2842 goto soft_error;
2843
2844 case ENOBUFS:
2845 case ENOMEM:
2846 case ECONNRESET:
2847 case ECONNABORTED:
2848 case EHOSTUNREACH:
2849 case EHOSTDOWN:
2850 case ENETUNREACH:
2851 case ENETDOWN:
2852 case ECONNREFUSED:
2853 #ifdef EPROTO
2854 case EPROTO:
2855 #endif /* ifdef EPROTO */
2856 #ifdef ENONET
2857 case ENONET:
2858 #endif /* ifdef ENONET */
2859 goto soft_error;
2860 default:
2861 break;
2862 }
2863 strerror_r(errno, strbuf, sizeof(strbuf));
2864 UNEXPECTED_ERROR(__FILE__, __LINE__,
2865 "internal_accept: %s() failed: %s", err,
2866 strbuf);
2867 fd = -1;
2868 result = ISC_R_UNEXPECTED;
2869 } else {
2870 if (addrlen == 0U) {
2871 UNEXPECTED_ERROR(__FILE__, __LINE__,
2872 "internal_accept(): "
2873 "accept() failed to return "
2874 "remote address");
2875
2876 (void)close(fd);
2877 goto soft_error;
2878 } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
2879 sock->pf) {
2880 UNEXPECTED_ERROR(
2881 __FILE__, __LINE__,
2882 "internal_accept(): "
2883 "accept() returned peer address "
2884 "family %u (expected %u)",
2885 NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
2886 sock->pf);
2887 (void)close(fd);
2888 goto soft_error;
2889 } else if (fd >= (int)manager->maxsocks) {
2890 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2891 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2892 "accept: file descriptor exceeds limit "
2893 "(%d/%u)",
2894 fd, manager->maxsocks);
2895 (void)close(fd);
2896 goto soft_error;
2897 }
2898 }
2899
2900 if (fd != -1) {
2901 NEWCONNSOCK(dev)->peer_address.length = addrlen;
2902 NEWCONNSOCK(dev)->pf = sock->pf;
2903 }
2904
2905 /*
2906 * Pull off the done event.
2907 */
2908 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2909
2910 /*
2911 * Poke watcher if there are more pending accepts.
2912 */
2913 if (ISC_LIST_EMPTY(sock->accept_list)) {
2914 unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
2915 }
2916
2917 if (fd != -1) {
2918 result = make_nonblock(fd);
2919 if (result != ISC_R_SUCCESS) {
2920 (void)close(fd);
2921 fd = -1;
2922 }
2923 }
2924
2925 /*
2926 * We need to unlock sock->lock now to be able to lock manager->lock
2927 * without risking a deadlock with xmlstats.
2928 */
2929 UNLOCK(&sock->lock);
2930
2931 /*
2932 * -1 means the new socket didn't happen.
2933 */
2934 if (fd != -1) {
2935 int lockid = FDLOCK_ID(fd);
2936
2937 NEWCONNSOCK(dev)->fd = fd;
2938 NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
2939 NEWCONNSOCK(dev)->bound = 1;
2940 NEWCONNSOCK(dev)->connected = 1;
2941 nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];
2942
2943 /*
2944 * We already hold a lock on one fdlock in accepting thread,
2945 * we need to make sure that we don't double lock.
2946 */
2947 bool same_bucket = (sock->threadid ==
2948 NEWCONNSOCK(dev)->threadid) &&
2949 (FDLOCK_ID(sock->fd) == lockid);
2950
2951 /*
2952 * Use minimum mtu if possible.
2953 */
2954 use_min_mtu(NEWCONNSOCK(dev));
2955 set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);
2956
2957 /*
2958 * Ensure DSCP settings are inherited across accept.
2959 */
2960 setdscp(NEWCONNSOCK(dev), sock->dscp);
2961
2962 /*
2963 * Save away the remote address
2964 */
2965 dev->address = NEWCONNSOCK(dev)->peer_address;
2966
2967 if (NEWCONNSOCK(dev)->active == 0) {
2968 inc_stats(manager->stats,
2969 NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
2970 NEWCONNSOCK(dev)->active = 1;
2971 }
2972
2973 if (!same_bucket) {
2974 LOCK(&nthread->fdlock[lockid]);
2975 }
2976 nthread->fds[fd] = NEWCONNSOCK(dev);
2977 nthread->fdstate[fd] = MANAGED;
2978 #if defined(USE_EPOLL)
2979 nthread->epoll_events[fd] = 0;
2980 #endif /* if defined(USE_EPOLL) */
2981 if (!same_bucket) {
2982 UNLOCK(&nthread->fdlock[lockid]);
2983 }
2984
2985 LOCK(&manager->lock);
2986
2987 #ifdef USE_SELECT
2988 if (nthread->maxfd < fd) {
2989 nthread->maxfd = fd;
2990 }
2991 #endif /* ifdef USE_SELECT */
2992
2993 socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
2994 "accepted connection, new socket %p",
2995 dev->newsocket);
2996
2997 ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
2998
2999 UNLOCK(&manager->lock);
3000
3001 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3002 } else {
3003 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3004 (void)isc_refcount_decrement(&NEWCONNSOCK(dev)->references);
3005 free_socket((isc__socket_t **)&dev->newsocket);
3006 }
3007
3008 /*
3009 * Fill in the done event details and send it off.
3010 */
3011 dev->result = result;
3012 task = dev->ev_sender;
3013 dev->ev_sender = sock;
3014
3015 isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
3016 return;
3017
3018 soft_error:
3019 watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
3020 UNLOCK(&sock->lock);
3021
3022 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3023 return;
3024 }
3025
3026 static void
internal_recv(isc__socket_t * sock)3027 internal_recv(isc__socket_t *sock) {
3028 isc_socketevent_t *dev;
3029
3030 INSIST(VALID_SOCKET(sock));
3031 REQUIRE(sock->fd >= 0);
3032
3033 dev = ISC_LIST_HEAD(sock->recv_list);
3034 if (dev == NULL) {
3035 goto finish;
3036 }
3037
3038 socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
3039 dev, dev->ev_sender);
3040
3041 /*
3042 * Try to do as much I/O as possible on this socket. There are no
3043 * limits here, currently.
3044 */
3045 while (dev != NULL) {
3046 switch (doio_recv(sock, dev)) {
3047 case DOIO_SOFT:
3048 goto finish;
3049
3050 case DOIO_EOF:
3051 /*
3052 * read of 0 means the remote end was closed.
3053 * Run through the event queue and dispatch all
3054 * the events with an EOF result code.
3055 */
3056 do {
3057 dev->result = ISC_R_EOF;
3058 send_recvdone_event(sock, &dev);
3059 dev = ISC_LIST_HEAD(sock->recv_list);
3060 } while (dev != NULL);
3061 goto finish;
3062
3063 case DOIO_SUCCESS:
3064 case DOIO_HARD:
3065 send_recvdone_event(sock, &dev);
3066 break;
3067 }
3068
3069 dev = ISC_LIST_HEAD(sock->recv_list);
3070 }
3071
3072 finish:
3073 if (ISC_LIST_EMPTY(sock->recv_list)) {
3074 unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3075 SELECT_POKE_READ);
3076 }
3077 UNLOCK(&sock->lock);
3078 }
3079
3080 static void
internal_send(isc__socket_t * sock)3081 internal_send(isc__socket_t *sock) {
3082 isc_socketevent_t *dev;
3083
3084 INSIST(VALID_SOCKET(sock));
3085 REQUIRE(sock->fd >= 0);
3086
3087 dev = ISC_LIST_HEAD(sock->send_list);
3088 if (dev == NULL) {
3089 goto finish;
3090 }
3091 socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev,
3092 dev->ev_sender);
3093
3094 /*
3095 * Try to do as much I/O as possible on this socket. There are no
3096 * limits here, currently.
3097 */
3098 while (dev != NULL) {
3099 switch (doio_send(sock, dev)) {
3100 case DOIO_SOFT:
3101 goto finish;
3102
3103 case DOIO_HARD:
3104 case DOIO_SUCCESS:
3105 send_senddone_event(sock, &dev);
3106 break;
3107 }
3108
3109 dev = ISC_LIST_HEAD(sock->send_list);
3110 }
3111
3112 finish:
3113 if (ISC_LIST_EMPTY(sock->send_list)) {
3114 unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3115 SELECT_POKE_WRITE);
3116 }
3117 UNLOCK(&sock->lock);
3118 }
3119
3120 /*
3121 * Process read/writes on each fd here. Avoid locking
3122 * and unlocking twice if both reads and writes are possible.
3123 */
3124 static void
process_fd(isc__socketthread_t * thread,int fd,bool readable,bool writeable)3125 process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
3126 isc__socket_t *sock;
3127 int lockid = FDLOCK_ID(fd);
3128
3129 /*
3130 * If the socket is going to be closed, don't do more I/O.
3131 */
3132 LOCK(&thread->fdlock[lockid]);
3133 if (thread->fdstate[fd] == CLOSE_PENDING) {
3134 UNLOCK(&thread->fdlock[lockid]);
3135
3136 (void)unwatch_fd(thread, fd, SELECT_POKE_READ);
3137 (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
3138 return;
3139 }
3140
3141 sock = thread->fds[fd];
3142 if (sock == NULL) {
3143 UNLOCK(&thread->fdlock[lockid]);
3144 return;
3145 }
3146
3147 LOCK(&sock->lock);
3148
3149 if (sock->fd < 0) {
3150 /*
3151 * Sock is being closed - the final external reference
3152 * is gone but it was not yet removed from event loop
3153 * and fdstate[]/fds[] as destroy() is waiting on
3154 * thread->fdlock[lockid] or sock->lock that we're holding.
3155 * Just release the locks and bail.
3156 */
3157 UNLOCK(&sock->lock);
3158 UNLOCK(&thread->fdlock[lockid]);
3159 return;
3160 }
3161
3162 REQUIRE(readable || writeable);
3163 if (readable) {
3164 if (sock->listener) {
3165 internal_accept(sock);
3166 } else {
3167 internal_recv(sock);
3168 }
3169 }
3170
3171 if (writeable) {
3172 if (sock->connecting) {
3173 internal_connect(sock);
3174 } else {
3175 internal_send(sock);
3176 }
3177 }
3178
3179 /* sock->lock is unlocked in internal_* function */
3180 UNLOCK(&thread->fdlock[lockid]);
3181
3182 /*
3183 * Socket destruction might be pending, it will resume
3184 * after releasing fdlock and sock->lock.
3185 */
3186 }
3187
/*
 * process_fds is different for different event loops
 * it takes the events from event loops and for each FD
 * launches process_fd.
 * The kqueue/epoll/devpoll variants return true when a shutdown
 * request was read from the control pipe; the select variant
 * returns void (the caller checks the control fd itself).
 */
#ifdef USE_KQUEUE
static bool
process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
	int i;
	bool readable, writable;
	bool done = false;
	bool have_ctlevent = false;
	if (nevents == thread->nevents) {
		/*
		 * This is not an error, but something unexpected.  If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < thread->manager->maxsocks);
		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
			/* Control pipe: handled after all socket events. */
			have_ctlevent = true;
			continue;
		}
		/* kqueue reports read and write readiness as separate
		 * events with distinct filters. */
		readable = (events[i].filter == EVFILT_READ);
		writable = (events[i].filter == EVFILT_WRITE);
		process_fd(thread, events[i].ident, readable, writable);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_EPOLL)
static bool
process_fds(isc__socketthread_t *thread, struct epoll_event *events,
	    int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	if (nevents == thread->nevents) {
		/* Event buffer filled; ISC_SOCKET_MAXEVENTS may be low. */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
		if (events[i].data.fd == thread->pipe_fds[0]) {
			/* Control pipe: handled after all socket events. */
			have_ctlevent = true;
			continue;
		}
		if ((events[i].events & EPOLLERR) != 0 ||
		    (events[i].events & EPOLLHUP) != 0) {
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway.  This is a
			 * bit inefficient, but should be okay for such rare
			 * events.  Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			int fd = events[i].data.fd;
			events[i].events |= thread->epoll_events[fd];
		}
		process_fd(thread, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_DEVPOLL)
static bool
process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	if (nevents == thread->nevents) {
		/* Event buffer filled; ISC_SOCKET_MAXEVENTS may be low. */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
		if (events[i].fd == thread->pipe_fds[0]) {
			/* Control pipe: handled after all socket events. */
			have_ctlevent = true;
			continue;
		}
		process_fd(thread, events[i].fd,
			   (events[i].events & POLLIN) != 0,
			   (events[i].events & POLLOUT) != 0);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_SELECT)
static void
process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
	    fd_set *writefds) {
	int i;

	REQUIRE(maxfd <= (int)thread->manager->maxsocks);

	for (i = 0; i < maxfd; i++) {
		if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
			/* The control pipe is handled by the caller. */
			continue;
		}
		process_fd(thread, i, FD_ISSET(i, readfds),
			   FD_ISSET(i, writefds));
	}
}
#endif /* ifdef USE_KQUEUE */
3321
3322 static bool
process_ctlfd(isc__socketthread_t * thread)3323 process_ctlfd(isc__socketthread_t *thread) {
3324 int msg, fd;
3325
3326 for (;;) {
3327 select_readmsg(thread, &fd, &msg);
3328
3329 thread_log(thread, IOEVENT,
3330 "watcher got message %d for socket %d", msg, fd);
3331
3332 /*
3333 * Nothing to read?
3334 */
3335 if (msg == SELECT_POKE_NOTHING) {
3336 break;
3337 }
3338
3339 /*
3340 * Handle shutdown message. We really should
3341 * jump out of this loop right away, but
3342 * it doesn't matter if we have to do a little
3343 * more work first.
3344 */
3345 if (msg == SELECT_POKE_SHUTDOWN) {
3346 return (true);
3347 }
3348
3349 /*
3350 * This is a wakeup on a socket. Look
3351 * at the event queue for both read and write,
3352 * and decide if we need to watch on it now
3353 * or not.
3354 */
3355 wakeup_socket(thread, fd, msg);
3356 }
3357
3358 return (false);
3359 }
3360
/*
 * This is the thread that will loop forever, always in a select or poll
 * call.
 *
 * When select returns something to do, do whatever's necessary and post
 * an event to the task that was requesting the action.
 */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc__socketmgr_t *manager = thread->manager;
	/* NOTE(review): 'manager' is used below unconditionally, so this
	 * (void) cast appears redundant — kept as-is. */
	(void)manager;
	bool done;
	int cc;
	/* Pin the thread to a CPU only when there are multiple watchers. */
	if (manager->nthreads > 1) {
		isc_thread_setaffinity(thread->threadid);
	}
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	while (!done) {
		/* Inner loop: retry the wait call on soft errors (EINTR). */
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					/* Conservative fallback limit. */
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			/* Second pass only runs after an EINVAL re-probe. */
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped.  Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			/* Track whether DP_POLL timeouts look spurious. */
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}
3527
3528 void
isc_socketmgr_setreserved(isc_socketmgr_t * manager0,uint32_t reserved)3529 isc_socketmgr_setreserved(isc_socketmgr_t *manager0, uint32_t reserved) {
3530 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3531
3532 REQUIRE(VALID_MANAGER(manager));
3533
3534 manager->reserved = reserved;
3535 }
3536
3537 void
isc_socketmgr_maxudp(isc_socketmgr_t * manager0,unsigned int maxudp)3538 isc_socketmgr_maxudp(isc_socketmgr_t *manager0, unsigned int maxudp) {
3539 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3540
3541 REQUIRE(VALID_MANAGER(manager));
3542
3543 manager->maxudp = maxudp;
3544 }
3545
3546 /*
3547 * Setup socket thread, thread->manager and thread->threadid must be filled.
3548 */
3549
3550 static isc_result_t
setup_thread(isc__socketthread_t * thread)3551 setup_thread(isc__socketthread_t *thread) {
3552 isc_result_t result = ISC_R_SUCCESS;
3553 int i;
3554 char strbuf[ISC_STRERRORSIZE];
3555
3556 REQUIRE(thread != NULL);
3557 REQUIRE(VALID_MANAGER(thread->manager));
3558 REQUIRE(thread->threadid >= 0 &&
3559 thread->threadid < thread->manager->nthreads);
3560
3561 thread->fds = isc_mem_get(thread->manager->mctx,
3562 thread->manager->maxsocks *
3563 sizeof(isc__socket_t *));
3564
3565 memset(thread->fds, 0,
3566 thread->manager->maxsocks * sizeof(isc_socket_t *));
3567
3568 thread->fdstate = isc_mem_get(thread->manager->mctx,
3569 thread->manager->maxsocks * sizeof(int));
3570
3571 memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
3572
3573 thread->fdlock = isc_mem_get(thread->manager->mctx,
3574 FDLOCK_COUNT * sizeof(isc_mutex_t));
3575
3576 for (i = 0; i < FDLOCK_COUNT; i++) {
3577 isc_mutex_init(&thread->fdlock[i]);
3578 }
3579
3580 if (pipe(thread->pipe_fds) != 0) {
3581 strerror_r(errno, strbuf, sizeof(strbuf));
3582 UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
3583 strbuf);
3584 return (ISC_R_UNEXPECTED);
3585 }
3586 RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
3587
3588 #ifdef USE_KQUEUE
3589 thread->nevents = ISC_SOCKET_MAXEVENTS;
3590 thread->events = isc_mem_get(thread->manager->mctx,
3591 sizeof(struct kevent) * thread->nevents);
3592
3593 thread->kqueue_fd = kqueue();
3594 if (thread->kqueue_fd == -1) {
3595 result = isc__errno2result(errno);
3596 strerror_r(errno, strbuf, sizeof(strbuf));
3597 UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
3598 strbuf);
3599 isc_mem_put(thread->manager->mctx, thread->events,
3600 sizeof(struct kevent) * thread->nevents);
3601 return (result);
3602 }
3603
3604 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3605 if (result != ISC_R_SUCCESS) {
3606 close(thread->kqueue_fd);
3607 isc_mem_put(thread->manager->mctx, thread->events,
3608 sizeof(struct kevent) * thread->nevents);
3609 }
3610 return (result);
3611
3612 #elif defined(USE_EPOLL)
3613 thread->nevents = ISC_SOCKET_MAXEVENTS;
3614 thread->epoll_events =
3615 isc_mem_get(thread->manager->mctx,
3616 (thread->manager->maxsocks * sizeof(uint32_t)));
3617
3618 memset(thread->epoll_events, 0,
3619 thread->manager->maxsocks * sizeof(uint32_t));
3620
3621 thread->events =
3622 isc_mem_get(thread->manager->mctx,
3623 sizeof(struct epoll_event) * thread->nevents);
3624
3625 thread->epoll_fd = epoll_create(thread->nevents);
3626 if (thread->epoll_fd == -1) {
3627 result = isc__errno2result(errno);
3628 strerror_r(errno, strbuf, sizeof(strbuf));
3629 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
3630 strbuf);
3631 return (result);
3632 }
3633
3634 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3635 return (result);
3636
3637 #elif defined(USE_DEVPOLL)
3638 thread->nevents = ISC_SOCKET_MAXEVENTS;
3639 result = isc_resource_getcurlimit(isc_resource_openfiles,
3640 &thread->open_max);
3641 if (result != ISC_R_SUCCESS) {
3642 thread->open_max = 64;
3643 }
3644 thread->calls = 0;
3645 thread->events = isc_mem_get(thread->manager->mctx,
3646 sizeof(struct pollfd) * thread->nevents);
3647
3648 /*
3649 * Note: fdpollinfo should be able to support all possible FDs, so
3650 * it must have maxsocks entries (not nevents).
3651 */
3652 thread->fdpollinfo =
3653 isc_mem_get(thread->manager->mctx,
3654 sizeof(pollinfo_t) * thread->manager->maxsocks);
3655 memset(thread->fdpollinfo, 0,
3656 sizeof(pollinfo_t) * thread->manager->maxsocks);
3657 thread->devpoll_fd = open("/dev/poll", O_RDWR);
3658 if (thread->devpoll_fd == -1) {
3659 result = isc__errno2result(errno);
3660 strerror_r(errno, strbuf, sizeof(strbuf));
3661 UNEXPECTED_ERROR(__FILE__, __LINE__,
3662 "open(/dev/poll) failed: %s", strbuf);
3663 isc_mem_put(thread->manager->mctx, thread->events,
3664 sizeof(struct pollfd) * thread->nevents);
3665 isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3666 sizeof(pollinfo_t) * thread->manager->maxsocks);
3667 return (result);
3668 }
3669 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3670 if (result != ISC_R_SUCCESS) {
3671 close(thread->devpoll_fd);
3672 isc_mem_put(thread->manager->mctx, thread->events,
3673 sizeof(struct pollfd) * thread->nevents);
3674 isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3675 sizeof(pollinfo_t) * thread->manager->maxsocks);
3676 return (result);
3677 }
3678
3679 return (ISC_R_SUCCESS);
3680 #elif defined(USE_SELECT)
3681 UNUSED(result);
3682
3683 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3684 /*
3685 * Note: this code should also cover the case of MAXSOCKETS <=
3686 * FD_SETSIZE, but we separate the cases to avoid possible portability
3687 * issues regarding howmany() and the actual representation of fd_set.
3688 */
3689 thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3690 sizeof(fd_mask);
3691 #else /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3692 thread->fd_bufsize = sizeof(fd_set);
3693 #endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3694
3695 thread->read_fds = isc_mem_get(thread->manager->mctx,
3696 thread->fd_bufsize);
3697 thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
3698 thread->fd_bufsize);
3699 thread->write_fds = isc_mem_get(thread->manager->mctx,
3700 thread->fd_bufsize);
3701 thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
3702 thread->fd_bufsize);
3703 memset(thread->read_fds, 0, thread->fd_bufsize);
3704 memset(thread->write_fds, 0, thread->fd_bufsize);
3705
3706 (void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3707 thread->maxfd = thread->pipe_fds[0];
3708
3709 return (ISC_R_SUCCESS);
3710 #endif /* USE_KQUEUE */
3711 }
3712
3713 static void
cleanup_thread(isc_mem_t * mctx,isc__socketthread_t * thread)3714 cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
3715 isc_result_t result;
3716 int i;
3717
3718 result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3719 if (result != ISC_R_SUCCESS) {
3720 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
3721 }
3722 #ifdef USE_KQUEUE
3723 close(thread->kqueue_fd);
3724 isc_mem_put(mctx, thread->events,
3725 sizeof(struct kevent) * thread->nevents);
3726 #elif defined(USE_EPOLL)
3727 close(thread->epoll_fd);
3728
3729 isc_mem_put(mctx, thread->events,
3730 sizeof(struct epoll_event) * thread->nevents);
3731 #elif defined(USE_DEVPOLL)
3732 close(thread->devpoll_fd);
3733 isc_mem_put(mctx, thread->events,
3734 sizeof(struct pollfd) * thread->nevents);
3735 isc_mem_put(mctx, thread->fdpollinfo,
3736 sizeof(pollinfo_t) * thread->manager->maxsocks);
3737 #elif defined(USE_SELECT)
3738 if (thread->read_fds != NULL) {
3739 isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
3740 }
3741 if (thread->read_fds_copy != NULL) {
3742 isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
3743 }
3744 if (thread->write_fds != NULL) {
3745 isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
3746 }
3747 if (thread->write_fds_copy != NULL) {
3748 isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
3749 }
3750 #endif /* USE_KQUEUE */
3751 for (i = 0; i < (int)thread->manager->maxsocks; i++) {
3752 if (thread->fdstate[i] == CLOSE_PENDING) {
3753 /* no need to lock */
3754 (void)close(i);
3755 }
3756 }
3757
3758 #if defined(USE_EPOLL)
3759 isc_mem_put(thread->manager->mctx, thread->epoll_events,
3760 thread->manager->maxsocks * sizeof(uint32_t));
3761 #endif /* if defined(USE_EPOLL) */
3762 isc_mem_put(thread->manager->mctx, thread->fds,
3763 thread->manager->maxsocks * sizeof(isc__socket_t *));
3764 isc_mem_put(thread->manager->mctx, thread->fdstate,
3765 thread->manager->maxsocks * sizeof(int));
3766
3767 if (thread->fdlock != NULL) {
3768 for (i = 0; i < FDLOCK_COUNT; i++) {
3769 isc_mutex_destroy(&thread->fdlock[i]);
3770 }
3771 isc_mem_put(thread->manager->mctx, thread->fdlock,
3772 FDLOCK_COUNT * sizeof(isc_mutex_t));
3773 }
3774 }
3775
isc_result_t
isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
	/*
	 * Convenience wrapper: create a socket manager with the default
	 * capacity (maxsocks == 0 selects ISC_SOCKET_MAXSOCKETS in
	 * isc_socketmgr_create2()) and a single watcher thread.
	 */
	return (isc_socketmgr_create2(mctx, managerp, 0, 1));
}
3780
3781 isc_result_t
isc_socketmgr_create2(isc_mem_t * mctx,isc_socketmgr_t ** managerp,unsigned int maxsocks,int nthreads)3782 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3783 unsigned int maxsocks, int nthreads) {
3784 int i;
3785 isc__socketmgr_t *manager;
3786
3787 REQUIRE(managerp != NULL && *managerp == NULL);
3788
3789 if (maxsocks == 0) {
3790 maxsocks = ISC_SOCKET_MAXSOCKETS;
3791 }
3792
3793 manager = isc_mem_get(mctx, sizeof(*manager));
3794
3795 /* zero-clear so that necessary cleanup on failure will be easy */
3796 memset(manager, 0, sizeof(*manager));
3797 manager->maxsocks = maxsocks;
3798 manager->reserved = 0;
3799 manager->maxudp = 0;
3800 manager->nthreads = nthreads;
3801 manager->stats = NULL;
3802
3803 manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
3804 manager->common.impmagic = SOCKET_MANAGER_MAGIC;
3805 manager->mctx = NULL;
3806 ISC_LIST_INIT(manager->socklist);
3807 isc_mutex_init(&manager->lock);
3808 isc_condition_init(&manager->shutdown_ok);
3809
3810 /*
3811 * Start up the select/poll thread.
3812 */
3813 manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
3814 manager->nthreads);
3815 isc_mem_attach(mctx, &manager->mctx);
3816
3817 for (i = 0; i < manager->nthreads; i++) {
3818 manager->threads[i].manager = manager;
3819 manager->threads[i].threadid = i;
3820 setup_thread(&manager->threads[i]);
3821 isc_thread_create(netthread, &manager->threads[i],
3822 &manager->threads[i].thread);
3823 char tname[1024];
3824 sprintf(tname, "isc-socket-%d", i);
3825 isc_thread_setname(manager->threads[i].thread, tname);
3826 }
3827
3828 *managerp = (isc_socketmgr_t *)manager;
3829
3830 return (ISC_R_SUCCESS);
3831 }
3832
3833 isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t * manager0,unsigned int * nsockp)3834 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
3835 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3836 REQUIRE(VALID_MANAGER(manager));
3837 REQUIRE(nsockp != NULL);
3838
3839 *nsockp = manager->maxsocks;
3840
3841 return (ISC_R_SUCCESS);
3842 }
3843
3844 void
isc_socketmgr_setstats(isc_socketmgr_t * manager0,isc_stats_t * stats)3845 isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats) {
3846 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3847
3848 REQUIRE(VALID_MANAGER(manager));
3849 REQUIRE(ISC_LIST_EMPTY(manager->socklist));
3850 REQUIRE(manager->stats == NULL);
3851 REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
3852
3853 isc_stats_attach(stats, &manager->stats);
3854 }
3855
void
isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc__socketmgr_t *manager;

	/*
	 * Destroy a socket manager.  Blocks until every socket has been
	 * destroyed, then shuts down and joins all watcher threads and
	 * releases the manager's resources.  *managerp is set to NULL.
	 */

	REQUIRE(managerp != NULL);
	manager = (isc__socketmgr_t *)*managerp;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "sockets exist");
		/* Signaled when the last socket leaves socklist. */
		WAIT(&manager->shutdown_ok, &manager->lock);
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
	}

	/*
	 * Wait for thread to exit.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		isc_thread_join(manager->threads[i].thread, NULL);
		cleanup_thread(manager->mctx, &manager->threads[i]);
	}
	/*
	 * Clean up.
	 */
	isc_mem_put(manager->mctx, manager->threads,
		    sizeof(isc__socketthread_t) * manager->nthreads);
	(void)isc_condition_destroy(&manager->shutdown_ok);

	if (manager->stats != NULL) {
		isc_stats_detach(&manager->stats);
	}
	isc_mutex_destroy(&manager->lock);
	/* Clear magics so any stale pointer fails VALID_MANAGER(). */
	manager->common.magic = 0;
	manager->common.impmagic = 0;
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));

	*managerp = NULL;
}
3913
static isc_result_t
socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags) {
	/*
	 * Common receive path: attempt the I/O immediately when
	 * possible, otherwise queue 'dev' on sock->recv_list for the
	 * watcher thread.  'task' receives the RECVDONE event.
	 *
	 * Returns ISC_R_SUCCESS, or ISC_R_INPROGRESS when
	 * ISC_SOCKFLAG_IMMEDIATE was requested but the I/O had to be
	 * queued.
	 */
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		/* UDP reads are attempted without taking sock->lock. */
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->recv_list)) {
			io_state = doio_recv(sock, dev);
		} else {
			/* Preserve ordering behind already-queued reads. */
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			/* Post the completion event to the task. */
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
3990
3991 isc_result_t
isc_socket_recv(isc_socket_t * sock0,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,void * arg)3992 isc_socket_recv(isc_socket_t *sock0, isc_region_t *region, unsigned int minimum,
3993 isc_task_t *task, isc_taskaction_t action, void *arg) {
3994 isc__socket_t *sock = (isc__socket_t *)sock0;
3995 isc_socketevent_t *dev;
3996 isc__socketmgr_t *manager;
3997
3998 REQUIRE(VALID_SOCKET(sock));
3999 REQUIRE(action != NULL);
4000
4001 manager = sock->manager;
4002 REQUIRE(VALID_MANAGER(manager));
4003
4004 INSIST(sock->bound);
4005
4006 dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
4007 action, arg);
4008 if (dev == NULL) {
4009 return (ISC_R_NOMEMORY);
4010 }
4011
4012 return (isc_socket_recv2(sock0, region, minimum, task, dev, 0));
4013 }
4014
4015 isc_result_t
isc_socket_recv2(isc_socket_t * sock0,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_socketevent_t * event,unsigned int flags)4016 isc_socket_recv2(isc_socket_t *sock0, isc_region_t *region,
4017 unsigned int minimum, isc_task_t *task,
4018 isc_socketevent_t *event, unsigned int flags) {
4019 isc__socket_t *sock = (isc__socket_t *)sock0;
4020
4021 event->ev_sender = sock;
4022 event->result = ISC_R_UNSET;
4023 event->region = *region;
4024 event->n = 0;
4025 event->offset = 0;
4026 event->attributes = 0;
4027
4028 /*
4029 * UDP sockets are always partial read.
4030 */
4031 if (sock->type == isc_sockettype_udp) {
4032 event->minimum = 1;
4033 } else {
4034 if (minimum == 0) {
4035 event->minimum = region->length;
4036 } else {
4037 event->minimum = minimum;
4038 }
4039 }
4040
4041 return (socket_recv(sock, event, task, flags));
4042 }
4043
static isc_result_t
socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	/*
	 * Common send path: attempt the I/O immediately when possible,
	 * otherwise queue 'dev' on sock->send_list for the watcher
	 * thread (unless ISC_SOCKFLAG_NORETRY is set).  'task' receives
	 * the SENDDONE event.
	 *
	 * Returns ISC_R_SUCCESS, or ISC_R_INPROGRESS when
	 * ISC_SOCKFLAG_IMMEDIATE was requested but the I/O was queued.
	 */
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp) {
		/* UDP writes are attempted without taking sock->lock. */
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			/* Preserve ordering behind already-queued writes. */
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/* FALLTHROUGH */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			/* Post the completion event to the task. */
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4145
isc_result_t
isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		isc_taskaction_t action, void *arg) {
	/*
	 * Convenience wrapper: send with no explicit destination address
	 * and no IPv6 packet info.
	 *
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
}
4154
4155 isc_result_t
isc_socket_sendto(isc_socket_t * sock0,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo)4156 isc_socket_sendto(isc_socket_t *sock0, isc_region_t *region, isc_task_t *task,
4157 isc_taskaction_t action, void *arg,
4158 const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
4159 isc__socket_t *sock = (isc__socket_t *)sock0;
4160 isc_socketevent_t *dev;
4161 isc__socketmgr_t *manager;
4162
4163 REQUIRE(VALID_SOCKET(sock));
4164 REQUIRE(region != NULL);
4165 REQUIRE(task != NULL);
4166 REQUIRE(action != NULL);
4167
4168 manager = sock->manager;
4169 REQUIRE(VALID_MANAGER(manager));
4170
4171 INSIST(sock->bound);
4172
4173 dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
4174 action, arg);
4175 if (dev == NULL) {
4176 return (ISC_R_NOMEMORY);
4177 }
4178
4179 dev->region = *region;
4180
4181 return (socket_send(sock, dev, task, address, pktinfo, 0));
4182 }
4183
4184 isc_result_t
isc_socket_sendto2(isc_socket_t * sock0,isc_region_t * region,isc_task_t * task,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo,isc_socketevent_t * event,unsigned int flags)4185 isc_socket_sendto2(isc_socket_t *sock0, isc_region_t *region, isc_task_t *task,
4186 const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4187 isc_socketevent_t *event, unsigned int flags) {
4188 isc__socket_t *sock = (isc__socket_t *)sock0;
4189
4190 REQUIRE(VALID_SOCKET(sock));
4191 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
4192 0);
4193 if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
4194 REQUIRE(sock->type == isc_sockettype_udp);
4195 }
4196 event->ev_sender = sock;
4197 event->result = ISC_R_UNSET;
4198 event->region = *region;
4199 event->n = 0;
4200 event->offset = 0;
4201 event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
4202
4203 return (socket_send(sock, event, task, address, pktinfo, flags));
4204 }
4205
/*
 * Remove a stale UNIX-domain socket file for 'sockaddr'.
 *
 * If 'active' is true (we own the path, e.g. at shutdown): verify the
 * path is a socket or FIFO and unlink it.  If 'active' is false (e.g.
 * at startup): probe the path with a connect() first, and unlink it
 * only when the connect is refused/reset, i.e. no live listener is
 * using it.  All failures are logged; none are fatal.
 */
void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	/* Only UNIX-domain addresses have a filesystem path to clean. */
	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		return;
	}

	/*
	 * Supply S_ISSOCK()/S_ISFIFO() on platforms whose <sys/stat.h>
	 * lacks them, using the raw file-mode bits.
	 */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

/* If only one of the two could be defined, the other never matches. */
#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	if (active) {
		/* We own the file: verify its type and unlink it. */
		if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/* Inactive case: create a throwaway socket to probe the path. */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT: /* We exited cleanly last time */
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
		goto cleanup;
	}

	if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: %s: not a socket",
			      sockaddr->type.sunix.sun_path);
		goto cleanup;
	}

	/*
	 * A refused/reset connect means nothing is listening on the
	 * path, so it is a leftover from a previous run: remove it.
	 * Any other connect error just gets logged.
	 */
	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
cleanup:
	close(s);
#else /* ifdef ISC_PLATFORM_HAVESYSUNH */
	UNUSED(sockaddr);
	UNUSED(active);
#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
}
4338
/*
 * Apply mode 'perm' and ownership 'owner':'group' to the UNIX-domain
 * socket path in 'sockaddr'.
 *
 * On platforms with NEED_SECURE_DIRECTORY defined, the chmod()/chown()
 * are applied to the containing directory rather than the socket file
 * itself.  Both calls are always attempted; returns ISC_R_FAILURE if
 * either fails, ISC_R_SUCCESS otherwise (ISC_R_NOTIMPLEMENTED without
 * <sys/un.h> support).
 */
isc_result_t
isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
		    uint32_t owner, uint32_t group) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	char path[sizeof(sockaddr->type.sunix.sun_path)];
#ifdef NEED_SECURE_DIRECTORY
	char *slash;
#endif /* ifdef NEED_SECURE_DIRECTORY */

	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
	strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));

#ifdef NEED_SECURE_DIRECTORY
	/* Reduce 'path' to its parent directory ("/" or "." if none). */
	slash = strrchr(path, '/');
	if (slash != NULL) {
		if (slash != path) {
			*slash = '\0';
		} else {
			strlcpy(path, "/", sizeof(path));
		}
	} else {
		strlcpy(path, ".", sizeof(path));
	}
#endif /* ifdef NEED_SECURE_DIRECTORY */

	if (chmod(path, perm) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chmod(%s, %d): %s", path,
			      perm, strbuf);
		result = ISC_R_FAILURE;
	}
	if (chown(path, owner, group) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chown(%s, %d, %d): %s",
			      path, owner, group, strbuf);
		result = ISC_R_FAILURE;
	}
	return (result);
#else /* ifdef ISC_PLATFORM_HAVESYSUNH */
	UNUSED(sockaddr);
	UNUSED(perm);
	UNUSED(owner);
	UNUSED(group);
	return (ISC_R_NOTIMPLEMENTED);
#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
}
4392
/*
 * Bind 'sock' to 'sockaddr'.
 *
 * SO_REUSEADDR (and, where available, SO_REUSEPORT/SO_REUSEPORT_LB) is
 * set only when ISC_SOCKET_REUSEADDRESS was requested and a specific
 * (non-zero) port is being bound; setsockopt() failures there are
 * logged but not fatal.  bind(2) errno values are translated into
 * ISC_R_* result codes.
 */
isc_result_t
isc_socket_bind(isc_socket_t *sock0, const isc_sockaddr_t *sockaddr,
		isc_socket_options_t options) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	/* A socket may be bound only once, and never a dup()ed one. */
	INSIST(!sock->bound);
	INSIST(!sock->dupped);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	/* UNIX-domain sockets take no address-reuse options at all. */
	if (sock->pf == AF_UNIX) {
		goto bind_socket;
	}
#endif /* ifdef AF_UNIX */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
	{
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
			       (void *)&on, sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#elif defined(__linux__) && defined(SO_REUSEPORT)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#endif /* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
		/* Press on... */
	}
#ifdef AF_UNIX
bind_socket:
#endif /* ifdef AF_UNIX */
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		/* Map common bind(2) failures to ISC result codes. */
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4475
4476 /*
4477 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it. Until then, this is broken and needs to be
4479 * disabled by default. See RT22589 for details.
4480 */
4481 #undef ENABLE_ACCEPTFILTER
4482
/*
 * Install a BSD accept filter named 'filter' on a listening socket.
 *
 * Compiled out unless both SO_ACCEPTFILTER and ENABLE_ACCEPTFILTER are
 * defined (the latter is #undef'd above for the reasons given there),
 * in which case ISC_R_NOTIMPLEMENTED is returned.
 */
isc_result_t
isc_socket_filter(isc_socket_t *sock0, const char *filter) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	UNUSED(sock);
	UNUSED(filter);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	bzero(&afa, sizeof(afa));
	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
		       sizeof(afa)) == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION,
			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	return (ISC_R_NOTIMPLEMENTED);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
}
4511
/*
 * Try enabling TCP Fast Open for a given socket if the OS supports it.
 *
 * The option value passed to setsockopt() is derived from the listen
 * backlog; failures are logged but otherwise ignored, as TFO is
 * best-effort.
 */
static void
set_tcp_fastopen(isc__socket_t *sock, unsigned int backlog) {
#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
	char strbuf[ISC_STRERRORSIZE];

	/*
	 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while
	 * also shipping a default kernel without TFO support, so we
	 * special-case it by performing an additional runtime check for TFO
	 * support using sysctl to prevent setsockopt() errors from being
	 * logged.
	 */
#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
	unsigned int enabled;
	size_t enabledlen = sizeof(enabled);
	static bool tfo_notice_logged = false;

	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
		/*
		 * This kernel does not support TCP Fast Open.  There is
		 * nothing more we can do.
		 */
		return;
	} else if (enabled == 0) {
		/*
		 * This kernel does support TCP Fast Open, but it is disabled
		 * by sysctl.  Notify the user, but do not nag.
		 */
		if (!tfo_notice_logged) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
				      "TCP_FASTOPEN support is disabled by "
				      "sysctl (" SYSCTL_TFO " = 0)");
			tfo_notice_logged = true;
		}
		return;
	}
#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */

#ifdef __APPLE__
	/* NOTE(review): macOS appears to take a boolean here — confirm. */
	backlog = 1;
#else /* ifdef __APPLE__ */
	/* Use half the listen backlog (at least 1) as the TFO value. */
	backlog = backlog / 2;
	if (backlog == 0) {
		backlog = 1;
	}
#endif /* ifdef __APPLE__ */
	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
		       sizeof(backlog)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
				 sock->fd, strbuf);
		/* TCP_FASTOPEN is experimental so ignore failures */
	}
#else /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
	UNUSED(sock);
	UNUSED(backlog);
#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
}
4576
4577 /*
4578 * Set up to listen on a given socket. We do this by creating an internal
4579 * event that will be dispatched when the socket has read activity. The
4580 * watcher will send the internal event to the task when there is a new
4581 * connection.
4582 *
4583 * Unlike in read, we don't preallocate a done event here. Every time there
4584 * is a new connection we'll have to allocate a new one anyway, so we might
4585 * as well keep things simple rather than having to track them.
4586 */
4587 isc_result_t
isc_socket_listen(isc_socket_t * sock0,unsigned int backlog)4588 isc_socket_listen(isc_socket_t *sock0, unsigned int backlog) {
4589 isc__socket_t *sock = (isc__socket_t *)sock0;
4590 char strbuf[ISC_STRERRORSIZE];
4591
4592 REQUIRE(VALID_SOCKET(sock));
4593
4594 LOCK(&sock->lock);
4595
4596 REQUIRE(!sock->listener);
4597 REQUIRE(sock->bound);
4598 REQUIRE(sock->type == isc_sockettype_tcp ||
4599 sock->type == isc_sockettype_unix);
4600
4601 if (backlog == 0) {
4602 backlog = SOMAXCONN;
4603 }
4604
4605 if (listen(sock->fd, (int)backlog) < 0) {
4606 UNLOCK(&sock->lock);
4607 strerror_r(errno, strbuf, sizeof(strbuf));
4608
4609 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4610
4611 return (ISC_R_UNEXPECTED);
4612 }
4613
4614 set_tcp_fastopen(sock, backlog);
4615
4616 sock->listener = 1;
4617
4618 UNLOCK(&sock->lock);
4619 return (ISC_R_SUCCESS);
4620 }
4621
/*
 * This should try to do aggressive accept() XXXMLG
 *
 * Queue an accept on listening socket 'sock'.  A child socket is
 * pre-allocated here; the watcher later performs the actual accept()
 * and posts an ISC_SOCKEVENT_NEWCONN event whose handler 'action'
 * runs on 'task'.
 */
isc_result_t
isc_socket_accept(isc_socket_t *sock0, isc_task_t *task,
		  isc_taskaction_t action, void *arg) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socket_newconnev_t *dev;
	isc__socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc__socket_t *nsock;
	isc_result_t result;
	bool do_poke = false;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)isc_event_allocate(
		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/* Pre-create the socket that will represent the new connection. */
	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		/* Task is shutting down: undo everything and bail. */
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	isc_refcount_increment0(&nsock->references);
	/* The child inherits the listener's statistics bucket. */
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = (isc_socket_t *)nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	do_poke = ISC_LIST_EMPTY(sock->accept_list);
	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
	if (do_poke) {
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_ACCEPT);
	}
	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4692
4693 isc_result_t
isc_socket_connect(isc_socket_t * sock0,const isc_sockaddr_t * addr,isc_task_t * task,isc_taskaction_t action,void * arg)4694 isc_socket_connect(isc_socket_t *sock0, const isc_sockaddr_t *addr,
4695 isc_task_t *task, isc_taskaction_t action, void *arg) {
4696 isc__socket_t *sock = (isc__socket_t *)sock0;
4697 isc_socket_connev_t *dev;
4698 isc_task_t *ntask = NULL;
4699 isc__socketmgr_t *manager;
4700 int cc;
4701 char strbuf[ISC_STRERRORSIZE];
4702 char addrbuf[ISC_SOCKADDR_FORMATSIZE];
4703
4704 REQUIRE(VALID_SOCKET(sock));
4705 REQUIRE(addr != NULL);
4706 REQUIRE(task != NULL);
4707 REQUIRE(action != NULL);
4708
4709 manager = sock->manager;
4710 REQUIRE(VALID_MANAGER(manager));
4711 REQUIRE(addr != NULL);
4712
4713 if (isc_sockaddr_ismulticast(addr)) {
4714 return (ISC_R_MULTICAST);
4715 }
4716
4717 LOCK(&sock->lock);
4718
4719 dev = (isc_socket_connev_t *)isc_event_allocate(
4720 manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
4721 sizeof(*dev));
4722 ISC_LINK_INIT(dev, ev_link);
4723
4724 if (sock->connecting) {
4725 INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
4726 goto queue;
4727 }
4728
4729 if (sock->connected) {
4730 INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
4731 dev->result = ISC_R_SUCCESS;
4732 isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4733
4734 UNLOCK(&sock->lock);
4735
4736 return (ISC_R_SUCCESS);
4737 }
4738
4739 /*
4740 * Try to do the connect right away, as there can be only one
4741 * outstanding, and it might happen to complete.
4742 */
4743 sock->peer_address = *addr;
4744 cc = connect(sock->fd, &addr->type.sa, addr->length);
4745 if (cc < 0) {
4746 /*
4747 * The socket is nonblocking and the connection cannot be
4748 * completed immediately. It is possible to select(2) or
4749 * poll(2) for completion by selecting the socket for writing.
4750 * After select(2) indicates writability, use getsockopt(2) to
4751 * read the SO_ERROR option at level SOL_SOCKET to determine
4752 * whether connect() completed successfully (SO_ERROR is zero)
4753 * or unsuccessfully (SO_ERROR is one of the usual error codes
4754 * listed here, explaining the reason for the failure).
4755 */
4756 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4757 cc = 0;
4758 goto success;
4759 }
4760 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4761 goto queue;
4762 }
4763
4764 switch (errno) {
4765 #define ERROR_MATCH(a, b) \
4766 case a: \
4767 dev->result = b; \
4768 goto err_exit;
4769 ERROR_MATCH(EACCES, ISC_R_NOPERM);
4770 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4771 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4772 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4773 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4774 #ifdef EHOSTDOWN
4775 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4776 #endif /* ifdef EHOSTDOWN */
4777 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4778 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4779 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4780 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4781 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4782 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4783 #undef ERROR_MATCH
4784 }
4785
4786 sock->connected = 0;
4787
4788 strerror_r(errno, strbuf, sizeof(strbuf));
4789 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
4790 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
4791 addrbuf, errno, strbuf);
4792
4793 UNLOCK(&sock->lock);
4794 inc_stats(sock->manager->stats,
4795 sock->statsindex[STATID_CONNECTFAIL]);
4796 isc_event_free(ISC_EVENT_PTR(&dev));
4797 return (ISC_R_UNEXPECTED);
4798
4799 err_exit:
4800 sock->connected = 0;
4801 isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4802
4803 UNLOCK(&sock->lock);
4804 inc_stats(sock->manager->stats,
4805 sock->statsindex[STATID_CONNECTFAIL]);
4806 return (ISC_R_SUCCESS);
4807 }
4808
4809 /*
4810 * If connect completed, fire off the done event.
4811 */
4812 success:
4813 if (cc == 0) {
4814 sock->connected = 1;
4815 sock->bound = 1;
4816 dev->result = ISC_R_SUCCESS;
4817 isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4818
4819 UNLOCK(&sock->lock);
4820
4821 inc_stats(sock->manager->stats,
4822 sock->statsindex[STATID_CONNECT]);
4823
4824 return (ISC_R_SUCCESS);
4825 }
4826
4827 queue:
4828
4829 /*
4830 * Attach to task.
4831 */
4832 isc_task_attach(task, &ntask);
4833
4834 dev->ev_sender = ntask;
4835
4836 /*
4837 * Poke watcher here. We still have the socket locked, so there
4838 * is no race condition. We will keep the lock for such a short
4839 * bit of time waking it up now or later won't matter all that much.
4840 */
4841 bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
4842 ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
4843 if (do_poke && !sock->connecting) {
4844 sock->connecting = 1;
4845 select_poke(manager, sock->threadid, sock->fd,
4846 SELECT_POKE_CONNECT);
4847 }
4848
4849 UNLOCK(&sock->lock);
4850 return (ISC_R_SUCCESS);
4851 }
4852
4853 /*
4854 * Called when a socket with a pending connect() finishes.
4855 */
4856 static void
internal_connect(isc__socket_t * sock)4857 internal_connect(isc__socket_t *sock) {
4858 isc_socket_connev_t *dev;
4859 int cc;
4860 isc_result_t result;
4861 socklen_t optlen;
4862 char strbuf[ISC_STRERRORSIZE];
4863 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
4864
4865 INSIST(VALID_SOCKET(sock));
4866 REQUIRE(sock->fd >= 0);
4867
4868 /*
4869 * Get the first item off the connect list.
4870 * If it is empty, unlock the socket and return.
4871 */
4872 dev = ISC_LIST_HEAD(sock->connect_list);
4873 if (dev == NULL) {
4874 INSIST(!sock->connecting);
4875 goto finish;
4876 }
4877
4878 INSIST(sock->connecting);
4879 sock->connecting = 0;
4880
4881 /*
4882 * Get any possible error status here.
4883 */
4884 optlen = sizeof(cc);
4885 if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
4886 (void *)&optlen) != 0)
4887 {
4888 cc = errno;
4889 } else {
4890 errno = cc;
4891 }
4892
4893 if (errno != 0) {
4894 /*
4895 * If the error is EAGAIN, just re-select on this
4896 * fd and pretend nothing strange happened.
4897 */
4898 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4899 sock->connecting = 1;
4900 return;
4901 }
4902
4903 inc_stats(sock->manager->stats,
4904 sock->statsindex[STATID_CONNECTFAIL]);
4905
4906 /*
4907 * Translate other errors into ISC_R_* flavors.
4908 */
4909 switch (errno) {
4910 #define ERROR_MATCH(a, b) \
4911 case a: \
4912 result = b; \
4913 break;
4914 ERROR_MATCH(EACCES, ISC_R_NOPERM);
4915 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4916 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4917 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4918 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4919 #ifdef EHOSTDOWN
4920 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4921 #endif /* ifdef EHOSTDOWN */
4922 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4923 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4924 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4925 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4926 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4927 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4928 #undef ERROR_MATCH
4929 default:
4930 result = ISC_R_UNEXPECTED;
4931 isc_sockaddr_format(&sock->peer_address, peerbuf,
4932 sizeof(peerbuf));
4933 strerror_r(errno, strbuf, sizeof(strbuf));
4934 UNEXPECTED_ERROR(__FILE__, __LINE__,
4935 "internal_connect: connect(%s) %s",
4936 peerbuf, strbuf);
4937 }
4938 } else {
4939 inc_stats(sock->manager->stats,
4940 sock->statsindex[STATID_CONNECT]);
4941 result = ISC_R_SUCCESS;
4942 sock->connected = 1;
4943 sock->bound = 1;
4944 }
4945
4946 do {
4947 dev->result = result;
4948 send_connectdone_event(sock, &dev);
4949 dev = ISC_LIST_HEAD(sock->connect_list);
4950 } while (dev != NULL);
4951
4952 finish:
4953 unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
4954 SELECT_POKE_CONNECT);
4955 UNLOCK(&sock->lock);
4956 }
4957
4958 isc_result_t
isc_socket_getpeername(isc_socket_t * sock0,isc_sockaddr_t * addressp)4959 isc_socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
4960 isc__socket_t *sock = (isc__socket_t *)sock0;
4961 isc_result_t result;
4962
4963 REQUIRE(VALID_SOCKET(sock));
4964 REQUIRE(addressp != NULL);
4965
4966 LOCK(&sock->lock);
4967
4968 if (sock->connected) {
4969 *addressp = sock->peer_address;
4970 result = ISC_R_SUCCESS;
4971 } else {
4972 result = ISC_R_NOTCONNECTED;
4973 }
4974
4975 UNLOCK(&sock->lock);
4976
4977 return (result);
4978 }
4979
4980 isc_result_t
isc_socket_getsockname(isc_socket_t * sock0,isc_sockaddr_t * addressp)4981 isc_socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
4982 isc__socket_t *sock = (isc__socket_t *)sock0;
4983 socklen_t len;
4984 isc_result_t result;
4985 char strbuf[ISC_STRERRORSIZE];
4986
4987 REQUIRE(VALID_SOCKET(sock));
4988 REQUIRE(addressp != NULL);
4989
4990 LOCK(&sock->lock);
4991
4992 if (!sock->bound) {
4993 result = ISC_R_NOTBOUND;
4994 goto out;
4995 }
4996
4997 result = ISC_R_SUCCESS;
4998
4999 len = sizeof(addressp->type);
5000 if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5001 strerror_r(errno, strbuf, sizeof(strbuf));
5002 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
5003 result = ISC_R_UNEXPECTED;
5004 goto out;
5005 }
5006 addressp->length = (unsigned int)len;
5007
5008 out:
5009 UNLOCK(&sock->lock);
5010
5011 return (result);
5012 }
5013
/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask of
 * ISC_SOCKCANCEL_{RECV,SEND,ACCEPT,CONNECT}.  A NULL 'task' cancels
 * the matching events for every task.
 */
void
isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0) {
		return;
	}

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
	    !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		/* Capture 'next' first: sending the event unlinks dev. */
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
	    !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
	    !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				/*
				 * Drop the pre-created child socket
				 * that would have received the
				 * connection, then post the event back
				 * to the task as canceled.
				 */
				(void)isc_refcount_decrement(
					&NEWCONNSOCK(dev)->references);
				free_socket((isc__socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendtoanddetach(&current_task,
							 ISC_EVENT_PTR(&dev),
							 sock->threadid);
			}

			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
	    !ISC_LIST_EMPTY(sock->connect_list))
	{
		isc_socket_connev_t *dev;
		isc_socket_connev_t *next;
		isc_task_t *current_task;

		/* A non-empty connect list implies a pending connect. */
		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = ISC_LIST_HEAD(sock->connect_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_connectdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	UNLOCK(&sock->lock);
}
5140
5141 isc_sockettype_t
isc_socket_gettype(isc_socket_t * sock0)5142 isc_socket_gettype(isc_socket_t *sock0) {
5143 isc__socket_t *sock = (isc__socket_t *)sock0;
5144
5145 REQUIRE(VALID_SOCKET(sock));
5146
5147 return (sock->type);
5148 }
5149
/*
 * Set (yes == true) or clear the IPV6_V6ONLY option on 'sock'.
 *
 * Only applied to AF_INET6 sockets; a no-op on platforms without
 * IPV6_V6ONLY.  A setsockopt() failure is logged but not fatal.
 */
void
isc_socket_ipv6only(isc_socket_t *sock0, bool yes) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(IPV6_V6ONLY)
	int onoff = yes ? 1 : 0;
#else /* if defined(IPV6_V6ONLY) */
	UNUSED(yes);
	UNUSED(sock);
#endif /* if defined(IPV6_V6ONLY) */

	REQUIRE(VALID_SOCKET(sock));
	INSIST(!sock->dupped);

#ifdef IPV6_V6ONLY
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
	}
#endif /* ifdef IPV6_V6ONLY */
}
5178
5179 static void
setdscp(isc__socket_t * sock,isc_dscp_t dscp)5180 setdscp(isc__socket_t *sock, isc_dscp_t dscp) {
5181 #if defined(IP_TOS) || defined(IPV6_TCLASS)
5182 int value = dscp << 2;
5183 #endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
5184
5185 sock->dscp = dscp;
5186
5187 #ifdef IP_TOS
5188 if (sock->pf == AF_INET) {
5189 if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
5190 sizeof(value)) < 0) {
5191 char strbuf[ISC_STRERRORSIZE];
5192 strerror_r(errno, strbuf, sizeof(strbuf));
5193 UNEXPECTED_ERROR(__FILE__, __LINE__,
5194 "setsockopt(%d, IP_TOS, %.02x) "
5195 "failed: %s",
5196 sock->fd, value >> 2, strbuf);
5197 }
5198 }
5199 #endif /* ifdef IP_TOS */
5200 #ifdef IPV6_TCLASS
5201 if (sock->pf == AF_INET6) {
5202 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
5203 (void *)&value, sizeof(value)) < 0)
5204 {
5205 char strbuf[ISC_STRERRORSIZE];
5206 strerror_r(errno, strbuf, sizeof(strbuf));
5207 UNEXPECTED_ERROR(__FILE__, __LINE__,
5208 "setsockopt(%d, IPV6_TCLASS, %.02x) "
5209 "failed: %s",
5210 sock->fd, dscp >> 2, strbuf);
5211 }
5212 }
5213 #endif /* ifdef IPV6_TCLASS */
5214 }
5215
void
isc_socket_dscp(isc_socket_t *sock0, isc_dscp_t dscp) {
	/*
	 * Public entry point for setting the DSCP of 'sock0'.  'dscp'
	 * must be below 0x40 (DSCP is a six-bit field).
	 */
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40);

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	/* A negative value means "no DSCP": nothing to do. */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

#ifdef notyet
	/* NOTE(review): disabled check — dup'd sockets share the fd, so
	 * changing DSCP on one would affect the others; confirm before
	 * enabling. */
	REQUIRE(!sock->dupped);
#endif /* ifdef notyet */

	setdscp(sock, dscp);
}
5242
5243 isc_socketevent_t *
isc_socket_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)5244 isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
5245 isc_taskaction_t action, void *arg) {
5246 return (allocate_socketevent(mctx, sender, eventtype, action, arg));
5247 }
5248
5249 void
isc_socket_setname(isc_socket_t * socket0,const char * name,void * tag)5250 isc_socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
5251 isc__socket_t *sock = (isc__socket_t *)socket0;
5252
5253 /*
5254 * Name 'sock'.
5255 */
5256
5257 REQUIRE(VALID_SOCKET(sock));
5258
5259 LOCK(&sock->lock);
5260 strlcpy(sock->name, name, sizeof(sock->name));
5261 sock->tag = tag;
5262 UNLOCK(&sock->lock);
5263 }
5264
5265 const char *
isc_socket_getname(isc_socket_t * socket0)5266 isc_socket_getname(isc_socket_t *socket0) {
5267 isc__socket_t *sock = (isc__socket_t *)socket0;
5268
5269 return (sock->name);
5270 }
5271
5272 void *
isc_socket_gettag(isc_socket_t * socket0)5273 isc_socket_gettag(isc_socket_t *socket0) {
5274 isc__socket_t *sock = (isc__socket_t *)socket0;
5275
5276 return (sock->tag);
5277 }
5278
5279 int
isc_socket_getfd(isc_socket_t * socket0)5280 isc_socket_getfd(isc_socket_t *socket0) {
5281 isc__socket_t *sock = (isc__socket_t *)socket0;
5282
5283 return ((short)sock->fd);
5284 }
5285
5286 static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
5287 static bool hasreuseport = false;
5288
5289 static void
init_hasreuseport()5290 init_hasreuseport() {
5291 /*
5292 * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
5293 * We only want to use it on Linux, if it's available. On BSD we want to dup()
5294 * sockets instead of re-binding them.
5295 */
5296 #if (defined(SO_REUSEPORT) && defined(__linux__)) || \
5297 (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
5298 int sock, yes = 1;
5299 sock = socket(AF_INET, SOCK_DGRAM, 0);
5300 if (sock < 0) {
5301 sock = socket(AF_INET6, SOCK_DGRAM, 0);
5302 if (sock < 0) {
5303 return;
5304 }
5305 }
5306 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
5307 sizeof(yes)) < 0) {
5308 close(sock);
5309 return;
5310 #if defined(__FreeBSD_kernel__)
5311 } else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
5312 sizeof(yes)) < 0)
5313 #else /* if defined(__FreeBSD_kernel__) */
5314 } else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
5315 sizeof(yes)) < 0)
5316 #endif /* if defined(__FreeBSD_kernel__) */
5317 {
5318 close(sock);
5319 return;
5320 }
5321 hasreuseport = true;
5322 close(sock);
5323 #endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
5324 * (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
5325 }
5326
5327 bool
isc_socket_hasreuseport()5328 isc_socket_hasreuseport() {
5329 RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
5330 ISC_R_SUCCESS);
5331 return (hasreuseport);
5332 }
5333
#if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
/*
 * Map a socket type to the fixed label used in the XML/JSON statistics
 * output; any unrecognized type is reported as "not-initialized".
 */
static const char *
_socktype(isc_sockettype_t type) {
	if (type == isc_sockettype_udp) {
		return ("udp");
	}
	if (type == isc_sockettype_tcp) {
		return ("tcp");
	}
	if (type == isc_sockettype_unix) {
		return ("unix");
	}
	return ("not-initialized");
}
#endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
5349
#ifdef HAVE_LIBXML2
/*
 * Evaluate 'a'; on a negative (libxml2 error) return, jump to the
 * function-local "error" label with the failure code left in 'xmlrc'.
 */
#define TRY0(a)                     \
	do {                        \
		xmlrc = (a);        \
		if (xmlrc < 0)      \
			goto error; \
	} while (0)
/*
 * Render every socket owned by 'mgr0' as a <socket> element inside a
 * <sockets> element on the xmlTextWriter 'writer0': id, optional name,
 * reference count, type, peer/local addresses (when available) and the
 * current state flags.  Returns the last libxml2 return code (negative
 * on failure).
 */
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, void *writer0) {
	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
	isc__socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	int xmlrc;
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		/* Each socket is locked only while its element is written. */
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		/* "id": the socket's address, as a unique identifier. */
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR peerbuf));
		}

		/*
		 * The local address is queried from the kernel; on
		 * getsockname() failure the element is silently omitted.
		 */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR peerbuf));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	/* TRY0 may jump here with the current socket still locked;
	 * after a clean loop 'sock' is NULL. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
#endif /* HAVE_LIBXML2 */
5455
#ifdef HAVE_JSON_C
/*
 * On allocation failure from json-c, record ISC_R_NOMEMORY and jump to
 * the function-local cleanup label.
 */
#define CHECKMEM(m)                              \
	do {                                     \
		if ((m) == NULL) {               \
			result = ISC_R_NOMEMORY; \
			goto error;              \
		}                                \
	} while (0)

/*
 * Render every socket owned by 'mgr0' as a JSON array attached to the
 * 'stats0' object under the key "sockets": id, optional name, reference
 * count, type, peer/local addresses (when available) and state flags.
 *
 * Ownership: each 'entry' is handed to 'array' as soon as it is created,
 * and each sub-object is handed to its parent right after its NULL
 * check, so dropping 'array' on the error path releases everything
 * built so far.
 */
isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t *mgr0, void *stats0) {
	isc_result_t result = ISC_R_SUCCESS;
	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
	isc__socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	json_object *obj, *array = json_object_new_array();
	json_object *stats = (json_object *)stats0;

	/*
	 * Bail out before taking the manager lock: the "error" label
	 * below unlocks mgr->lock, which would be invalid here because
	 * the lock is not yet held.  (The old code jumped to "error" via
	 * CHECKMEM and unlocked an unheld mutex on this path.)
	 */
	if (array == NULL) {
		return (ISC_R_NOMEMORY);
	}

	LOCK(&mgr->lock);

	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		json_object *states, *entry;
		char buf[255];

		/*
		 * Lock the socket before any allocation inside the loop:
		 * the error path unconditionally unlocks 'sock' when it is
		 * non-NULL, so the lock must already be held if CHECKMEM
		 * fires.  (Previously CHECKMEM(entry) could jump to
		 * "error" before the lock was taken.)
		 */
		LOCK(&sock->lock);

		entry = json_object_new_object();
		CHECKMEM(entry);
		json_object_array_add(array, entry);

		/* "id": the socket's address, as a unique identifier. */
		snprintf(buf, sizeof(buf), "%p", sock);
		obj = json_object_new_string(buf);
		CHECKMEM(obj);
		json_object_object_add(entry, "id", obj);

		if (sock->name[0] != 0) {
			obj = json_object_new_string(sock->name);
			CHECKMEM(obj);
			json_object_object_add(entry, "name", obj);
		}

		obj = json_object_new_int(
			(int)isc_refcount_current(&sock->references));
		CHECKMEM(obj);
		json_object_object_add(entry, "references", obj);

		obj = json_object_new_string(_socktype(sock->type));
		CHECKMEM(obj);
		json_object_object_add(entry, "type", obj);

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "peer-address", obj);
		}

		/*
		 * The local address is queried from the kernel; on
		 * getsockname() failure the field is silently omitted.
		 */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "local-address", obj);
		}

		states = json_object_new_array();
		CHECKMEM(states);
		json_object_object_add(entry, "states", states);

		if (sock->listener) {
			obj = json_object_new_string("listener");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connected) {
			obj = json_object_new_string("connected");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connecting) {
			obj = json_object_new_string("connecting");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->bound) {
			obj = json_object_new_string("bound");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}

	/* Hand the finished array to 'stats'; clearing 'array' keeps the
	 * cleanup below from double-freeing it. */
	json_object_object_add(stats, "sockets", array);
	array = NULL;
	result = ISC_R_SUCCESS;

error:
	if (array != NULL) {
		json_object_put(array);
	}

	/* A CHECKMEM failure inside the loop leaves 'sock' locked. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (result);
}
#endif /* HAVE_JSON_C */
5576
5577 isc_result_t
isc_socketmgr_createinctx(isc_mem_t * mctx,isc_socketmgr_t ** managerp)5578 isc_socketmgr_createinctx(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
5579 isc_result_t result;
5580
5581 result = isc_socketmgr_create(mctx, managerp);
5582
5583 return (result);
5584 }
5585