1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
7  *
8  * See the COPYRIGHT file distributed with this work for additional
9  * information regarding copyright ownership.
10  */
11 
12 /*! \file */
13 
14 #include <inttypes.h>
15 #include <stdbool.h>
16 #include <sys/param.h>
17 #include <sys/socket.h>
18 #include <sys/stat.h>
19 #include <sys/types.h>
20 #if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
21 #include <sys/sysctl.h>
22 #endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
23 #include <sys/time.h>
24 #include <sys/uio.h>
25 
26 #if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
27 #include <linux/netlink.h>
28 #include <linux/rtnetlink.h>
29 #endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
30 	*/
31 
32 #include <errno.h>
33 #include <fcntl.h>
34 #include <stddef.h>
35 #include <stdlib.h>
36 #include <sys/un.h>
37 #include <unistd.h>
38 
39 #include <isc/app.h>
40 #include <isc/buffer.h>
41 #include <isc/condition.h>
42 #include <isc/formatcheck.h>
43 #include <isc/list.h>
44 #include <isc/log.h>
45 #include <isc/mem.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/print.h>
50 #include <isc/refcount.h>
51 #include <isc/region.h>
52 #include <isc/resource.h>
53 #include <isc/socket.h>
54 #include <isc/stats.h>
55 #include <isc/strerr.h>
56 #include <isc/string.h>
57 #include <isc/task.h>
58 #include <isc/thread.h>
59 #include <isc/util.h>
60 
61 #ifdef HAVE_KQUEUE
62 #include <sys/event.h>
63 #endif /* ifdef HAVE_KQUEUE */
64 #ifdef HAVE_EPOLL_CREATE1
65 #include <sys/epoll.h>
66 #endif /* ifdef HAVE_EPOLL_CREATE1 */
67 #if defined(HAVE_SYS_DEVPOLL_H)
68 #include <sys/devpoll.h>
69 #elif defined(HAVE_DEVPOLL_H)
70 #include <devpoll.h>
71 #endif /* if defined(HAVE_SYS_DEVPOLL_H) */
72 
73 #include <netinet/tcp.h>
74 
75 #include "errno2result.h"
76 #include "socket_p.h"
77 
78 #ifdef ENABLE_TCP_FASTOPEN
79 #include <netinet/tcp.h>
80 #endif /* ifdef ENABLE_TCP_FASTOPEN */
81 
82 #ifdef HAVE_JSON_C
83 #include <json_object.h>
84 #endif /* HAVE_JSON_C */
85 
86 #ifdef HAVE_LIBXML2
87 #include <libxml/xmlwriter.h>
88 #define ISC_XMLCHAR (const xmlChar *)
89 #endif /* HAVE_LIBXML2 */
90 
91 /*%
92  * Choose the most preferable multiplex method.
93  */
94 #if defined(HAVE_KQUEUE)
95 #define USE_KQUEUE
96 #elif defined(HAVE_EPOLL_CREATE1)
97 #define USE_EPOLL
98 #elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
99 #define USE_DEVPOLL
100 typedef struct {
101 	unsigned int want_read : 1, want_write : 1;
102 } pollinfo_t;
103 #else /* if defined(HAVE_KQUEUE) */
104 #define USE_SELECT
105 #endif /* HAVE_KQUEUE */
106 
107 /*
108  * Set by the -T dscp option on the command line. If set to a value
109  * other than -1, we check to make sure DSCP values match it, and
110  * assert if not.
111  */
112 int isc_dscp_check_value = -1;
113 
114 /*%
115  * Maximum number of allowable open sockets.  This is also the maximum
116  * allowable socket file descriptor.
117  *
118  * Care should be taken before modifying this value for select():
119  * The API standard doesn't ensure select() accept more than (the system default
120  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
121  * the vast majority of cases.  This constant should therefore be increased only
122  * when absolutely necessary and possible, i.e., the server is exhausting all
123  * available file descriptors (up to FD_SETSIZE) and the select() function
124  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
125  * always by true, but we keep using some of them to ensure as much
126  * portability as possible).  Note also that overall server performance
127  * may be rather worsened with a larger value of this constant due to
128  * inherent scalability problems of select().
129  *
130  * As a special note, this value shouldn't have to be touched if
131  * this is a build for an authoritative only DNS server.
132  */
133 #ifndef ISC_SOCKET_MAXSOCKETS
134 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
135 #ifdef TUNE_LARGE
136 #define ISC_SOCKET_MAXSOCKETS 21000
137 #else /* ifdef TUNE_LARGE */
138 #define ISC_SOCKET_MAXSOCKETS 4096
139 #endif /* TUNE_LARGE */
140 #elif defined(USE_SELECT)
141 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
142 #endif /* USE_KQUEUE... */
143 #endif /* ISC_SOCKET_MAXSOCKETS */
144 
145 #ifdef USE_SELECT
146 /*%
147  * Mac OS X needs a special definition to support larger values in select().
148  * We always define this because a larger value can be specified run-time.
149  */
150 #ifdef __APPLE__
151 #define _DARWIN_UNLIMITED_SELECT
152 #endif /* __APPLE__ */
153 #endif /* USE_SELECT */
154 
155 #ifdef ISC_SOCKET_USE_POLLWATCH
156 /*%
157  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
158  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
159  * some of the specified FD.  The idea is based on the observation that it's
160  * likely for a busy server to keep receiving packets.  It specifically works
161  * as follows: the socket watcher is first initialized with the state of
162  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
163  * event occurs.  When it wakes up for a socket I/O event, it moves to the
164  * poll_active state, and sets the poll timeout to a short period
165  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
166  * watcher goes to the poll_checking state with the same timeout period.
167  * In this state, the watcher tries to detect whether this is a break
168  * during intermittent events or the kernel bug is triggered.  If the next
169  * polling reports an event within the short period, the previous timeout is
170  * likely to be a kernel bug, and so the watcher goes back to the active state.
171  * Otherwise, it moves to the idle state again.
172  *
173  * It's not clear whether this is a thread-related bug, but since we've only
174  * seen this with threads, this workaround is used only when enabling threads.
175  */
176 
177 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
178 
179 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
180 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
181 #endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
182 #endif /* ISC_SOCKET_USE_POLLWATCH */
183 
184 /*%
185  * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
186  */
187 #define FDLOCK_BITS  10
188 #define FDLOCK_COUNT (1 << FDLOCK_BITS)
189 #define FDLOCK_ID(fd)                                   \
190 	(((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
191 	 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
192 
193 /*%
194  * Maximum number of events communicated with the kernel.  There should normally
195  * be no need for having a large number.
196  */
197 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
198 #ifndef ISC_SOCKET_MAXEVENTS
199 #ifdef TUNE_LARGE
200 #define ISC_SOCKET_MAXEVENTS 2048
201 #else /* ifdef TUNE_LARGE */
202 #define ISC_SOCKET_MAXEVENTS 64
203 #endif /* TUNE_LARGE */
204 #endif /* ifndef ISC_SOCKET_MAXEVENTS */
205 #endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
206 	* */
207 
208 /*%
209  * Some systems define the socket length argument as an int, some as size_t,
210  * some as socklen_t.  This is here so it can be easily changed if needed.
211  */
212 #ifndef socklen_t
213 #define socklen_t unsigned int
214 #endif /* ifndef socklen_t */
215 
216 /*%
217  * Define what the possible "soft" errors can be.  These are non-fatal returns
218  * of various network related functions, like recv() and so on.
219  *
220  * For some reason, BSDI (and perhaps others) will sometimes return <0
221  * from recv() but will have errno==0.  This is broken, but we have to
222  * work around it here.
223  */
224 #define SOFT_ERROR(e)                                             \
225 	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
226 	 (e) == EINTR || (e) == 0)
227 
228 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
229 
230 /*!<
231  * DLVL(90)  --  Function entry/exit and other tracing.
232  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
233  * DLVL(60)  --  Socket data send/receive
234  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
235  * DLVL(20)  --  Socket creation/destruction.
236  */
237 #define TRACE_LEVEL	  90
238 #define CORRECTNESS_LEVEL 70
239 #define IOEVENT_LEVEL	  60
240 #define EVENT_LEVEL	  50
241 #define CREATION_LEVEL	  20
242 
243 #define TRACE	    DLVL(TRACE_LEVEL)
244 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
245 #define IOEVENT	    DLVL(IOEVENT_LEVEL)
246 #define EVENT	    DLVL(EVENT_LEVEL)
247 #define CREATION    DLVL(CREATION_LEVEL)
248 
249 typedef isc_event_t intev_t;
250 
251 #define SOCKET_MAGIC	ISC_MAGIC('I', 'O', 'i', 'o')
252 #define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
253 
254 /*!
255  * IPv6 control information.  If the socket is an IPv6 socket we want
256  * to collect the destination address and interface so the client can
257  * set them on outgoing packets.
258  */
259 #ifndef USE_CMSG
260 #define USE_CMSG 1
261 #endif /* ifndef USE_CMSG */
262 
263 /*%
264  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
265  * a setsockopt() like interface to request timestamps, and if the OS
266  * doesn't do it for us, call gettimeofday() on every UDP receive?
267  */
268 #ifdef SO_TIMESTAMP
269 #ifndef USE_CMSG
270 #define USE_CMSG 1
271 #endif /* ifndef USE_CMSG */
272 #endif /* ifdef SO_TIMESTAMP */
273 
274 #if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
275 #define SET_RCVBUF
276 #endif
277 
278 #if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
279 #define SET_SNDBUF
280 #endif
281 
282 /*%
283  * Instead of calculating the cmsgbuf lengths every time we take
284  * a rule of thumb approach - sizes are taken from x86_64 linux,
285  * multiplied by 2, everything should fit. Those sizes are not
286  * large enough to cause any concern.
287  */
288 #if defined(USE_CMSG)
289 #define CMSG_SP_IN6PKT 40
290 #else /* if defined(USE_CMSG) */
291 #define CMSG_SP_IN6PKT 0
292 #endif /* if defined(USE_CMSG) */
293 
294 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
295 #define CMSG_SP_TIMESTAMP 32
296 #else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
297 #define CMSG_SP_TIMESTAMP 0
298 #endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
299 
300 #if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
301 #define CMSG_SP_TCTOS 24
302 #else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
303 #define CMSG_SP_TCTOS 0
304 #endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
305 
306 #define CMSG_SP_INT 24
307 
308 /* Align cmsg buffers to be safe on SPARC etc. */
309 #define RECVCMSGBUFLEN                                                       \
310 	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
311 			  1,                                                 \
312 		  sizeof(void *))
313 #define SENDCMSGBUFLEN                                                    \
314 	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
315 		  sizeof(void *))
316 
317 /*%
318  * The number of times a send operation is repeated if the result is EINTR.
319  */
320 #define NRETRIES 10
321 
322 typedef struct isc__socketthread isc__socketthread_t;
323 
324 #define NEWCONNSOCK(ev) ((ev)->newsocket)
325 
/*%
 * A single managed socket.  Fields in the first group are set at
 * creation time and never change; the rest are protected by 'lock'.
 */
struct isc_socket {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MAGIC when valid */
	isc_socketmgr_t *manager;
	isc_mutex_t lock;
	isc_sockettype_t type;
	const isc_statscounter_t *statsindex; /* one of the *statsindex maps */
	isc_refcount_t references;

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t) link; /* chain on manager->socklist */
	int fd;			     /* OS socket descriptor */
	int pf;			     /* protocol family */
	int threadid;		     /* owning watcher thread index */
	char name[16];		     /* socket name */
	void *tag;		     /* opaque user data */

	/* Queues of pending completion events, drained by the watcher. */
	ISC_LIST(isc_socketevent_t) send_list;
	ISC_LIST(isc_socketevent_t) recv_list;
	ISC_LIST(isc_socket_newconnev_t) accept_list;
	ISC_LIST(isc_socket_connev_t) connect_list;

	isc_sockaddr_t peer_address; /* remote address */

	unsigned int listener : 1,	       /* listener socket */
		connected : 1, connecting : 1, /* connect pending
						* */
		bound	: 1,		       /* bound to local addr */
		active	: 1,		       /* currently active */
		pktdscp : 1;		       /* per packet dscp */

#ifdef ISC_PLATFORM_RECVOVERFLOW
	unsigned char overflow; /* used for MSG_TRUNC fake */
#endif				/* ifdef ISC_PLATFORM_RECVOVERFLOW */

	unsigned int dscp;
};
363 
#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)     ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

/*%
 * The socket manager: owns the watcher threads and the list of all
 * sockets.  Fields below the marker are protected by 'lock' except
 * where noted.
 */
struct isc_socketmgr {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MANAGER_MAGIC when valid */
	isc_mem_t *mctx;
	isc_mutex_t lock;
	isc_stats_t *stats; /* may be NULL; see inc_stats()/dec_stats() */
	int nthreads;	    /* number of entries in 'threads' */
	isc__socketthread_t *threads;
	unsigned int maxsocks; /* upper bound on fd values we manage */
	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t) socklist;
	int reserved; /* unlocked */
	isc_condition_t shutdown_ok;
	size_t maxudp;
};
382 
/*%
 * Per-watcher-thread state.  Each thread owns one instance of the
 * compile-time-selected event backend (kqueue/epoll//dev/poll/select)
 * and a self-pipe used by select_poke()/select_readmsg() to wake it.
 */
struct isc__socketthread {
	isc_socketmgr_t *manager; /* owning manager */
	int threadid;		  /* index within manager->threads */
	isc_thread_t thread;
	int pipe_fds[2];     /* self-pipe: [0] read end, [1] write end */
	isc_mutex_t *fdlock; /* FDLOCK_COUNT bucket locks, see FDLOCK_ID() */
	/* Locked by fdlock. */
	isc_socket_t **fds; /* fd -> socket map */
	int *fdstate;	    /* fd -> CLOSED/MANAGED/CLOSE_PENDING */
#ifdef USE_KQUEUE
	int kqueue_fd;
	int nevents;
	struct kevent *events;
#endif /* USE_KQUEUE */
#ifdef USE_EPOLL
	int epoll_fd;
	int nevents;
	struct epoll_event *events;
	uint32_t *epoll_events; /* cached per-fd event mask (EPOLLIN|EPOLLOUT) */
#endif /* USE_EPOLL */
#ifdef USE_DEVPOLL
	int devpoll_fd;
	isc_resourcevalue_t open_max;
	unsigned int calls;
	int nevents;
	struct pollfd *events;
	pollinfo_t *fdpollinfo; /* which directions each fd is polled for */
#endif /* USE_DEVPOLL */
#ifdef USE_SELECT
	int fd_bufsize; /* byte size of each fd_set buffer */
	fd_set *read_fds;
	fd_set *read_fds_copy;
	fd_set *write_fds;
	fd_set *write_fds_copy;
	int maxfd;
#endif /* USE_SELECT */
};
420 
421 #define CLOSED	      0 /* this one must be zero */
422 #define MANAGED	      1
423 #define CLOSE_PENDING 2
424 
425 /*
426  * send() and recv() iovec counts
427  */
428 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
429 #ifdef ISC_PLATFORM_RECVOVERFLOW
430 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
431 #else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
432 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
433 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
434 
435 static isc_result_t
436 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
437 	      isc_socket_t **socketp);
438 static void
439 send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
440 static void
441 send_senddone_event(isc_socket_t *, isc_socketevent_t **);
442 static void
443 send_connectdone_event(isc_socket_t *, isc_socket_connev_t **);
444 static void
445 free_socket(isc_socket_t **);
446 static isc_result_t
447 allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **);
448 static void
449 destroy(isc_socket_t **);
450 static void
451 internal_accept(isc_socket_t *);
452 static void
453 internal_connect(isc_socket_t *);
454 static void
455 internal_recv(isc_socket_t *);
456 static void
457 internal_send(isc_socket_t *);
458 static void
459 process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
460 static void
461 build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
462 		  struct iovec *, size_t *);
463 static void
464 build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
465 		  struct iovec *, size_t *);
466 static bool
467 process_ctlfd(isc__socketthread_t *thread);
468 static void
469 setdscp(isc_socket_t *sock, isc_dscp_t dscp);
470 
471 #define SELECT_POKE_SHUTDOWN (-1)
472 #define SELECT_POKE_NOTHING  (-2)
473 #define SELECT_POKE_READ     (-3)
474 #define SELECT_POKE_ACCEPT   (-3) /*%< Same as _READ */
475 #define SELECT_POKE_WRITE    (-4)
476 #define SELECT_POKE_CONNECT  (-4) /*%< Same as _WRITE */
477 #define SELECT_POKE_CLOSE    (-5)
478 
479 /*%
480  * Shortcut index arrays to get access to statistics counters.
481  */
482 enum {
483 	STATID_OPEN = 0,
484 	STATID_OPENFAIL = 1,
485 	STATID_CLOSE = 2,
486 	STATID_BINDFAIL = 3,
487 	STATID_CONNECTFAIL = 4,
488 	STATID_CONNECT = 5,
489 	STATID_ACCEPTFAIL = 6,
490 	STATID_ACCEPT = 7,
491 	STATID_SENDFAIL = 8,
492 	STATID_RECVFAIL = 9,
493 	STATID_ACTIVE = 10
494 };
/*
 * Per-socket-type counter maps, indexed by the STATID_* constants above.
 * A -1 entry marks an event that cannot occur for that socket type (e.g.
 * accept on UDP); such entries must never reach inc_stats()/dec_stats(),
 * which REQUIRE(counterid != -1).
 */
static const isc_statscounter_t udp4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1, /* acceptfail: not applicable to UDP */
	-1, /* accept: not applicable to UDP */
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
};
static const isc_statscounter_t udp6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1, /* acceptfail: not applicable to UDP */
	-1, /* accept: not applicable to UDP */
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
};
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1, /* bindfail: not counted for raw sockets */
	-1, /* connectfail: not counted for raw sockets */
	-1, /* connect: not counted for raw sockets */
	-1, /* acceptfail: not applicable to raw sockets */
	-1, /* accept: not applicable to raw sockets */
	-1, /* sendfail: not counted for raw sockets */
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
558 
559 static int
560 gen_threadid(isc_socket_t *sock);
561 
562 static int
gen_threadid(isc_socket_t * sock)563 gen_threadid(isc_socket_t *sock) {
564 	return (sock->fd % sock->manager->nthreads);
565 }
566 
567 static void
568 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
569 	    isc_logmodule_t *module, int level, const char *fmt, ...)
570 	ISC_FORMAT_PRINTF(5, 6);
571 static void
manager_log(isc_socketmgr_t * sockmgr,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)572 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
573 	    isc_logmodule_t *module, int level, const char *fmt, ...) {
574 	char msgbuf[2048];
575 	va_list ap;
576 
577 	if (!isc_log_wouldlog(isc_lctx, level)) {
578 		return;
579 	}
580 
581 	va_start(ap, fmt);
582 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
583 	va_end(ap);
584 
585 	isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
586 		      sockmgr, msgbuf);
587 }
588 
589 static void
590 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
591 	   isc_logmodule_t *module, int level, const char *fmt, ...)
592 	ISC_FORMAT_PRINTF(5, 6);
593 static void
thread_log(isc__socketthread_t * thread,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)594 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
595 	   isc_logmodule_t *module, int level, const char *fmt, ...) {
596 	char msgbuf[2048];
597 	va_list ap;
598 
599 	if (!isc_log_wouldlog(isc_lctx, level)) {
600 		return;
601 	}
602 
603 	va_start(ap, fmt);
604 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
605 	va_end(ap);
606 
607 	isc_log_write(isc_lctx, category, module, level,
608 		      "sockmgr %p thread %d: %s", thread->manager,
609 		      thread->threadid, msgbuf);
610 }
611 
612 static void
613 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
614 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
615 	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
616 static void
socket_log(isc_socket_t * sock,const isc_sockaddr_t * address,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)617 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
618 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
619 	   const char *fmt, ...) {
620 	char msgbuf[2048];
621 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
622 	va_list ap;
623 
624 	if (!isc_log_wouldlog(isc_lctx, level)) {
625 		return;
626 	}
627 
628 	va_start(ap, fmt);
629 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
630 	va_end(ap);
631 
632 	if (address == NULL) {
633 		isc_log_write(isc_lctx, category, module, level,
634 			      "socket %p: %s", sock, msgbuf);
635 	} else {
636 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
637 		isc_log_write(isc_lctx, category, module, level,
638 			      "socket %p %s: %s", sock, peerbuf, msgbuf);
639 	}
640 }
641 
642 /*%
643  * Increment socket-related statistics counters.
644  */
645 static inline void
inc_stats(isc_stats_t * stats,isc_statscounter_t counterid)646 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
647 	REQUIRE(counterid != -1);
648 
649 	if (stats != NULL) {
650 		isc_stats_increment(stats, counterid);
651 	}
652 }
653 
654 /*%
655  * Decrement socket-related statistics counters.
656  */
657 static inline void
dec_stats(isc_stats_t * stats,isc_statscounter_t counterid)658 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
659 	REQUIRE(counterid != -1);
660 
661 	if (stats != NULL) {
662 		isc_stats_decrement(stats, counterid);
663 	}
664 }
665 
/*%
 * Start watching 'fd' for readability or writability on the given
 * watcher thread, using whichever event backend was selected at compile
 * time.  'msg' selects the direction: SELECT_POKE_READ requests read
 * events, anything else write events.  Returns ISC_R_SUCCESS or an
 * error translated from errno.
 */
static inline isc_result_t
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	/* kqueue: register one EV_ADD change record for the direction. */
	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	uint32_t oldevents;
	int ret;
	int op;

	/*
	 * epoll keeps a single event mask per fd, so merge the new
	 * interest bit into the cached mask and use ADD or MOD depending
	 * on whether the fd was previously registered (mask was 0).
	 */
	oldevents = thread->epoll_events[fd];
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] |= EPOLLIN;
	} else {
		thread->epoll_events[fd] |= EPOLLOUT;
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	/*
	 * NOTE(review): the socket's own lock is held across epoll_ctl(),
	 * presumably to serialize with a concurrent close/re-register of
	 * the same fd — confirm against the close path.
	 */
	if (thread->fds[fd] != NULL) {
		LOCK(&thread->fds[fd]->lock);
	}
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	if (thread->fds[fd] != NULL) {
		UNLOCK(&thread->fds[fd]->lock);
	}
	if (ret == -1) {
		/* EEXIST means our cached mask disagreed with the kernel. */
		if (errno == EEXIST) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d",
					 fd);
		}
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;

	/* /dev/poll is driven by writing pollfd records to the device. */
	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ) {
		pfd.events = POLLIN;
	} else {
		pfd.events = POLLOUT;
	}
	pfd.fd = fd;
	pfd.revents = 0;
	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
		result = isc__errno2result(errno);
	} else {
		/*
		 * Remember which directions we asked for; unwatch_fd()
		 * needs this to re-arm the other direction after the
		 * all-or-nothing POLLREMOVE.
		 */
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 1;
		} else {
			thread->fdpollinfo[fd].want_write = 1;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	/* The manager lock protects the shared fd_set bitmaps. */
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_SET(fd, thread->read_fds);
	}
	if (msg == SELECT_POKE_WRITE) {
		FD_SET(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
757 
/*%
 * Stop watching 'fd' for the direction selected by 'msg'
 * (SELECT_POKE_READ for read events, anything else write events) on the
 * given watcher thread.  Returns ISC_R_SUCCESS, an error translated
 * from errno, or ISC_R_UNEXPECTED on an epoll_ctl() failure other than
 * ENOENT.
 */
static inline isc_result_t
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	/* kqueue: submit a single EV_DELETE change for the direction. */
	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	int ret;
	int op;

	/* Clear the interest bit in the cached per-fd mask. */
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] &= ~(EPOLLIN);
	} else {
		thread->epoll_events[fd] &= ~(EPOLLOUT);
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	/* If no interest remains, deregister the fd entirely. */
	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	/* ENOENT (fd already gone from the epoll set) is tolerated. */
	if (ret == -1 && errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
				 fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
		result = isc__errno2result(errno);
	} else {
		/* Keep the want_read/want_write bookkeeping in sync. */
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 0;
		} else {
			thread->fdpollinfo[fd].want_write = 0;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	/* The manager lock protects the shared fd_set bitmaps. */
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_CLR(fd, thread->read_fds);
	} else if (msg == SELECT_POKE_WRITE) {
		FD_CLR(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
850 
851 /*
852  * A poke message was received, perform a proper watch/unwatch
853  * on a fd provided
854  */
855 static void
wakeup_socket(isc__socketthread_t * thread,int fd,int msg)856 wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
857 	isc_result_t result;
858 	int lockid = FDLOCK_ID(fd);
859 
860 	/*
861 	 * This is a wakeup on a socket.  If the socket is not in the
862 	 * process of being closed, start watching it for either reads
863 	 * or writes.
864 	 */
865 
866 	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);
867 
868 	if (msg == SELECT_POKE_CLOSE) {
869 		LOCK(&thread->fdlock[lockid]);
870 		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
871 		thread->fdstate[fd] = CLOSED;
872 		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
873 		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
874 		(void)close(fd);
875 		UNLOCK(&thread->fdlock[lockid]);
876 		return;
877 	}
878 
879 	LOCK(&thread->fdlock[lockid]);
880 	if (thread->fdstate[fd] == CLOSE_PENDING) {
881 		/*
882 		 * We accept (and ignore) any error from unwatch_fd() as we are
883 		 * closing the socket, hoping it doesn't leave dangling state in
884 		 * the kernel.
885 		 * Note that unwatch_fd() must be called after releasing the
886 		 * fdlock; otherwise it could cause deadlock due to a lock order
887 		 * reversal.
888 		 */
889 		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
890 		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
891 		UNLOCK(&thread->fdlock[lockid]);
892 		return;
893 	}
894 	if (thread->fdstate[fd] != MANAGED) {
895 		UNLOCK(&thread->fdlock[lockid]);
896 		return;
897 	}
898 
899 	/*
900 	 * Set requested bit.
901 	 */
902 	result = watch_fd(thread, fd, msg);
903 	if (result != ISC_R_SUCCESS) {
904 		/*
905 		 * XXXJT: what should we do?  Ignoring the failure of watching
906 		 * a socket will make the application dysfunctional, but there
907 		 * seems to be no reasonable recovery process.
908 		 */
909 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
910 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
911 			      "failed to start watching FD (%d): %s", fd,
912 			      isc_result_totext(result));
913 	}
914 	UNLOCK(&thread->fdlock[lockid]);
915 }
916 
917 /*
918  * Poke the select loop when there is something for us to do.
919  * The write is required (by POSIX) to complete.  That is, we
920  * will not get partial writes.
921  */
922 static void
select_poke(isc_socketmgr_t * mgr,int threadid,int fd,int msg)923 select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
924 	int cc;
925 	int buf[2];
926 	char strbuf[ISC_STRERRORSIZE];
927 
928 	buf[0] = fd;
929 	buf[1] = msg;
930 
931 	do {
932 		cc = write(mgr->threads[threadid].pipe_fds[1], buf,
933 			   sizeof(buf));
934 #ifdef ENOSR
935 		/*
936 		 * Treat ENOSR as EAGAIN but loop slowly as it is
937 		 * unlikely to clear fast.
938 		 */
939 		if (cc < 0 && errno == ENOSR) {
940 			sleep(1);
941 			errno = EAGAIN;
942 		}
943 #endif /* ifdef ENOSR */
944 	} while (cc < 0 && SOFT_ERROR(errno));
945 
946 	if (cc < 0) {
947 		strerror_r(errno, strbuf, sizeof(strbuf));
948 		FATAL_ERROR(__FILE__, __LINE__,
949 			    "write() failed during watcher poke: %s", strbuf);
950 	}
951 
952 	INSIST(cc == sizeof(buf));
953 }
954 
955 /*
956  * Read a message on the internal fd.
957  */
958 static void
select_readmsg(isc__socketthread_t * thread,int * fd,int * msg)959 select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
960 	int buf[2];
961 	int cc;
962 	char strbuf[ISC_STRERRORSIZE];
963 
964 	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
965 	if (cc < 0) {
966 		*msg = SELECT_POKE_NOTHING;
967 		*fd = -1; /* Silence compiler. */
968 		if (SOFT_ERROR(errno)) {
969 			return;
970 		}
971 
972 		strerror_r(errno, strbuf, sizeof(strbuf));
973 		FATAL_ERROR(__FILE__, __LINE__,
974 			    "read() failed during watcher poke: %s", strbuf);
975 	}
976 	INSIST(cc == sizeof(buf));
977 
978 	*fd = buf[0];
979 	*msg = buf[1];
980 }
981 
982 /*
983  * Make a fd non-blocking.
984  */
985 static isc_result_t
make_nonblock(int fd)986 make_nonblock(int fd) {
987 	int ret;
988 	char strbuf[ISC_STRERRORSIZE];
989 #ifdef USE_FIONBIO_IOCTL
990 	int on = 1;
991 #else  /* ifdef USE_FIONBIO_IOCTL */
992 	int flags;
993 #endif /* ifdef USE_FIONBIO_IOCTL */
994 
995 #ifdef USE_FIONBIO_IOCTL
996 	ret = ioctl(fd, FIONBIO, (char *)&on);
997 #else  /* ifdef USE_FIONBIO_IOCTL */
998 	flags = fcntl(fd, F_GETFL, 0);
999 	flags |= O_NONBLOCK;
1000 	ret = fcntl(fd, F_SETFL, flags);
1001 #endif /* ifdef USE_FIONBIO_IOCTL */
1002 
1003 	if (ret == -1) {
1004 		strerror_r(errno, strbuf, sizeof(strbuf));
1005 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1006 #ifdef USE_FIONBIO_IOCTL
1007 				 "ioctl(%d, FIONBIO, &on): %s", fd,
1008 #else  /* ifdef USE_FIONBIO_IOCTL */
1009 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1010 #endif /* ifdef USE_FIONBIO_IOCTL */
1011 				 strbuf);
1012 
1013 		return (ISC_R_UNEXPECTED);
1014 	}
1015 
1016 	return (ISC_R_SUCCESS);
1017 }
1018 
1019 #ifdef USE_CMSG
1020 /*
1021  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1022  * In order to ensure as much portability as possible, we provide wrapper
1023  * functions of these macros.
1024  * Note that cmsg_space() could run slow on OSes that do not have
1025  * CMSG_SPACE.
1026  */
static inline socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else  /* ifdef CMSG_LEN */
	/*
	 * Compute the header offset by hand: CMSG_DATA applied to a NULL
	 * cmsghdr yields the (aligned) size of the header itself.  The
	 * NULL cast keeps CMSG_DATA's pointer arithmetic correct.
	 */
	socklen_t hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));

	return (hdrlen + len);
#endif /* ifdef CMSG_LEN */
}
1042 
static inline socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else  /* ifdef CMSG_SPACE */
	struct msghdr mhdr;
	struct cmsghdr *hdr;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	/*
	 * Derive the padded size by asking CMSG_NXTHDR where the *next*
	 * header would start after one of cmsg_len(len) bytes.
	 */
	memset(&mhdr, 0, sizeof(mhdr));
	mhdr.msg_control = dummybuf;
	mhdr.msg_controllen = sizeof(dummybuf);

	hdr = (struct cmsghdr *)dummybuf;
	hdr->cmsg_len = cmsg_len(len);

	hdr = CMSG_NXTHDR(&mhdr, hdr);
	if (hdr == NULL) {
		return (0);
	}
	return ((char *)hdr - (char *)mhdr.msg_control);
#endif /* ifdef CMSG_SPACE */
}
1071 #endif /* USE_CMSG */
1072 
1073 /*
1074  * Process control messages received on a socket.
1075  */
static void
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
	struct in6_pktinfo *pktinfop;
#ifdef SO_TIMESTAMP
	void *timevalp;
#endif /* ifdef SO_TIMESTAMP */
#endif /* ifdef USE_CMSG */

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

	/* Propagate the kernel's truncation flags onto the event. */
#ifdef MSG_TRUNC
	if ((msg->msg_flags & MSG_TRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
	}
#endif /* ifdef MSG_TRUNC */

#ifdef MSG_CTRUNC
	if ((msg->msg_flags & MSG_CTRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
	}
#endif /* ifdef MSG_CTRUNC */

#ifndef USE_CMSG
	return;
#else /* ifndef USE_CMSG */
	/* Nothing to scan if no ancillary data arrived. */
	if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
		return;
	}

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif /* ifdef SO_TIMESTAMP */
	pktinfop = NULL;

	/*
	 * Walk each control message and record the ones we care about
	 * (receiving interface, kernel timestamp, DSCP/TOS bits).
	 */
	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);

		/* IPV6_PKTINFO: which interface/address the packet came in on. */
		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_PKTINFO) {
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memmove(&dev->pktinfo, pktinfop,
				sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			}
			goto next;
		}

#ifdef SO_TIMESTAMP
		/* SCM_TIMESTAMP: kernel receive time, converted to sec/nsec. */
		if (cmsgp->cmsg_level == SOL_SOCKET &&
		    cmsgp->cmsg_type == SCM_TIMESTAMP) {
			struct timeval tv;
			timevalp = CMSG_DATA(cmsgp);
			memmove(&tv, timevalp, sizeof(tv));
			dev->timestamp.seconds = tv.tv_sec;
			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif /* ifdef SO_TIMESTAMP */

#ifdef IPV6_TCLASS
		/* IPv6 traffic class: DSCP lives in the top 6 bits. */
		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_TCLASS) {
			dev->dscp = *(int *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IPV6_TCLASS */

#ifdef IP_TOS
		/* IPv4 TOS byte (type varies by platform): DSCP in top 6 bits. */
		if (cmsgp->cmsg_level == IPPROTO_IP &&
		    (cmsgp->cmsg_type == IP_TOS
#ifdef IP_RECVTOS
		     || cmsgp->cmsg_type == IP_RECVTOS
#endif /* ifdef IP_RECVTOS */
		     ))
		{
			dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IP_TOS */
	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */
}
1181 
1182 /*
1183  * Construct an iov array and attach it to the msghdr passed in.  This is
1184  * the SEND constructor, which will use the used region of the buffer
1185  * (if using a buffer list) or will use the internal region (if a single
1186  * buffer I/O is requested).
1187  *
1188  * Nothing can be NULL, and the done event must list at least one buffer
1189  * on the buffer linked list for this function to be meaningful.
1190  *
1191  * If write_countp != NULL, *write_countp will hold the number of bytes
1192  * this transaction can send.
1193  */
static void
build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
	unsigned int iovcount;
	size_t write_count;
	struct cmsghdr *cmsgp;

	memset(msg, 0, sizeof(*msg));

	/*
	 * Connected sockets must not pass a destination address;
	 * unconnected (UDP) sends use the address on the event.
	 */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	/* Send only the unsent remainder (dev->n bytes already done). */
	write_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = write_count;
	iovcount = 1;

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG)

	/*
	 * Attach IPV6_PKTINFO ancillary data so the reply leaves via the
	 * same interface/address the query arrived on.
	 */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
	{
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp = (struct cmsghdr *)cmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}

#if defined(IPV6_USE_MIN_MTU)
	/* Ask the kernel to fragment to the IPv6 minimum MTU if requested. */
	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
	{
		int use_min_mtu = 1; /* -1, 0, 1 */

		/* Append after any cmsg already placed in cmsgbuf. */
		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
	}
#endif /* if defined(IPV6_USE_MIN_MTU) */

	/* Debugging aid: assert the DSCP value matches the configured check. */
	if (isc_dscp_check_value > -1) {
		if (sock->type == isc_sockettype_udp) {
			INSIST((int)dev->dscp == isc_dscp_check_value);
		} else if (sock->type == isc_sockettype_tcp) {
			INSIST((int)sock->dscp == isc_dscp_check_value);
		}
	}

#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
	/*
	 * Apply the per-packet DSCP: as ancillary data when the socket
	 * supports per-packet DSCP (pktdscp), otherwise via setsockopt(),
	 * caching the value on the socket to avoid redundant calls.
	 */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
	{
		int dscp = (dev->dscp << 2) & 0xff; /* DSCP -> TOS/TCLASS byte */

		INSIST(dev->dscp < 0x40);

#ifdef IP_TOS
		if (sock->pf == AF_INET && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IP;
			cmsgp->cmsg_type = IP_TOS;
			cmsgp->cmsg_len = cmsg_len(sizeof(char));
			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
				       (void *)&dscp, sizeof(int)) < 0) {
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IP_TOS, %.02x)"
						 " failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* ifdef IP_TOS */
#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
		if (sock->pf == AF_INET6 && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IPV6;
			cmsgp->cmsg_type = IPV6_TCLASS;
			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IPV6_TCLASS, "
						 "%.02x) failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
		/* Zero any unused tail of the control buffer. */
		if (msg->msg_controllen != 0 &&
		    msg->msg_controllen < SENDCMSGBUFLEN) {
			memset(cmsgbuf + msg->msg_controllen, 0,
			       SENDCMSGBUFLEN - msg->msg_controllen);
		}
	}
#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
	* defined(IPV6_TCLASS))                           \
	* */
#endif /* USE_CMSG */

	if (write_countp != NULL) {
		*write_countp = write_count;
	}
}
1345 
1346 /*
1347  * Construct an iov array and attach it to the msghdr passed in.  This is
1348  * the RECV constructor, which will use the available region of the buffer
1349  * (if using a buffer list) or will use the internal region (if a single
1350  * buffer I/O is requested).
1351  *
1352  * Nothing can be NULL, and the done event must list at least one buffer
1353  * on the buffer linked list for this function to be meaningful.
1354  *
1355  * If read_countp != NULL, *read_countp will hold the number of bytes
1356  * this transaction can receive.
1357  */
static void
build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
	unsigned int iovcount;
	size_t read_count;

	memset(msg, 0, sizeof(struct msghdr));

	/*
	 * UDP: capture the sender's address into the event.
	 * TCP: the peer is fixed; copy it from the socket.
	 */
	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->peer_address;
	}

	/* Receive into the unfilled remainder of the region. */
	read_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = read_count;
	iovcount = 1;

	/*
	 * If needed, set up to receive that one extra byte.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

	/* Provide a control buffer so process_cmsg() can see ancillary data. */
#if defined(USE_CMSG)
	msg->msg_control = cmsgbuf;
	msg->msg_controllen = RECVCMSGBUFLEN;
#else  /* if defined(USE_CMSG) */
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
#endif /* USE_CMSG */
	msg->msg_flags = 0;

	if (read_countp != NULL) {
		*read_countp = read_count;
	}
}
1409 
1410 static void
set_dev_address(const isc_sockaddr_t * address,isc_socket_t * sock,isc_socketevent_t * dev)1411 set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
1412 		isc_socketevent_t *dev) {
1413 	if (sock->type == isc_sockettype_udp) {
1414 		if (address != NULL) {
1415 			dev->address = *address;
1416 		} else {
1417 			dev->address = sock->peer_address;
1418 		}
1419 	} else if (sock->type == isc_sockettype_tcp) {
1420 		INSIST(address == NULL);
1421 		dev->address = sock->peer_address;
1422 	}
1423 }
1424 
1425 static void
destroy_socketevent(isc_event_t * event)1426 destroy_socketevent(isc_event_t *event) {
1427 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1428 
1429 	(ev->destroy)(event);
1430 }
1431 
1432 static isc_socketevent_t *
allocate_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)1433 allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
1434 		     isc_taskaction_t action, void *arg) {
1435 	isc_socketevent_t *ev;
1436 
1437 	ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
1438 						     action, arg, sizeof(*ev));
1439 
1440 	ev->result = ISC_R_UNSET;
1441 	ISC_LINK_INIT(ev, ev_link);
1442 	ev->region.base = NULL;
1443 	ev->n = 0;
1444 	ev->offset = 0;
1445 	ev->attributes = 0;
1446 	ev->destroy = ev->ev_destroy;
1447 	ev->ev_destroy = destroy_socketevent;
1448 	ev->dscp = 0;
1449 
1450 	return (ev);
1451 }
1452 
1453 #if defined(ISC_SOCKET_DEBUG)
/* Debug helper: print every field of a msghdr, including each iovec. */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) {
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base,
		       (long)msg->msg_iov[i].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
1468 #endif /* if defined(ISC_SOCKET_DEBUG) */
1469 
1470 #define DOIO_SUCCESS 0 /* i/o ok, event sent */
1471 #define DOIO_SOFT    1 /* i/o ok, soft error, no event sent */
1472 #define DOIO_HARD    2 /* i/o error, event sent */
1473 #define DOIO_EOF     3 /* EOF, no event sent */
1474 
/*
 * Attempt a single recvmsg() for 'dev' on 'sock'.  Returns DOIO_SUCCESS
 * (event complete), DOIO_SOFT (retry later, no event sent), DOIO_HARD
 * (fatal error, dev->result set), or DOIO_EOF (stream closed by peer).
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	recv_errno = errno; /* save: later calls may clobber errno */

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * SOFT_OR_HARD maps an errno to an ISC result: a hard failure on a
 * connected socket, a retryable soft error otherwise.  ALWAYS_HARD
 * always fails hard.  Both are #undef'ed below.
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (recv_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (recv_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * Older operating systems may still return EPROTO in some
		 * situations, for example when receiving ICMP/ICMPv6 errors.
		 * A real life scenario is when ICMPv6 returns code 5 or 6.
		 * These codes are introduced in RFC 4443 from March 2006,
		 * and the document obsoletes RFC 1885. But unfortunately not
		 * all operating systems have caught up with the new standard
		 * (in 2020) and thus a generic protocol error is returned.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Anything unrecognized is a hard error. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Packets from source port zero are bogus; drop silently. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp) {
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1636 
1637 /*
1638  * Returns:
1639  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1640  *			ISC_R_SUCCESS.
1641  *
1642  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1643  *			dev->result contains the appropriate error.
1644  *
1645  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1646  *			event was sent.  The operation should be retried.
1647  *
1648  *	No other return values are possible.
1649  */
1650 static int
doio_send(isc_socket_t * sock,isc_socketevent_t * dev)1651 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1652 	int cc;
1653 	struct iovec iov[MAXSCATTERGATHER_SEND];
1654 	size_t write_count;
1655 	struct msghdr msghdr;
1656 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1657 	int attempts = 0;
1658 	int send_errno;
1659 	char strbuf[ISC_STRERRORSIZE];
1660 	char cmsgbuf[SENDCMSGBUFLEN] = { 0 };
1661 
1662 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1663 
1664 resend:
1665 	if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
1666 	    write_count > sock->manager->maxudp)
1667 	{
1668 		cc = write_count;
1669 	} else {
1670 		cc = sendmsg(sock->fd, &msghdr, 0);
1671 	}
1672 	send_errno = errno;
1673 
1674 	/*
1675 	 * Check for error or block condition.
1676 	 */
1677 	if (cc < 0) {
1678 		if (send_errno == EINTR && ++attempts < NRETRIES) {
1679 			goto resend;
1680 		}
1681 
1682 		if (SOFT_ERROR(send_errno)) {
1683 			if (errno == EWOULDBLOCK || errno == EAGAIN) {
1684 				dev->result = ISC_R_WOULDBLOCK;
1685 			}
1686 			return (DOIO_SOFT);
1687 		}
1688 
1689 #define SOFT_OR_HARD(_system, _isc)                                   \
1690 	if (send_errno == _system) {                                  \
1691 		if (sock->connected) {                                \
1692 			dev->result = _isc;                           \
1693 			inc_stats(sock->manager->stats,               \
1694 				  sock->statsindex[STATID_SENDFAIL]); \
1695 			return (DOIO_HARD);                           \
1696 		}                                                     \
1697 		return (DOIO_SOFT);                                   \
1698 	}
1699 #define ALWAYS_HARD(_system, _isc)                            \
1700 	if (send_errno == _system) {                          \
1701 		dev->result = _isc;                           \
1702 		inc_stats(sock->manager->stats,               \
1703 			  sock->statsindex[STATID_SENDFAIL]); \
1704 		return (DOIO_HARD);                           \
1705 	}
1706 
1707 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1708 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1709 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1710 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1711 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1712 #ifdef EHOSTDOWN
1713 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1714 #endif /* ifdef EHOSTDOWN */
1715 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1716 		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
1717 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1718 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1719 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1720 
1721 #undef SOFT_OR_HARD
1722 #undef ALWAYS_HARD
1723 
1724 		/*
1725 		 * The other error types depend on whether or not the
1726 		 * socket is UDP or TCP.  If it is UDP, some errors
1727 		 * that we expect to be fatal under TCP are merely
1728 		 * annoying, and are really soft errors.
1729 		 *
1730 		 * However, these soft errors are still returned as
1731 		 * a status.
1732 		 */
1733 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1734 		strerror_r(send_errno, strbuf, sizeof(strbuf));
1735 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1736 				 addrbuf, strbuf);
1737 		dev->result = isc__errno2result(send_errno);
1738 		inc_stats(sock->manager->stats,
1739 			  sock->statsindex[STATID_SENDFAIL]);
1740 		return (DOIO_HARD);
1741 	}
1742 
1743 	if (cc == 0) {
1744 		inc_stats(sock->manager->stats,
1745 			  sock->statsindex[STATID_SENDFAIL]);
1746 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1747 				 "doio_send: send() returned 0");
1748 	}
1749 
1750 	/*
1751 	 * If we write less than we expected, update counters, poke.
1752 	 */
1753 	dev->n += cc;
1754 	if ((size_t)cc != write_count) {
1755 		return (DOIO_SOFT);
1756 	}
1757 
1758 	/*
1759 	 * Exactly what we wanted to write.  We're done with this
1760 	 * entry.  Post its completion event.
1761 	 */
1762 	dev->result = ISC_R_SUCCESS;
1763 	return (DOIO_SUCCESS);
1764 }
1765 
1766 /*
1767  * Kill.
1768  *
1769  * Caller must ensure that the socket is not locked and no external
1770  * references exist.
1771  */
static void
socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) {
	int lockid = FDLOCK_ID(fd);
	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&thread->fdlock[lockid]);
	thread->fds[fd] = NULL;
	thread->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&thread->fdlock[lockid]);
	/* Hand the actual close() over to the watcher thread. */
	select_poke(thread->manager, thread->threadid, fd, SELECT_POKE_CLOSE);

	inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);

	/* Decrement the active-socket gauge exactly once. */
	LOCK(&sock->lock);
	if (sock->active == 1) {
		dec_stats(thread->manager->stats,
			  sock->statsindex[STATID_ACTIVE]);
		sock->active = 0;
	}
	UNLOCK(&sock->lock);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&thread->manager->lock);
	if (thread->maxfd == fd) {
		int i;

		/* Scan downward for the next-highest managed fd. */
		thread->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&thread->fdlock[lockid]);
			if (thread->fdstate[i] == MANAGED) {
				thread->maxfd = i;
				UNLOCK(&thread->fdlock[lockid]);
				break;
			}
			UNLOCK(&thread->fdlock[lockid]);
		}
		/* The wakeup pipe must always stay under watch. */
		if (thread->maxfd < thread->pipe_fds[0]) {
			thread->maxfd = thread->pipe_fds[0];
		}
	}

	UNLOCK(&thread->manager->lock);
#endif /* USE_SELECT */
}
1824 
1825 static void
destroy(isc_socket_t ** sockp)1826 destroy(isc_socket_t **sockp) {
1827 	int fd = 0;
1828 	isc_socket_t *sock = *sockp;
1829 	isc_socketmgr_t *manager = sock->manager;
1830 	isc__socketthread_t *thread = NULL;
1831 
1832 	socket_log(sock, NULL, CREATION, "destroying");
1833 
1834 	isc_refcount_destroy(&sock->references);
1835 
1836 	LOCK(&sock->lock);
1837 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
1838 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
1839 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1840 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1841 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1842 
1843 	if (sock->fd >= 0) {
1844 		fd = sock->fd;
1845 		thread = &manager->threads[sock->threadid];
1846 		sock->fd = -1;
1847 		sock->threadid = -1;
1848 	}
1849 	UNLOCK(&sock->lock);
1850 
1851 	if (fd > 0) {
1852 		socketclose(thread, sock, fd);
1853 	}
1854 
1855 	LOCK(&manager->lock);
1856 
1857 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1858 
1859 	if (ISC_LIST_EMPTY(manager->socklist)) {
1860 		SIGNAL(&manager->shutdown_ok);
1861 	}
1862 
1863 	/* can't unlock manager as its memory context is still used */
1864 	free_socket(sockp);
1865 
1866 	UNLOCK(&manager->lock);
1867 }
1868 
1869 static isc_result_t
allocate_socket(isc_socketmgr_t * manager,isc_sockettype_t type,isc_socket_t ** socketp)1870 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1871 		isc_socket_t **socketp) {
1872 	isc_socket_t *sock;
1873 
1874 	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1875 
1876 	sock->magic = 0;
1877 	isc_refcount_init(&sock->references, 0);
1878 
1879 	sock->manager = manager;
1880 	sock->type = type;
1881 	sock->fd = -1;
1882 	sock->threadid = -1;
1883 	sock->dscp = 0; /* TOS/TCLASS is zero until set. */
1884 	sock->statsindex = NULL;
1885 	sock->active = 0;
1886 
1887 	ISC_LINK_INIT(sock, link);
1888 
1889 	memset(sock->name, 0, sizeof(sock->name));
1890 	sock->tag = NULL;
1891 
1892 	/*
1893 	 * Set up list of readers and writers to be initially empty.
1894 	 */
1895 	ISC_LIST_INIT(sock->recv_list);
1896 	ISC_LIST_INIT(sock->send_list);
1897 	ISC_LIST_INIT(sock->accept_list);
1898 	ISC_LIST_INIT(sock->connect_list);
1899 
1900 	sock->listener = 0;
1901 	sock->connected = 0;
1902 	sock->connecting = 0;
1903 	sock->bound = 0;
1904 	sock->pktdscp = 0;
1905 
1906 	/*
1907 	 * Initialize the lock.
1908 	 */
1909 	isc_mutex_init(&sock->lock);
1910 
1911 	sock->magic = SOCKET_MAGIC;
1912 	*socketp = sock;
1913 
1914 	return (ISC_R_SUCCESS);
1915 }
1916 
1917 /*
1918  * This event requires that the various lists be empty, that the reference
1919  * count be 1, and that the magic number is valid.  The other socket bits,
1920  * like the lock, must be initialized as well.  The fd associated must be
1921  * marked as closed, by setting it to -1 on close, or this routine will
1922  * also close the socket.
1923  */
1924 static void
free_socket(isc_socket_t ** socketp)1925 free_socket(isc_socket_t **socketp) {
1926 	isc_socket_t *sock = *socketp;
1927 	*socketp = NULL;
1928 
1929 	INSIST(VALID_SOCKET(sock));
1930 	isc_refcount_destroy(&sock->references);
1931 	LOCK(&sock->lock);
1932 	INSIST(!sock->connecting);
1933 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1934 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1935 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
1936 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
1937 	INSIST(!ISC_LINK_LINKED(sock, link));
1938 	UNLOCK(&sock->lock);
1939 
1940 	sock->magic = 0;
1941 
1942 	isc_mutex_destroy(&sock->lock);
1943 
1944 	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1945 }
1946 
#if defined(SET_RCVBUF)
static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
static int rcvbuf = ISC_RECV_BUFFER_SIZE;

/*
 * Determine the largest SO_RCVBUF value, up to ISC_RECV_BUFFER_SIZE,
 * that the kernel will accept, leaving the result in the file-scope
 * 'rcvbuf'.  Intended to run once via 'rcvbuf_once'.  Probes with a
 * throwaway UDP socket (IPv4 first, falling back to IPv6) and then
 * binary-searches between the kernel's current value and the requested
 * maximum.  On any failure 'rcvbuf' is left at a value the kernel has
 * already accepted.
 */
static void
set_rcvbuf(void) {
	int fd;
	int max = rcvbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 UDP support; try an IPv6 probe socket. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	/* 'min' starts as the kernel's current receive buffer size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
	    min < rcvbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
			       sizeof(rcvbuf)) == -1)
		{
			if (errno == ENOBUFS && rcvbuf > min) {
				/* Too big: bisect toward the known-good value. */
				max = rcvbuf - 1;
				rcvbuf = (rcvbuf + min) / 2;
				goto again;
			} else {
				/* Unexpected failure: keep the accepted size. */
				rcvbuf = min;
				goto cleanup;
			}
		} else {
			/* Accepted: this becomes the new lower bound. */
			min = rcvbuf;
		}
		if (min != max) {
			/* Try the remaining upper range. */
			rcvbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_RCVBUF) */
2004 
#if defined(SET_SNDBUF)
static isc_once_t sndbuf_once = ISC_ONCE_INIT;
static int sndbuf = ISC_SEND_BUFFER_SIZE;

/*
 * Determine the largest SO_SNDBUF value, up to ISC_SEND_BUFFER_SIZE,
 * that the kernel will accept, leaving the result in the file-scope
 * 'sndbuf'.  Intended to run once via 'sndbuf_once'.  Same probing and
 * bisection strategy as set_rcvbuf() above, applied to the send buffer.
 */
static void
set_sndbuf(void) {
	int fd;
	int max = sndbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 UDP support; try an IPv6 probe socket. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	/* 'min' starts as the kernel's current send buffer size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
	    min < sndbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno == ENOBUFS && sndbuf > min) {
				/* Too big: bisect toward the known-good value. */
				max = sndbuf - 1;
				sndbuf = (sndbuf + min) / 2;
				goto again;
			} else {
				/* Unexpected failure: keep the accepted size. */
				sndbuf = min;
				goto cleanup;
			}
		} else {
			/* Accepted: this becomes the new lower bound. */
			min = sndbuf;
		}
		if (min != max) {
			/* Try the remaining upper range. */
			sndbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_SNDBUF) */
2062 
2063 static void
use_min_mtu(isc_socket_t * sock)2064 use_min_mtu(isc_socket_t *sock) {
2065 #if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
2066 	UNUSED(sock);
2067 #endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
2068 #ifdef IPV6_USE_MIN_MTU
2069 	/* use minimum MTU */
2070 	if (sock->pf == AF_INET6) {
2071 		int on = 1;
2072 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2073 				 (void *)&on, sizeof(on));
2074 	}
2075 #endif /* ifdef IPV6_USE_MIN_MTU */
2076 #if defined(IPV6_MTU)
2077 	/*
2078 	 * Use minimum MTU on IPv6 sockets.
2079 	 */
2080 	if (sock->pf == AF_INET6) {
2081 		int mtu = 1280;
2082 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
2083 				 sizeof(mtu));
2084 	}
2085 #endif /* if defined(IPV6_MTU) */
2086 }
2087 
2088 static void
set_tcp_maxseg(isc_socket_t * sock,int size)2089 set_tcp_maxseg(isc_socket_t *sock, int size) {
2090 #ifdef TCP_MAXSEG
2091 	if (sock->type == isc_sockettype_tcp) {
2092 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
2093 				 (void *)&size, sizeof(size));
2094 	}
2095 #endif /* ifdef TCP_MAXSEG */
2096 }
2097 
2098 static void
set_ip_disable_pmtud(isc_socket_t * sock)2099 set_ip_disable_pmtud(isc_socket_t *sock) {
2100 	/*
2101 	 * Disable Path MTU Discover on IP packets
2102 	 */
2103 	if (sock->pf == AF_INET6) {
2104 #if defined(IPV6_DONTFRAG)
2105 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
2106 				 &(int){ 0 }, sizeof(int));
2107 #endif
2108 #if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
2109 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
2110 				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
2111 #endif
2112 	} else if (sock->pf == AF_INET) {
2113 #if defined(IP_DONTFRAG)
2114 		(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, &(int){ 0 },
2115 				 sizeof(int));
2116 #endif
2117 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
2118 		(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2119 				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
2120 #endif
2121 	}
2122 }
2123 
/*
 * Open the OS-level file descriptor for 'sock' according to its type
 * (UDP, TCP, unix-domain, or raw routing socket), retrying on EINTR,
 * and apply the standard socket options: non-blocking mode, optional
 * SIGPIPE suppression, minimum-MTU/MSS for IPv6 TCP, control-message
 * options for UDP, receive/send buffer sizing, TCLASS/TOS reception,
 * and path-MTU-discovery disabling.
 *
 * Requires: sock->statsindex has been set by the caller (it is
 * dereferenced for failure/open statistics).
 *
 * Returns ISC_R_SUCCESS, ISC_R_NORESOURCES, ISC_R_FAMILYNOSUPPORT, or
 * ISC_R_UNEXPECTED; on failure the OPENFAIL statistic is incremented
 * and no fd is left open.
 */
static isc_result_t
opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
	int on = 1;
#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
	socklen_t optlen;
	int size = 0;
#endif

again:
	/* Create the descriptor; EINTR is retried a bounded number of times. */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
		break;
	case isc_sockettype_tcp:
		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
		break;
	case isc_sockettype_unix:
		sock->fd = socket(sock->pf, SOCK_STREAM, 0);
		break;
	case isc_sockettype_raw:
		/* Preset errno in case no branch below creates a socket. */
		errno = EPFNOSUPPORT;
		/*
		 * PF_ROUTE is an alias for PF_NETLINK on linux.
		 */
#if defined(PF_ROUTE)
		if (sock->fd == -1 && sock->pf == PF_ROUTE) {
#ifdef NETLINK_ROUTE
			sock->fd = socket(sock->pf, SOCK_RAW, NETLINK_ROUTE);
#else  /* ifdef NETLINK_ROUTE */
			sock->fd = socket(sock->pf, SOCK_RAW, 0);
#endif /* ifdef NETLINK_ROUTE */
			if (sock->fd != -1) {
#ifdef NETLINK_ROUTE
				struct sockaddr_nl sa;
				int n;

				/*
				 * Do an implicit bind (subscribes to IPv4 and
				 * IPv6 interface-address change notifications).
				 */
				memset(&sa, 0, sizeof(sa));
				sa.nl_family = AF_NETLINK;
				sa.nl_groups = RTMGRP_IPV4_IFADDR |
					       RTMGRP_IPV6_IFADDR;
				n = bind(sock->fd, (struct sockaddr *)&sa,
					 sizeof(sa));
				if (n < 0) {
					close(sock->fd);
					sock->fd = -1;
				}
#endif /* ifdef NETLINK_ROUTE */
				sock->bound = 1;
			}
		}
#endif /* if defined(PF_ROUTE) */
		break;
	}
	if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
		goto again;
	}

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved)
	{
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
		/* Preserve errno across the close() below. */
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl";
	}
#endif /* ifdef F_DUPFD */

	/* Refuse descriptors beyond what the manager's tables can hold. */
	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "socket: file descriptor exceeds limit (%d/%u)",
			      sock->fd, manager->maxsocks);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		/* Map errno to a result code and account for the failure. */
		switch (errno) {
		case EMFILE:
		case ENFILE:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strbuf);
		/* fallthrough */
		case ENOBUFS:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
					 err, strbuf);
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_UNEXPECTED);
		}
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (result);
	}

#ifdef SO_NOSIGPIPE
	/* Suppress SIGPIPE on writes to a closed peer, where supported. */
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
		       sizeof(on)) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
				 sock->fd, strbuf);
		/* Press on... */
	}
#endif /* ifdef SO_NOSIGPIPE */

	/*
	 * Use minimum mtu if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
	if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		/* Kernel timestamps on received datagrams (via cmsg). */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
			       sizeof(on)) < 0 &&
		    errno != ENOPROTOOPT)
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) failed: "
					 "%s",
					 sock->fd, strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "failed: %s",
					 sock->fd, strbuf);
		}
#else  /* ifdef IPV6_RECVPKTINFO */
		/* RFC 2292 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* defined(USE_CMSG) */

#if defined(SET_RCVBUF)
		/* Grow SO_RCVBUF to the probed maximum (set_rcvbuf, once). */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < rcvbuf)
		{
			RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_RCVBUF, "
						 "%d) failed: %s",
						 sock->fd, rcvbuf, strbuf);
			}
		}
#endif /* if defined(SET_RCVBUF) */

#if defined(SET_SNDBUF)
		/* Grow SO_SNDBUF to the probed maximum (set_sndbuf, once). */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < sndbuf)
		{
			RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
				       (void *)&sndbuf, sizeof(sndbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_SNDBUF, "
						 "%d) failed: %s",
						 sock->fd, sndbuf, strbuf);
			}
		}
#endif /* if defined(SET_SNDBUF) */
	}
#ifdef IPV6_RECVTCLASS
	/* Receive the IPv6 traffic class of incoming packets (cmsg). */
	if ((sock->pf == AF_INET6) &&
	    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVTCLASS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IPV6_RECVTCLASS */
#ifdef IP_RECVTOS
	/* Receive the IPv4 TOS byte of incoming packets (cmsg). */
	if ((sock->pf == AF_INET) &&
	    (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IP_RECVTOS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IP_RECVTOS */
#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */

	set_ip_disable_pmtud(sock);

	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
	if (sock->active == 0) {
		inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}
2407 
/*
 * Create a 'type' socket, managed by 'manager'.  Events will be posted to
 * 'task' and when dispatched 'action' will be called with 'arg' as the arg
 * value.  The new socket is returned in 'socketp'.
 *
 * Allocates the structure, picks the statistics index for the socket
 * type, opens the underlying fd via opensocket(), assigns the socket to
 * a watcher thread, and registers the fd in that thread's tables and
 * the manager's socket list.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp) {
	isc_socket_t *sock = NULL;
	isc__socketthread_t *thread;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/* Select per-protocol statistics counters; must precede opensocket(). */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex = (pf == AF_INET) ? udp4statsindex
						   : udp6statsindex;
#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
		/* Can per-packet DSCP be set via cmsg for this family? */
		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
		break;
	case isc_sockettype_tcp:
		sock->statsindex = (pf == AF_INET) ? tcp4statsindex
						   : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	case isc_sockettype_raw:
		sock->statsindex = rawstatsindex;
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	sock->pf = pf;

	result = opensocket(manager, sock);
	if (result != ISC_R_SUCCESS) {
		free_socket(&sock);
		return (result);
	}

	if (sock->fd == -1) {
		abort();
	}
	sock->threadid = gen_threadid(sock);
	isc_refcount_increment0(&sock->references);
	thread = &manager->threads[sock->threadid];
	*socketp = sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the fd with its watcher thread under the fd bucket lock. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
	thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (thread->maxfd < sock->fd) {
		thread->maxfd = sock->fd;
	}
#endif /* ifdef USE_SELECT */
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION, "created");

	return (ISC_R_SUCCESS);
}
2498 
2499 /*%
2500  * Create a new 'type' socket managed by 'manager'.  Events
2501  * will be posted to 'task' and when dispatched 'action' will be
2502  * called with 'arg' as the arg value.  The new socket is returned
2503  * in 'socketp'.
2504  */
2505 isc_result_t
isc_socket_create(isc_socketmgr_t * manager,int pf,isc_sockettype_t type,isc_socket_t ** socketp)2506 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2507 		  isc_socket_t **socketp) {
2508 	return (socket_create(manager, pf, type, socketp));
2509 }
2510 
/*
 * Re-open the underlying fd of an existing (closed) socket and register
 * it with a watcher thread.  Requires that the caller holds at least
 * one reference and that the socket currently has no fd or thread.
 *
 * NOTE(review): sock->fd/sock->threadid are written after sock->lock is
 * released; presumably the caller guarantees exclusive access at this
 * point (the REQUIREs imply no concurrent I/O) — confirm against callers.
 */
isc_result_t
isc_socket_open(isc_socket_t *sock) {
	isc_result_t result;
	isc__socketthread_t *thread;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(isc_refcount_current(&sock->references) >= 1);
	REQUIRE(sock->fd == -1);
	REQUIRE(sock->threadid == -1);

	result = opensocket(sock->manager, sock);

	UNLOCK(&sock->lock);

	if (result != ISC_R_SUCCESS) {
		sock->fd = -1;
	} else {
		/* Assign a watcher thread and register the new fd with it. */
		sock->threadid = gen_threadid(sock);
		thread = &sock->manager->threads[sock->threadid];
		int lockid = FDLOCK_ID(sock->fd);

		LOCK(&thread->fdlock[lockid]);
		thread->fds[sock->fd] = sock;
		thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
		thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
		INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
		       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
		UNLOCK(&thread->fdlock[lockid]);

#ifdef USE_SELECT
		/* Keep the select() fd bound up to date. */
		LOCK(&sock->manager->lock);
		if (thread->maxfd < sock->fd) {
			thread->maxfd = sock->fd;
		}
		UNLOCK(&sock->manager->lock);
#endif /* ifdef USE_SELECT */
	}

	return (result);
}
2558 
2559 /*
2560  * Attach to a socket.  Caller must explicitly detach when it is done.
2561  */
2562 void
isc_socket_attach(isc_socket_t * sock,isc_socket_t ** socketp)2563 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2564 	REQUIRE(VALID_SOCKET(sock));
2565 	REQUIRE(socketp != NULL && *socketp == NULL);
2566 
2567 	int old_refs = isc_refcount_increment(&sock->references);
2568 	REQUIRE(old_refs > 0);
2569 
2570 	*socketp = sock;
2571 }
2572 
2573 /*
2574  * Dereference a socket.  If this is the last reference to it, clean things
2575  * up by destroying the socket.
2576  */
2577 void
isc_socket_detach(isc_socket_t ** socketp)2578 isc_socket_detach(isc_socket_t **socketp) {
2579 	isc_socket_t *sock;
2580 
2581 	REQUIRE(socketp != NULL);
2582 	sock = *socketp;
2583 	REQUIRE(VALID_SOCKET(sock));
2584 	if (isc_refcount_decrement(&sock->references) == 1) {
2585 		destroy(&sock);
2586 	}
2587 
2588 	*socketp = NULL;
2589 }
2590 
/*
 * Close the underlying fd of 'sock' without destroying the structure,
 * resetting it to the "never opened" state so it can be reused via
 * isc_socket_open().  Requires an open fd and no pending I/O of any
 * kind.  Always returns ISC_R_SUCCESS.
 */
isc_result_t
isc_socket_close(isc_socket_t *sock) {
	int fd;
	isc_socketmgr_t *manager;
	isc__socketthread_t *thread;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);

	/* Closing with outstanding I/O queued is a caller bug. */
	INSIST(!sock->connecting);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->connect_list));

	/* Detach the fd/thread under the lock; close after unlocking. */
	manager = sock->manager;
	thread = &manager->threads[sock->threadid];
	fd = sock->fd;
	sock->fd = -1;
	sock->threadid = -1;

	/* Reset bookkeeping back to the freshly-allocated state. */
	memset(sock->name, 0, sizeof(sock->name));
	sock->tag = NULL;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;
	isc_sockaddr_any(&sock->peer_address);

	UNLOCK(&sock->lock);

	socketclose(thread, sock, fd);

	return (ISC_R_SUCCESS);
}
2629 
2630 /*
2631  * Dequeue an item off the given socket's read queue, set the result code
2632  * in the done event to the one provided, and send it to the task it was
2633  * destined for.
2634  *
2635  * If the event to be sent is on a list, remove it before sending.  If
2636  * asked to, send and detach from the socket as well.
2637  *
2638  * Caller must have the socket locked if the event is attached to the socket.
2639  */
2640 static void
send_recvdone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2641 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2642 	isc_task_t *task;
2643 
2644 	task = (*dev)->ev_sender;
2645 
2646 	(*dev)->ev_sender = sock;
2647 
2648 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2649 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2650 	}
2651 
2652 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2653 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2654 					 sock->threadid);
2655 	} else {
2656 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2657 	}
2658 }
2659 
2660 /*
2661  * See comments for send_recvdone_event() above.
2662  *
2663  * Caller must have the socket locked if the event is attached to the socket.
2664  */
2665 static void
send_senddone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2666 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2667 	isc_task_t *task;
2668 
2669 	INSIST(dev != NULL && *dev != NULL);
2670 
2671 	task = (*dev)->ev_sender;
2672 	(*dev)->ev_sender = sock;
2673 
2674 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2675 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2676 	}
2677 
2678 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2679 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2680 					 sock->threadid);
2681 	} else {
2682 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2683 	}
2684 }
2685 
2686 /*
2687  * See comments for send_recvdone_event() above.
2688  *
2689  * Caller must have the socket locked if the event is attached to the socket.
2690  */
2691 static void
send_connectdone_event(isc_socket_t * sock,isc_socket_connev_t ** dev)2692 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) {
2693 	isc_task_t *task;
2694 
2695 	INSIST(dev != NULL && *dev != NULL);
2696 
2697 	task = (*dev)->ev_sender;
2698 	(*dev)->ev_sender = sock;
2699 
2700 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2701 		ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
2702 	}
2703 
2704 	isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
2705 }
2706 
/*
 * Call accept() on a socket, to get the new file descriptor.  The listen
 * socket is used as a prototype to create a new isc_socket_t.  The new
 * socket has one outstanding reference.  The task receiving the event
 * will be detached from just after the event is delivered.
 *
 * On entry to this function, the event delivered is the internal
 * readable event, and the first item on the accept_list should be
 * the done event we want to send.  If the list is empty, this is a no-op,
 * so just unlock and return.
 *
 * Called with sock->lock held; it is released on every exit path.
 */
static void
internal_accept(isc_socket_t *sock) {
	isc_socketmgr_t *manager;
	isc__socketthread_t *thread, *nthread;
	isc_socket_newconnev_t *dev;
	isc_task_t *task;
	socklen_t addrlen;
	int fd;
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "accept";

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");

	manager = sock->manager;
	INSIST(VALID_MANAGER(manager));
	thread = &manager->threads[sock->threadid];

	INSIST(sock->listener);

	/*
	 * Get the first item off the accept list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->accept_list);
	if (dev == NULL) {
		/* Nobody is waiting; stop watching for readability. */
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
		UNLOCK(&sock->lock);
		return;
	}

	/*
	 * Try to accept the new connection.  If the accept fails with
	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
	 * again.  Also ignore ECONNRESET, which has been reported to
	 * be spuriously returned on Linux 2.2.19 although it is not
	 * a documented error for accept().  ECONNABORTED has been
	 * reported for Solaris 8.  The rest are thrown in not because
	 * we have seen them but because they are ignored by other
	 * daemons such as BIND 8 and Apache.
	 */

	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
		    (void *)&addrlen);

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio to work in.
	 */
	if (fd >= 0 && fd < 20) {
		int newfd, tmp;
		newfd = fcntl(fd, F_DUPFD, 20);
		/* Preserve errno across the close() below. */
		tmp = errno;
		(void)close(fd);
		errno = tmp;
		fd = newfd;
		err = "accept/fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (fd < 0) {
		if (SOFT_ERROR(errno)) {
			goto soft_error;
		}
		switch (errno) {
		case ENFILE:
		case EMFILE:
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: too many open file descriptors",
				      err);
			goto soft_error;

		case ENOBUFS:
		case ENOMEM:
		case ECONNRESET:
		case ECONNABORTED:
		case EHOSTUNREACH:
		case EHOSTDOWN:
		case ENETUNREACH:
		case ENETDOWN:
		case ECONNREFUSED:
#ifdef EPROTO
		case EPROTO:
#endif /* ifdef EPROTO */
#ifdef ENONET
		case ENONET:
#endif /* ifdef ENONET */
			goto soft_error;
		default:
			break;
		}
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: %s() failed: %s", err,
				 strbuf);
		/* Hard failure: still deliver the done event, with an error. */
		fd = -1;
		result = ISC_R_UNEXPECTED;
	} else {
		/* Validate what accept() told us about the peer. */
		if (addrlen == 0U) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() failed to return "
					 "remote address");

			(void)close(fd);
			goto soft_error;
		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
			   sock->pf) {
			UNEXPECTED_ERROR(
				__FILE__, __LINE__,
				"internal_accept(): "
				"accept() returned peer address "
				"family %u (expected %u)",
				NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
				sock->pf);
			(void)close(fd);
			goto soft_error;
		} else if (fd >= (int)manager->maxsocks) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "accept: file descriptor exceeds limit "
				      "(%d/%u)",
				      fd, manager->maxsocks);
			(void)close(fd);
			goto soft_error;
		}
	}

	if (fd != -1) {
		NEWCONNSOCK(dev)->peer_address.length = addrlen;
		NEWCONNSOCK(dev)->pf = sock->pf;
	}

	/*
	 * Pull off the done event.
	 */
	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);

	/*
	 * Poke watcher if there are more pending accepts.
	 */
	if (ISC_LIST_EMPTY(sock->accept_list)) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	}

	if (fd != -1) {
		result = make_nonblock(fd);
		if (result != ISC_R_SUCCESS) {
			(void)close(fd);
			fd = -1;
		}
	}

	/*
	 * We need to unlock sock->lock now to be able to lock manager->lock
	 * without risking a deadlock with xmlstats.
	 */
	UNLOCK(&sock->lock);

	/*
	 * -1 means the new socket didn't happen.
	 */
	if (fd != -1) {
		int lockid = FDLOCK_ID(fd);

		NEWCONNSOCK(dev)->fd = fd;
		NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
		NEWCONNSOCK(dev)->bound = 1;
		NEWCONNSOCK(dev)->connected = 1;
		nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];

		/*
		 * We already hold a lock on one fdlock in accepting thread,
		 * we need to make sure that we don't double lock.
		 */
		bool same_bucket = (sock->threadid ==
				    NEWCONNSOCK(dev)->threadid) &&
				   (FDLOCK_ID(sock->fd) == lockid);

		/*
		 * Use minimum mtu if possible.
		 */
		use_min_mtu(NEWCONNSOCK(dev));
		set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);

		/*
		 * Ensure DSCP settings are inherited across accept.
		 */
		setdscp(NEWCONNSOCK(dev), sock->dscp);

		/*
		 * Save away the remote address
		 */
		dev->address = NEWCONNSOCK(dev)->peer_address;

		if (NEWCONNSOCK(dev)->active == 0) {
			inc_stats(manager->stats,
				  NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
			NEWCONNSOCK(dev)->active = 1;
		}

		/* Register the accepted fd with its watcher thread. */
		if (!same_bucket) {
			LOCK(&nthread->fdlock[lockid]);
		}
		nthread->fds[fd] = NEWCONNSOCK(dev);
		nthread->fdstate[fd] = MANAGED;
#if defined(USE_EPOLL)
		nthread->epoll_events[fd] = 0;
#endif /* if defined(USE_EPOLL) */
		if (!same_bucket) {
			UNLOCK(&nthread->fdlock[lockid]);
		}

		LOCK(&manager->lock);

#ifdef USE_SELECT
		if (nthread->maxfd < fd) {
			nthread->maxfd = fd;
		}
#endif /* ifdef USE_SELECT */

		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
			   "accepted connection, new socket %p",
			   dev->newsocket);

		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);

		UNLOCK(&manager->lock);

		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
	} else {
		/* Accept failed hard: discard the pre-allocated socket. */
		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
		isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
		free_socket((isc_socket_t **)&dev->newsocket);
	}

	/*
	 * Fill in the done event details and send it off.
	 */
	dev->result = result;
	task = dev->ev_sender;
	dev->ev_sender = sock;

	isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
	return;

soft_error:
	/* Transient failure: keep watching and leave the request queued. */
	watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	UNLOCK(&sock->lock);

	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
	return;
}
2977 
2978 static void
internal_recv(isc_socket_t * sock)2979 internal_recv(isc_socket_t *sock) {
2980 	isc_socketevent_t *dev;
2981 
2982 	INSIST(VALID_SOCKET(sock));
2983 	REQUIRE(sock->fd >= 0);
2984 
2985 	dev = ISC_LIST_HEAD(sock->recv_list);
2986 	if (dev == NULL) {
2987 		goto finish;
2988 	}
2989 
2990 	socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
2991 		   dev, dev->ev_sender);
2992 
2993 	/*
2994 	 * Try to do as much I/O as possible on this socket.  There are no
2995 	 * limits here, currently.
2996 	 */
2997 	while (dev != NULL) {
2998 		switch (doio_recv(sock, dev)) {
2999 		case DOIO_SOFT:
3000 			goto finish;
3001 
3002 		case DOIO_EOF:
3003 			/*
3004 			 * read of 0 means the remote end was closed.
3005 			 * Run through the event queue and dispatch all
3006 			 * the events with an EOF result code.
3007 			 */
3008 			do {
3009 				dev->result = ISC_R_EOF;
3010 				send_recvdone_event(sock, &dev);
3011 				dev = ISC_LIST_HEAD(sock->recv_list);
3012 			} while (dev != NULL);
3013 			goto finish;
3014 
3015 		case DOIO_SUCCESS:
3016 		case DOIO_HARD:
3017 			send_recvdone_event(sock, &dev);
3018 			break;
3019 		}
3020 
3021 		dev = ISC_LIST_HEAD(sock->recv_list);
3022 	}
3023 
3024 finish:
3025 	if (ISC_LIST_EMPTY(sock->recv_list)) {
3026 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3027 			   SELECT_POKE_READ);
3028 	}
3029 }
3030 
3031 static void
internal_send(isc_socket_t * sock)3032 internal_send(isc_socket_t *sock) {
3033 	isc_socketevent_t *dev;
3034 
3035 	INSIST(VALID_SOCKET(sock));
3036 	REQUIRE(sock->fd >= 0);
3037 
3038 	dev = ISC_LIST_HEAD(sock->send_list);
3039 	if (dev == NULL) {
3040 		goto finish;
3041 	}
3042 	socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev,
3043 		   dev->ev_sender);
3044 
3045 	/*
3046 	 * Try to do as much I/O as possible on this socket.  There are no
3047 	 * limits here, currently.
3048 	 */
3049 	while (dev != NULL) {
3050 		switch (doio_send(sock, dev)) {
3051 		case DOIO_SOFT:
3052 			goto finish;
3053 
3054 		case DOIO_HARD:
3055 		case DOIO_SUCCESS:
3056 			send_senddone_event(sock, &dev);
3057 			break;
3058 		}
3059 
3060 		dev = ISC_LIST_HEAD(sock->send_list);
3061 	}
3062 
3063 finish:
3064 	if (ISC_LIST_EMPTY(sock->send_list)) {
3065 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3066 			   SELECT_POKE_WRITE);
3067 	}
3068 }
3069 
3070 /*
3071  * Process read/writes on each fd here.  Avoid locking
3072  * and unlocking twice if both reads and writes are possible.
3073  */
static void
process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
	isc_socket_t *sock;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&thread->fdlock[lockid]);

		/* Stop polling the fd; failures here are harmless. */
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		return;
	}

	sock = thread->fds[fd];
	if (sock == NULL) {
		/* No socket registered for this fd; spurious wakeup. */
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&sock->lock);

	if (sock->fd < 0) {
		/*
		 * Sock is being closed - the final external reference
		 * is gone but it was not yet removed from event loop
		 * and fdstate[]/fds[] as destroy() is waiting on
		 * thread->fdlock[lockid] or sock->lock that we're holding.
		 * Just release the locks and bail.
		 */
		UNLOCK(&sock->lock);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	REQUIRE(readable || writeable);
	if (writeable) {
		/*
		 * A writable event on a socket with 'connecting' set is
		 * a completed (or failed) connect(); otherwise resume
		 * queued sends.
		 */
		if (sock->connecting) {
			internal_connect(sock);
		} else {
			internal_send(sock);
		}
	}

	if (readable) {
		if (sock->listener) {
			internal_accept(sock); /* unlocks sock */
		} else {
			internal_recv(sock);
			UNLOCK(&sock->lock);
		}
	} else {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&thread->fdlock[lockid]);

	/*
	 * Socket destruction might be pending, it will resume
	 * after releasing fdlock and sock->lock.
	 */
}
3139 
/*
 * process_fds() has a backend-specific implementation for each event
 * loop (kqueue/epoll/devpoll/select): it walks the events reported by
 * the backend and calls process_fd() for each ready file descriptor.
 */
#ifdef USE_KQUEUE
static bool
process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
	int i;
	bool readable, writable;
	bool done = false;
	bool have_ctlevent = false;
	if (nevents == thread->nevents) {
		/*
		 * This is not an error, but something unexpected.  If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < thread->manager->maxsocks);
		/* Defer the control-pipe event until all fds are handled. */
		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		/* kqueue reports read/write readiness as separate filters. */
		readable = (events[i].filter == EVFILT_READ);
		writable = (events[i].filter == EVFILT_WRITE);
		process_fd(thread, events[i].ident, readable, writable);
	}

	if (have_ctlevent) {
		/* Returns true when a shutdown message was received. */
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_EPOLL)
static bool
process_fds(isc__socketthread_t *thread, struct epoll_event *events,
	    int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	/* Hitting the event-array limit may call for a larger
	 * ISC_SOCKET_MAXEVENTS; informational only. */
	if (nevents == thread->nevents) {
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
		/* Defer the control-pipe event until all fds are handled. */
		if (events[i].data.fd == thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		if ((events[i].events & EPOLLERR) != 0 ||
		    (events[i].events & EPOLLHUP) != 0) {
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway.  This is a
			 * bit inefficient, but should be okay for such rare
			 * events.  Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			int fd = events[i].data.fd;
			events[i].events |= thread->epoll_events[fd];
		}
		process_fd(thread, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

	if (have_ctlevent) {
		/* Returns true when a shutdown message was received. */
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_DEVPOLL)
static bool
process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	/* Hitting the event-array limit may call for a larger
	 * ISC_SOCKET_MAXEVENTS; informational only. */
	if (nevents == thread->nevents) {
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
		/* Defer the control-pipe event until all fds are handled. */
		if (events[i].fd == thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		process_fd(thread, events[i].fd,
			   (events[i].events & POLLIN) != 0,
			   (events[i].events & POLLOUT) != 0);
	}

	if (have_ctlevent) {
		/* Returns true when a shutdown message was received. */
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_SELECT)
/*
 * The select() variant returns void: the caller checks the control
 * pipe's fd_set bit itself and calls process_ctlfd() directly.
 */
static void
process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
	    fd_set *writefds) {
	int i;

	REQUIRE(maxfd <= (int)thread->manager->maxsocks);

	for (i = 0; i < maxfd; i++) {
		/* Skip both ends of the internal wakeup pipe. */
		if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
			continue;
		}
		process_fd(thread, i, FD_ISSET(i, readfds),
			   FD_ISSET(i, writefds));
	}
}
#endif /* ifdef USE_KQUEUE */
3273 
3274 static bool
process_ctlfd(isc__socketthread_t * thread)3275 process_ctlfd(isc__socketthread_t *thread) {
3276 	int msg, fd;
3277 
3278 	for (;;) {
3279 		select_readmsg(thread, &fd, &msg);
3280 
3281 		thread_log(thread, IOEVENT,
3282 			   "watcher got message %d for socket %d", msg, fd);
3283 
3284 		/*
3285 		 * Nothing to read?
3286 		 */
3287 		if (msg == SELECT_POKE_NOTHING) {
3288 			break;
3289 		}
3290 
3291 		/*
3292 		 * Handle shutdown message.  We really should
3293 		 * jump out of this loop right away, but
3294 		 * it doesn't matter if we have to do a little
3295 		 * more work first.
3296 		 */
3297 		if (msg == SELECT_POKE_SHUTDOWN) {
3298 			return (true);
3299 		}
3300 
3301 		/*
3302 		 * This is a wakeup on a socket.  Look
3303 		 * at the event queue for both read and write,
3304 		 * and decide if we need to watch on it now
3305 		 * or not.
3306 		 */
3307 		wakeup_socket(thread, fd, msg);
3308 	}
3309 
3310 	return (false);
3311 }
3312 
3313 /*
3314  * This is the thread that will loop forever, always in a select or poll
3315  * call.
3316  *
3317  * When select returns something to do, do whatever's necessary and post
3318  * an event to the task that was requesting the action.
3319  */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc_socketmgr_t *manager = thread->manager;
	(void)manager; /* only referenced in the USE_SELECT build */
	bool done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	/* Loop until process_ctlfd() reports a shutdown request. */
	while (!done) {
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			/* Two passes: retry once after refreshing open_max. */
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else  /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped.  Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			/* SOFT_ERROR (e.g. EINTR) just retries the wait. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}
3476 
/*
 * Record a number of file descriptors to hold in reserve.
 * NOTE(review): presumably fds kept back from socket creation -
 * confirm against the users of manager->reserved elsewhere in the file.
 */
void
isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}
3483 
/*
 * Record the manager-wide UDP size limit.
 * NOTE(review): presumably a cap on UDP datagram handling (0 = no
 * limit) - confirm against the users of manager->maxudp.
 */
void
isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
	REQUIRE(VALID_MANAGER(manager));

	manager->maxudp = maxudp;
}
3490 
3491 /*
3492  * Setup socket thread, thread->manager and thread->threadid must be filled.
3493  */
3494 
3495 static isc_result_t
setup_thread(isc__socketthread_t * thread)3496 setup_thread(isc__socketthread_t *thread) {
3497 	isc_result_t result = ISC_R_SUCCESS;
3498 	int i;
3499 	char strbuf[ISC_STRERRORSIZE];
3500 
3501 	REQUIRE(thread != NULL);
3502 	REQUIRE(VALID_MANAGER(thread->manager));
3503 	REQUIRE(thread->threadid >= 0 &&
3504 		thread->threadid < thread->manager->nthreads);
3505 
3506 	thread->fds =
3507 		isc_mem_get(thread->manager->mctx,
3508 			    thread->manager->maxsocks * sizeof(isc_socket_t *));
3509 
3510 	memset(thread->fds, 0,
3511 	       thread->manager->maxsocks * sizeof(isc_socket_t *));
3512 
3513 	thread->fdstate = isc_mem_get(thread->manager->mctx,
3514 				      thread->manager->maxsocks * sizeof(int));
3515 
3516 	memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
3517 
3518 	thread->fdlock = isc_mem_get(thread->manager->mctx,
3519 				     FDLOCK_COUNT * sizeof(isc_mutex_t));
3520 
3521 	for (i = 0; i < FDLOCK_COUNT; i++) {
3522 		isc_mutex_init(&thread->fdlock[i]);
3523 	}
3524 
3525 	if (pipe(thread->pipe_fds) != 0) {
3526 		strerror_r(errno, strbuf, sizeof(strbuf));
3527 		UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
3528 				 strbuf);
3529 		return (ISC_R_UNEXPECTED);
3530 	}
3531 	RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
3532 
3533 #ifdef USE_KQUEUE
3534 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3535 	thread->events = isc_mem_get(thread->manager->mctx,
3536 				     sizeof(struct kevent) * thread->nevents);
3537 
3538 	thread->kqueue_fd = kqueue();
3539 	if (thread->kqueue_fd == -1) {
3540 		result = isc__errno2result(errno);
3541 		strerror_r(errno, strbuf, sizeof(strbuf));
3542 		UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
3543 				 strbuf);
3544 		isc_mem_put(thread->manager->mctx, thread->events,
3545 			    sizeof(struct kevent) * thread->nevents);
3546 		return (result);
3547 	}
3548 
3549 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3550 	if (result != ISC_R_SUCCESS) {
3551 		close(thread->kqueue_fd);
3552 		isc_mem_put(thread->manager->mctx, thread->events,
3553 			    sizeof(struct kevent) * thread->nevents);
3554 	}
3555 	return (result);
3556 
3557 #elif defined(USE_EPOLL)
3558 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3559 	thread->epoll_events =
3560 		isc_mem_get(thread->manager->mctx,
3561 			    (thread->manager->maxsocks * sizeof(uint32_t)));
3562 
3563 	memset(thread->epoll_events, 0,
3564 	       thread->manager->maxsocks * sizeof(uint32_t));
3565 
3566 	thread->events =
3567 		isc_mem_get(thread->manager->mctx,
3568 			    sizeof(struct epoll_event) * thread->nevents);
3569 
3570 	thread->epoll_fd = epoll_create(thread->nevents);
3571 	if (thread->epoll_fd == -1) {
3572 		result = isc__errno2result(errno);
3573 		strerror_r(errno, strbuf, sizeof(strbuf));
3574 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
3575 				 strbuf);
3576 		return (result);
3577 	}
3578 
3579 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3580 	return (result);
3581 
3582 #elif defined(USE_DEVPOLL)
3583 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3584 	result = isc_resource_getcurlimit(isc_resource_openfiles,
3585 					  &thread->open_max);
3586 	if (result != ISC_R_SUCCESS) {
3587 		thread->open_max = 64;
3588 	}
3589 	thread->calls = 0;
3590 	thread->events = isc_mem_get(thread->manager->mctx,
3591 				     sizeof(struct pollfd) * thread->nevents);
3592 
3593 	/*
3594 	 * Note: fdpollinfo should be able to support all possible FDs, so
3595 	 * it must have maxsocks entries (not nevents).
3596 	 */
3597 	thread->fdpollinfo =
3598 		isc_mem_get(thread->manager->mctx,
3599 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3600 	memset(thread->fdpollinfo, 0,
3601 	       sizeof(pollinfo_t) * thread->manager->maxsocks);
3602 	thread->devpoll_fd = open("/dev/poll", O_RDWR);
3603 	if (thread->devpoll_fd == -1) {
3604 		result = isc__errno2result(errno);
3605 		strerror_r(errno, strbuf, sizeof(strbuf));
3606 		UNEXPECTED_ERROR(__FILE__, __LINE__,
3607 				 "open(/dev/poll) failed: %s", strbuf);
3608 		isc_mem_put(thread->manager->mctx, thread->events,
3609 			    sizeof(struct pollfd) * thread->nevents);
3610 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3611 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3612 		return (result);
3613 	}
3614 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3615 	if (result != ISC_R_SUCCESS) {
3616 		close(thread->devpoll_fd);
3617 		isc_mem_put(thread->manager->mctx, thread->events,
3618 			    sizeof(struct pollfd) * thread->nevents);
3619 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3620 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3621 		return (result);
3622 	}
3623 
3624 	return (ISC_R_SUCCESS);
3625 #elif defined(USE_SELECT)
3626 	UNUSED(result);
3627 
3628 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3629 	/*
3630 	 * Note: this code should also cover the case of MAXSOCKETS <=
3631 	 * FD_SETSIZE, but we separate the cases to avoid possible portability
3632 	 * issues regarding howmany() and the actual representation of fd_set.
3633 	 */
3634 	thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3635 			     sizeof(fd_mask);
3636 #else  /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3637 	thread->fd_bufsize = sizeof(fd_set);
3638 #endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3639 
3640 	thread->read_fds = isc_mem_get(thread->manager->mctx,
3641 				       thread->fd_bufsize);
3642 	thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
3643 					    thread->fd_bufsize);
3644 	thread->write_fds = isc_mem_get(thread->manager->mctx,
3645 					thread->fd_bufsize);
3646 	thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
3647 					     thread->fd_bufsize);
3648 	memset(thread->read_fds, 0, thread->fd_bufsize);
3649 	memset(thread->write_fds, 0, thread->fd_bufsize);
3650 
3651 	(void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3652 	thread->maxfd = thread->pipe_fds[0];
3653 
3654 	return (ISC_R_SUCCESS);
3655 #endif /* USE_KQUEUE */
3656 }
3657 
/*
 * Undo setup_thread(): release the polling backend, fd bookkeeping
 * arrays and fd locks.  The watcher thread must already have been
 * joined (see isc__socketmgr_destroy()).
 */
static void
cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
	isc_result_t result;
	int i;

	/* Stop watching the wakeup pipe before tearing the backend down. */
	result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		/*
		 * NOTE(review): this message names epoll_ctl() even on
		 * non-epoll backends - consider backend-neutral wording.
		 */
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
	}
#ifdef USE_KQUEUE
	close(thread->kqueue_fd);
	isc_mem_put(mctx, thread->events,
		    sizeof(struct kevent) * thread->nevents);
#elif defined(USE_EPOLL)
	close(thread->epoll_fd);

	isc_mem_put(mctx, thread->events,
		    sizeof(struct epoll_event) * thread->nevents);
#elif defined(USE_DEVPOLL)
	close(thread->devpoll_fd);
	isc_mem_put(mctx, thread->events,
		    sizeof(struct pollfd) * thread->nevents);
	isc_mem_put(mctx, thread->fdpollinfo,
		    sizeof(pollinfo_t) * thread->manager->maxsocks);
#elif defined(USE_SELECT)
	if (thread->read_fds != NULL) {
		isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
	}
	if (thread->read_fds_copy != NULL) {
		isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
	}
	if (thread->write_fds != NULL) {
		isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
	}
	if (thread->write_fds_copy != NULL) {
		isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
	}
#endif /* USE_KQUEUE */
	/* Close any fds whose teardown was still pending. */
	for (i = 0; i < (int)thread->manager->maxsocks; i++) {
		if (thread->fdstate[i] == CLOSE_PENDING) {
			/* no need to lock */
			(void)close(i);
		}
	}

#if defined(USE_EPOLL)
	isc_mem_put(thread->manager->mctx, thread->epoll_events,
		    thread->manager->maxsocks * sizeof(uint32_t));
#endif /* if defined(USE_EPOLL) */
	isc_mem_put(thread->manager->mctx, thread->fds,
		    thread->manager->maxsocks * sizeof(isc_socket_t *));
	isc_mem_put(thread->manager->mctx, thread->fdstate,
		    thread->manager->maxsocks * sizeof(int));

	for (i = 0; i < FDLOCK_COUNT; i++) {
		isc_mutex_destroy(&thread->fdlock[i]);
	}
	isc_mem_put(thread->manager->mctx, thread->fdlock,
		    FDLOCK_COUNT * sizeof(isc_mutex_t));
}
3718 
3719 isc_result_t
isc__socketmgr_create(isc_mem_t * mctx,isc_socketmgr_t ** managerp,unsigned int maxsocks,int nthreads)3720 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3721 		      unsigned int maxsocks, int nthreads) {
3722 	int i;
3723 	isc_socketmgr_t *manager;
3724 
3725 	REQUIRE(managerp != NULL && *managerp == NULL);
3726 
3727 	if (maxsocks == 0) {
3728 		maxsocks = ISC_SOCKET_MAXSOCKETS;
3729 	}
3730 
3731 	manager = isc_mem_get(mctx, sizeof(*manager));
3732 
3733 	/* zero-clear so that necessary cleanup on failure will be easy */
3734 	memset(manager, 0, sizeof(*manager));
3735 	manager->maxsocks = maxsocks;
3736 	manager->reserved = 0;
3737 	manager->maxudp = 0;
3738 	manager->nthreads = nthreads;
3739 	manager->stats = NULL;
3740 
3741 	manager->magic = SOCKET_MANAGER_MAGIC;
3742 	manager->mctx = NULL;
3743 	ISC_LIST_INIT(manager->socklist);
3744 	isc_mutex_init(&manager->lock);
3745 	isc_condition_init(&manager->shutdown_ok);
3746 
3747 	/*
3748 	 * Start up the select/poll thread.
3749 	 */
3750 	manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
3751 						     manager->nthreads);
3752 	isc_mem_attach(mctx, &manager->mctx);
3753 
3754 	for (i = 0; i < manager->nthreads; i++) {
3755 		manager->threads[i].manager = manager;
3756 		manager->threads[i].threadid = i;
3757 		setup_thread(&manager->threads[i]);
3758 		isc_thread_create(netthread, &manager->threads[i],
3759 				  &manager->threads[i].thread);
3760 		char tname[1024];
3761 		sprintf(tname, "isc-socket-%d", i);
3762 		isc_thread_setname(manager->threads[i].thread, tname);
3763 	}
3764 
3765 	*managerp = manager;
3766 
3767 	return (ISC_R_SUCCESS);
3768 }
3769 
/*
 * Report the manager's socket limit via '*nsockp'.
 * Always succeeds.
 */
isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	*nsockp = manager->maxsocks;

	return (ISC_R_SUCCESS);
}
3779 
/*
 * Attach a statistics counter set to the manager.  Must be called
 * before any sockets exist, at most once, and with a counter set sized
 * for the socket statistics indexes.
 */
void
isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	isc_stats_attach(stats, &manager->stats);
}
3789 
void
isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc_socketmgr_t *manager;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = *managerp;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.  destroy() signals
	 * shutdown_ok as sockets go away.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "sockets exist");
		WAIT(&manager->shutdown_ok, &manager->lock);
	}

	UNLOCK(&manager->lock);

	/*
	 * Poke each watcher thread with a shutdown message so its event
	 * loop (netthread) terminates.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
	}

	/*
	 * Wait for each thread to exit, then release its resources.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		isc_thread_join(manager->threads[i].thread, NULL);
		cleanup_thread(manager->mctx, &manager->threads[i]);
	}
	/*
	 * Clean up.
	 */
	isc_mem_put(manager->mctx, manager->threads,
		    sizeof(isc__socketthread_t) * manager->nthreads);
	(void)isc_condition_destroy(&manager->shutdown_ok);

	if (manager->stats != NULL) {
		isc_stats_detach(&manager->stats);
	}
	isc_mutex_destroy(&manager->lock);
	/* Clear the magic so stale pointers fail VALID_MANAGER(). */
	manager->magic = 0;
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));

	*managerp = NULL;
}
3846 
/*
 * Start or queue a receive on 'sock'.  If the read cannot complete now
 * (DOIO_SOFT), the request is queued on sock->recv_list and the watcher
 * is poked.  Otherwise the completion event is posted to 'task' unless
 * ISC_SOCKFLAG_IMMEDIATE is set, in which case the caller consumes the
 * result directly from 'dev'.
 */
static isc_result_t
socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	/*
	 * UDP reads are attempted immediately without the socket lock;
	 * stream reads must preserve ordering, so only try an immediate
	 * read when nothing is already queued.
	 */
	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->recv_list)) {
			io_state = doio_recv(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		/* IMMEDIATE callers get INPROGRESS instead of an event. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* IMMEDIATE callers read the result straight out of 'dev'. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
3923 
3924 isc_result_t
isc_socket_recv(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,void * arg)3925 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3926 		isc_task_t *task, isc_taskaction_t action, void *arg) {
3927 	isc_socketevent_t *dev;
3928 	isc_socketmgr_t *manager;
3929 
3930 	REQUIRE(VALID_SOCKET(sock));
3931 	REQUIRE(action != NULL);
3932 
3933 	manager = sock->manager;
3934 	REQUIRE(VALID_MANAGER(manager));
3935 
3936 	INSIST(sock->bound);
3937 
3938 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
3939 				   action, arg);
3940 	if (dev == NULL) {
3941 		return (ISC_R_NOMEMORY);
3942 	}
3943 
3944 	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
3945 }
3946 
3947 isc_result_t
isc_socket_recv2(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_socketevent_t * event,unsigned int flags)3948 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3949 		 isc_task_t *task, isc_socketevent_t *event,
3950 		 unsigned int flags) {
3951 	event->ev_sender = sock;
3952 	event->result = ISC_R_UNSET;
3953 	event->region = *region;
3954 	event->n = 0;
3955 	event->offset = 0;
3956 	event->attributes = 0;
3957 
3958 	/*
3959 	 * UDP sockets are always partial read.
3960 	 */
3961 	if (sock->type == isc_sockettype_udp) {
3962 		event->minimum = 1;
3963 	} else {
3964 		if (minimum == 0) {
3965 			event->minimum = region->length;
3966 		} else {
3967 			event->minimum = minimum;
3968 		}
3969 	}
3970 
3971 	return (socket_recv(sock, event, task, flags));
3972 }
3973 
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	/*
	 * Attempt an immediate send of 'dev' on 'sock'.  If the write could
	 * not complete now, queue the event on sock->send_list (unless
	 * ISC_SOCKFLAG_NORETRY is set) and poke the watcher thread so it
	 * starts polling the descriptor for writability.  The completion
	 * event is delivered to 'task'.
	 *
	 * Returns ISC_R_SUCCESS, or ISC_R_INPROGRESS when
	 * ISC_SOCKFLAG_IMMEDIATE was requested but the send was queued.
	 */
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp) {
		/* Datagram sends have no ordering to preserve: try now. */
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		/*
		 * For stream sockets, only attempt an immediate write when
		 * no earlier sends are still queued, so data stays in order.
		 */
		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/* FALLTHROUGH */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		/*
		 * With ISC_SOCKFLAG_IMMEDIATE the caller consumes the result
		 * synchronously, so no done event is posted here.
		 */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4075 
isc_result_t
isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		isc_taskaction_t action, void *arg) {
	/*
	 * Convenience wrapper: send with no explicit destination address and
	 * no IPv6 packet info (both NULL).
	 *
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
}
4084 
4085 isc_result_t
isc_socket_sendto(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo)4086 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4087 		  isc_taskaction_t action, void *arg,
4088 		  const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
4089 	isc_socketevent_t *dev;
4090 	isc_socketmgr_t *manager;
4091 
4092 	REQUIRE(VALID_SOCKET(sock));
4093 	REQUIRE(region != NULL);
4094 	REQUIRE(task != NULL);
4095 	REQUIRE(action != NULL);
4096 
4097 	manager = sock->manager;
4098 	REQUIRE(VALID_MANAGER(manager));
4099 
4100 	INSIST(sock->bound);
4101 
4102 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
4103 				   action, arg);
4104 	if (dev == NULL) {
4105 		return (ISC_R_NOMEMORY);
4106 	}
4107 
4108 	dev->region = *region;
4109 
4110 	return (socket_send(sock, dev, task, address, pktinfo, 0));
4111 }
4112 
isc_result_t
isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		   isc_socketevent_t *event, unsigned int flags) {
	/*
	 * Start a send using a caller-supplied (reusable) 'event' instead of
	 * allocating a fresh one.  Only ISC_SOCKFLAG_IMMEDIATE and
	 * ISC_SOCKFLAG_NORETRY are accepted, and NORETRY is restricted to
	 * UDP sockets.
	 */
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
		0);
	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
		REQUIRE(sock->type == isc_sockettype_udp);
	}
	/* Reset the event for this transfer; clear only the ATTACHED flag. */
	event->ev_sender = sock;
	event->result = ISC_R_UNSET;
	event->region = *region;
	event->n = 0;
	event->offset = 0;
	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;

	return (socket_send(sock, event, task, address, pktinfo, flags));
}
4132 
void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
	/*
	 * Remove a stale AF_UNIX socket file left behind by a previous run.
	 *
	 * If 'active' is true the caller owns the path: after sanity checks
	 * the file is unlinked unconditionally.  If 'active' is false, a
	 * probe connect() is made first, and the file is unlinked only when
	 * the probe shows no live server (ECONNREFUSED/ECONNRESET).
	 * Non-AF_UNIX addresses are ignored.
	 */
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		return;
	}

/* Provide S_ISSOCK/S_ISFIFO on platforms whose <sys/stat.h> lacks them. */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:
			if (active) { /* We exited cleanly last time */
				break;
			}
			/* FALLTHROUGH */
		default:
			/* Any other stat() failure is worth reporting. */
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
	} else {
		/* Refuse to unlink anything that is not a socket or FIFO. */
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
	}

	if (active) {
		/* We own the path: remove it unconditionally. */
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/* Not active: probe the path to see whether a server still listens. */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			/* Nobody is listening: the file is stale, remove it. */
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
	close(s);
}
4248 
4249 isc_result_t
isc_socket_permunix(const isc_sockaddr_t * sockaddr,uint32_t perm,uint32_t owner,uint32_t group)4250 isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
4251 		    uint32_t owner, uint32_t group) {
4252 	isc_result_t result = ISC_R_SUCCESS;
4253 	char strbuf[ISC_STRERRORSIZE];
4254 	char path[sizeof(sockaddr->type.sunix.sun_path)];
4255 #ifdef NEED_SECURE_DIRECTORY
4256 	char *slash;
4257 #endif /* ifdef NEED_SECURE_DIRECTORY */
4258 
4259 	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4260 	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4261 	strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
4262 
4263 #ifdef NEED_SECURE_DIRECTORY
4264 	slash = strrchr(path, '/');
4265 	if (slash != NULL) {
4266 		if (slash != path) {
4267 			*slash = '\0';
4268 		} else {
4269 			strlcpy(path, "/", sizeof(path));
4270 		}
4271 	} else {
4272 		strlcpy(path, ".", sizeof(path));
4273 	}
4274 #endif /* ifdef NEED_SECURE_DIRECTORY */
4275 
4276 	if (chmod(path, perm) < 0) {
4277 		strerror_r(errno, strbuf, sizeof(strbuf));
4278 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4279 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4280 			      "isc_socket_permunix: chmod(%s, %d): %s", path,
4281 			      perm, strbuf);
4282 		result = ISC_R_FAILURE;
4283 	}
4284 	if (chown(path, owner, group) < 0) {
4285 		strerror_r(errno, strbuf, sizeof(strbuf));
4286 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4287 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4288 			      "isc_socket_permunix: chown(%s, %d, %d): %s",
4289 			      path, owner, group, strbuf);
4290 		result = ISC_R_FAILURE;
4291 	}
4292 	return (result);
4293 }
4294 
isc_result_t
isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
		isc_socket_options_t options) {
	/*
	 * Bind 'sock' to 'sockaddr'.  When ISC_SOCKET_REUSEADDRESS is
	 * requested and a specific (non-zero) port is given, SO_REUSEADDR
	 * (and, on FreeBSD/Linux, a load-balancing SO_REUSEPORT variant) is
	 * enabled first.  AF_UNIX sockets skip all socket options.
	 *
	 * Returns ISC_R_SUCCESS, ISC_R_FAMILYMISMATCH, or a result mapped
	 * from the bind(2) errno.
	 */
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	INSIST(!sock->bound);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	if (sock->pf == AF_UNIX) {
		goto bind_socket;
	}
#endif /* ifdef AF_UNIX */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
	{
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
			       sizeof(on)) < 0) {
			/* Non-fatal: log and continue to bind anyway. */
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
			       (void *)&on, sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#elif defined(__linux__) && defined(SO_REUSEPORT)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#endif		/* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
		/* Press on... */
	}
#ifdef AF_UNIX
bind_socket:
#endif /* ifdef AF_UNIX */
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		/* Map common bind(2) failures to ISC result codes. */
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4375 
4376 /*
4377  * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
4379  * disabled by default.  See RT22589 for details.
4380  */
4381 #undef ENABLE_ACCEPTFILTER
4382 
isc_result_t
isc_socket_filter(isc_socket_t *sock, const char *filter) {
	/*
	 * Install a BSD accept filter named 'filter' on a listening socket.
	 *
	 * Compiled out unless both SO_ACCEPTFILTER and ENABLE_ACCEPTFILTER
	 * are defined (ENABLE_ACCEPTFILTER is currently #undef'd above due
	 * to OS bugs; see RT22589), in which case ISC_R_NOTIMPLEMENTED is
	 * returned.
	 */
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	UNUSED(sock);
	UNUSED(filter);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	bzero(&afa, sizeof(afa));
	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
		       sizeof(afa)) == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION,
			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	return (ISC_R_NOTIMPLEMENTED);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
}
4410 
4411 /*
4412  * Try enabling TCP Fast Open for a given socket if the OS supports it.
4413  */
static void
set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
	/*
	 * Try enabling TCP Fast Open for 'sock' if the OS supports it.
	 * The TCP_FASTOPEN option value is a queue length derived from
	 * 'backlog' (except on macOS, where it is a boolean-like 1).
	 * All failures are logged but otherwise ignored.
	 */
#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
	char strbuf[ISC_STRERRORSIZE];

/*
 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while also
 * shipping a default kernel without TFO support, so we special-case it by
 * performing an additional runtime check for TFO support using sysctl to
 * prevent setsockopt() errors from being logged.
 */
#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
	unsigned int enabled;
	size_t enabledlen = sizeof(enabled);
	static bool tfo_notice_logged = false;

	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
		/*
		 * This kernel does not support TCP Fast Open.  There is
		 * nothing more we can do.
		 */
		return;
	} else if (enabled == 0) {
		/*
		 * This kernel does support TCP Fast Open, but it is disabled
		 * by sysctl.  Notify the user, but do not nag.
		 */
		if (!tfo_notice_logged) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
				      "TCP_FASTOPEN support is disabled by "
				      "sysctl (" SYSCTL_TFO " = 0)");
			tfo_notice_logged = true;
		}
		return;
	}
#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */

#ifdef __APPLE__
	backlog = 1;
#else  /* ifdef __APPLE__ */
	/* Use half the listen backlog as the TFO queue length, minimum 1. */
	backlog = backlog / 2;
	if (backlog == 0) {
		backlog = 1;
	}
#endif /* ifdef __APPLE__ */
	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
		       sizeof(backlog)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
				 sock->fd, strbuf);
		/* TCP_FASTOPEN is experimental so ignore failures */
	}
#else  /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
	UNUSED(sock);
	UNUSED(backlog);
#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
}
4475 
4476 /*
4477  * Set up to listen on a given socket.  We do this by creating an internal
4478  * event that will be dispatched when the socket has read activity.  The
4479  * watcher will send the internal event to the task when there is a new
4480  * connection.
4481  *
4482  * Unlike in read, we don't preallocate a done event here.  Every time there
4483  * is a new connection we'll have to allocate a new one anyway, so we might
4484  * as well keep things simple rather than having to track them.
4485  */
4486 isc_result_t
isc_socket_listen(isc_socket_t * sock,unsigned int backlog)4487 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4488 	char strbuf[ISC_STRERRORSIZE];
4489 
4490 	REQUIRE(VALID_SOCKET(sock));
4491 
4492 	LOCK(&sock->lock);
4493 
4494 	REQUIRE(!sock->listener);
4495 	REQUIRE(sock->bound);
4496 	REQUIRE(sock->type == isc_sockettype_tcp ||
4497 		sock->type == isc_sockettype_unix);
4498 
4499 	if (backlog == 0) {
4500 		backlog = SOMAXCONN;
4501 	}
4502 
4503 	if (listen(sock->fd, (int)backlog) < 0) {
4504 		UNLOCK(&sock->lock);
4505 		strerror_r(errno, strbuf, sizeof(strbuf));
4506 
4507 		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4508 
4509 		return (ISC_R_UNEXPECTED);
4510 	}
4511 
4512 	set_tcp_fastopen(sock, backlog);
4513 
4514 	sock->listener = 1;
4515 
4516 	UNLOCK(&sock->lock);
4517 	return (ISC_R_SUCCESS);
4518 }
4519 
4520 /*
4521  * This should try to do aggressive accept() XXXMLG
4522  */
isc_result_t
isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
		  void *arg) {
	/*
	 * Queue an accept on listening socket 'sock'.  A NEWCONN event and a
	 * pre-allocated child socket are created up front; when a connection
	 * arrives, the watcher fills in the child socket and delivers the
	 * event to 'task'.
	 */
	isc_socket_newconnev_t *dev;
	isc_socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc_socket_t *nsock;
	isc_result_t result;
	bool do_poke = false;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)isc_event_allocate(
		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		/* Task is shutting down: undo everything and bail out. */
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	isc_refcount_increment0(&nsock->references);
	/* Child inherits the listener's statistics counters. */
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	do_poke = ISC_LIST_EMPTY(sock->accept_list);
	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
	if (do_poke) {
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_ACCEPT);
	}
	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4589 
4590 isc_result_t
isc_socket_connect(isc_socket_t * sock,const isc_sockaddr_t * addr,isc_task_t * task,isc_taskaction_t action,void * arg)4591 isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
4592 		   isc_task_t *task, isc_taskaction_t action, void *arg) {
4593 	isc_socket_connev_t *dev;
4594 	isc_task_t *ntask = NULL;
4595 	isc_socketmgr_t *manager;
4596 	int cc;
4597 	char strbuf[ISC_STRERRORSIZE];
4598 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
4599 
4600 	REQUIRE(VALID_SOCKET(sock));
4601 	REQUIRE(addr != NULL);
4602 	REQUIRE(task != NULL);
4603 	REQUIRE(action != NULL);
4604 
4605 	manager = sock->manager;
4606 	REQUIRE(VALID_MANAGER(manager));
4607 	REQUIRE(addr != NULL);
4608 
4609 	if (isc_sockaddr_ismulticast(addr)) {
4610 		return (ISC_R_MULTICAST);
4611 	}
4612 
4613 	LOCK(&sock->lock);
4614 
4615 	dev = (isc_socket_connev_t *)isc_event_allocate(
4616 		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
4617 		sizeof(*dev));
4618 	ISC_LINK_INIT(dev, ev_link);
4619 
4620 	if (sock->connecting) {
4621 		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
4622 		goto queue;
4623 	}
4624 
4625 	if (sock->connected) {
4626 		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
4627 		dev->result = ISC_R_SUCCESS;
4628 		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4629 
4630 		UNLOCK(&sock->lock);
4631 
4632 		return (ISC_R_SUCCESS);
4633 	}
4634 
4635 	/*
4636 	 * Try to do the connect right away, as there can be only one
4637 	 * outstanding, and it might happen to complete.
4638 	 */
4639 	sock->peer_address = *addr;
4640 	cc = connect(sock->fd, &addr->type.sa, addr->length);
4641 	if (cc < 0) {
4642 		/*
4643 		 * The socket is nonblocking and the connection cannot be
4644 		 * completed immediately.  It is possible to select(2) or
4645 		 * poll(2) for completion by selecting the socket for writing.
4646 		 * After select(2) indicates writability, use getsockopt(2) to
4647 		 * read the SO_ERROR option at level SOL_SOCKET to determine
4648 		 * whether connect() completed successfully (SO_ERROR is zero)
4649 		 * or unsuccessfully (SO_ERROR is one of the usual error codes
4650 		 * listed here, explaining the reason for the failure).
4651 		 */
4652 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4653 			cc = 0;
4654 			goto success;
4655 		}
4656 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4657 			goto queue;
4658 		}
4659 
4660 		switch (errno) {
4661 #define ERROR_MATCH(a, b)        \
4662 	case a:                  \
4663 		dev->result = b; \
4664 		goto err_exit;
4665 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
4666 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4667 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4668 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4669 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4670 #ifdef EHOSTDOWN
4671 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4672 #endif /* ifdef EHOSTDOWN */
4673 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4674 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4675 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4676 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4677 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4678 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4679 #undef ERROR_MATCH
4680 		}
4681 
4682 		sock->connected = 0;
4683 
4684 		strerror_r(errno, strbuf, sizeof(strbuf));
4685 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
4686 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
4687 				 addrbuf, errno, strbuf);
4688 
4689 		UNLOCK(&sock->lock);
4690 		inc_stats(sock->manager->stats,
4691 			  sock->statsindex[STATID_CONNECTFAIL]);
4692 		isc_event_free(ISC_EVENT_PTR(&dev));
4693 		return (ISC_R_UNEXPECTED);
4694 
4695 	err_exit:
4696 		sock->connected = 0;
4697 		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4698 
4699 		UNLOCK(&sock->lock);
4700 		inc_stats(sock->manager->stats,
4701 			  sock->statsindex[STATID_CONNECTFAIL]);
4702 		return (ISC_R_SUCCESS);
4703 	}
4704 
4705 	/*
4706 	 * If connect completed, fire off the done event.
4707 	 */
4708 success:
4709 	if (cc == 0) {
4710 		sock->connected = 1;
4711 		sock->bound = 1;
4712 		dev->result = ISC_R_SUCCESS;
4713 		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4714 
4715 		UNLOCK(&sock->lock);
4716 
4717 		inc_stats(sock->manager->stats,
4718 			  sock->statsindex[STATID_CONNECT]);
4719 
4720 		return (ISC_R_SUCCESS);
4721 	}
4722 
4723 queue:
4724 
4725 	/*
4726 	 * Attach to task.
4727 	 */
4728 	isc_task_attach(task, &ntask);
4729 
4730 	dev->ev_sender = ntask;
4731 
4732 	/*
4733 	 * Poke watcher here.  We still have the socket locked, so there
4734 	 * is no race condition.  We will keep the lock for such a short
4735 	 * bit of time waking it up now or later won't matter all that much.
4736 	 */
4737 	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
4738 	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
4739 	if (do_poke && !sock->connecting) {
4740 		sock->connecting = 1;
4741 		select_poke(manager, sock->threadid, sock->fd,
4742 			    SELECT_POKE_CONNECT);
4743 	}
4744 
4745 	UNLOCK(&sock->lock);
4746 	return (ISC_R_SUCCESS);
4747 }
4748 
4749 /*
4750  * Called when a socket with a pending connect() finishes.
4751  */
4752 static void
internal_connect(isc_socket_t * sock)4753 internal_connect(isc_socket_t *sock) {
4754 	isc_socket_connev_t *dev;
4755 	int cc;
4756 	isc_result_t result;
4757 	socklen_t optlen;
4758 	char strbuf[ISC_STRERRORSIZE];
4759 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
4760 
4761 	INSIST(VALID_SOCKET(sock));
4762 	REQUIRE(sock->fd >= 0);
4763 
4764 	/*
4765 	 * Get the first item off the connect list.
4766 	 * If it is empty, unlock the socket and return.
4767 	 */
4768 	dev = ISC_LIST_HEAD(sock->connect_list);
4769 	if (dev == NULL) {
4770 		INSIST(!sock->connecting);
4771 		goto finish;
4772 	}
4773 
4774 	INSIST(sock->connecting);
4775 	sock->connecting = 0;
4776 
4777 	/*
4778 	 * Get any possible error status here.
4779 	 */
4780 	optlen = sizeof(cc);
4781 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
4782 		       (void *)&optlen) != 0)
4783 	{
4784 		cc = errno;
4785 	} else {
4786 		errno = cc;
4787 	}
4788 
4789 	if (errno != 0) {
4790 		/*
4791 		 * If the error is EAGAIN, just re-select on this
4792 		 * fd and pretend nothing strange happened.
4793 		 */
4794 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4795 			sock->connecting = 1;
4796 			return;
4797 		}
4798 
4799 		inc_stats(sock->manager->stats,
4800 			  sock->statsindex[STATID_CONNECTFAIL]);
4801 
4802 		/*
4803 		 * Translate other errors into ISC_R_* flavors.
4804 		 */
4805 		switch (errno) {
4806 #define ERROR_MATCH(a, b)   \
4807 	case a:             \
4808 		result = b; \
4809 		break;
4810 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
4811 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4812 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4813 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4814 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4815 #ifdef EHOSTDOWN
4816 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4817 #endif /* ifdef EHOSTDOWN */
4818 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4819 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4820 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4821 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4822 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4823 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4824 #undef ERROR_MATCH
4825 		default:
4826 			result = ISC_R_UNEXPECTED;
4827 			isc_sockaddr_format(&sock->peer_address, peerbuf,
4828 					    sizeof(peerbuf));
4829 			strerror_r(errno, strbuf, sizeof(strbuf));
4830 			UNEXPECTED_ERROR(__FILE__, __LINE__,
4831 					 "internal_connect: connect(%s) %s",
4832 					 peerbuf, strbuf);
4833 		}
4834 	} else {
4835 		inc_stats(sock->manager->stats,
4836 			  sock->statsindex[STATID_CONNECT]);
4837 		result = ISC_R_SUCCESS;
4838 		sock->connected = 1;
4839 		sock->bound = 1;
4840 	}
4841 
4842 	do {
4843 		dev->result = result;
4844 		send_connectdone_event(sock, &dev);
4845 		dev = ISC_LIST_HEAD(sock->connect_list);
4846 	} while (dev != NULL);
4847 
4848 finish:
4849 	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
4850 		   SELECT_POKE_CONNECT);
4851 }
4852 
4853 isc_result_t
isc_socket_getpeername(isc_socket_t * sock,isc_sockaddr_t * addressp)4854 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4855 	isc_result_t result;
4856 
4857 	REQUIRE(VALID_SOCKET(sock));
4858 	REQUIRE(addressp != NULL);
4859 
4860 	LOCK(&sock->lock);
4861 
4862 	if (sock->connected) {
4863 		*addressp = sock->peer_address;
4864 		result = ISC_R_SUCCESS;
4865 	} else {
4866 		result = ISC_R_NOTCONNECTED;
4867 	}
4868 
4869 	UNLOCK(&sock->lock);
4870 
4871 	return (result);
4872 }
4873 
4874 isc_result_t
isc_socket_getsockname(isc_socket_t * sock,isc_sockaddr_t * addressp)4875 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4876 	socklen_t len;
4877 	isc_result_t result;
4878 	char strbuf[ISC_STRERRORSIZE];
4879 
4880 	REQUIRE(VALID_SOCKET(sock));
4881 	REQUIRE(addressp != NULL);
4882 
4883 	LOCK(&sock->lock);
4884 
4885 	if (!sock->bound) {
4886 		result = ISC_R_NOTBOUND;
4887 		goto out;
4888 	}
4889 
4890 	result = ISC_R_SUCCESS;
4891 
4892 	len = sizeof(addressp->type);
4893 	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
4894 		strerror_r(errno, strbuf, sizeof(strbuf));
4895 		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
4896 		result = ISC_R_UNEXPECTED;
4897 		goto out;
4898 	}
4899 	addressp->length = (unsigned int)len;
4900 
4901 out:
4902 	UNLOCK(&sock->lock);
4903 
4904 	return (result);
4905 }
4906 
4907 /*
4908  * Run through the list of events on this socket, and cancel the ones
4909  * queued for task "task" of type "how".  "how" is a bitmask.
4910  */
4911 void
isc_socket_cancel(isc_socket_t * sock,isc_task_t * task,unsigned int how)4912 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
4913 	REQUIRE(VALID_SOCKET(sock));
4914 
4915 	/*
4916 	 * Quick exit if there is nothing to do.  Don't even bother locking
4917 	 * in this case.
4918 	 */
4919 	if (how == 0) {
4920 		return;
4921 	}
4922 
4923 	LOCK(&sock->lock);
4924 
4925 	/*
4926 	 * All of these do the same thing, more or less.
4927 	 * Each will:
4928 	 *	o If the internal event is marked as "posted" try to
4929 	 *	  remove it from the task's queue.  If this fails, mark it
4930 	 *	  as canceled instead, and let the task clean it up later.
4931 	 *	o For each I/O request for that task of that type, post
4932 	 *	  its done event with status of "ISC_R_CANCELED".
4933 	 *	o Reset any state needed.
4934 	 */
4935 	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
4936 	    !ISC_LIST_EMPTY(sock->recv_list)) {
4937 		isc_socketevent_t *dev;
4938 		isc_socketevent_t *next;
4939 		isc_task_t *current_task;
4940 
4941 		dev = ISC_LIST_HEAD(sock->recv_list);
4942 
4943 		while (dev != NULL) {
4944 			current_task = dev->ev_sender;
4945 			next = ISC_LIST_NEXT(dev, ev_link);
4946 
4947 			if ((task == NULL) || (task == current_task)) {
4948 				dev->result = ISC_R_CANCELED;
4949 				send_recvdone_event(sock, &dev);
4950 			}
4951 			dev = next;
4952 		}
4953 	}
4954 
4955 	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
4956 	    !ISC_LIST_EMPTY(sock->send_list)) {
4957 		isc_socketevent_t *dev;
4958 		isc_socketevent_t *next;
4959 		isc_task_t *current_task;
4960 
4961 		dev = ISC_LIST_HEAD(sock->send_list);
4962 
4963 		while (dev != NULL) {
4964 			current_task = dev->ev_sender;
4965 			next = ISC_LIST_NEXT(dev, ev_link);
4966 
4967 			if ((task == NULL) || (task == current_task)) {
4968 				dev->result = ISC_R_CANCELED;
4969 				send_senddone_event(sock, &dev);
4970 			}
4971 			dev = next;
4972 		}
4973 	}
4974 
4975 	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
4976 	    !ISC_LIST_EMPTY(sock->accept_list)) {
4977 		isc_socket_newconnev_t *dev;
4978 		isc_socket_newconnev_t *next;
4979 		isc_task_t *current_task;
4980 
4981 		dev = ISC_LIST_HEAD(sock->accept_list);
4982 		while (dev != NULL) {
4983 			current_task = dev->ev_sender;
4984 			next = ISC_LIST_NEXT(dev, ev_link);
4985 
4986 			if ((task == NULL) || (task == current_task)) {
4987 				ISC_LIST_UNLINK(sock->accept_list, dev,
4988 						ev_link);
4989 
4990 				isc_refcount_decrementz(
4991 					&NEWCONNSOCK(dev)->references);
4992 				free_socket((isc_socket_t **)&dev->newsocket);
4993 
4994 				dev->result = ISC_R_CANCELED;
4995 				dev->ev_sender = sock;
4996 				isc_task_sendtoanddetach(&current_task,
4997 							 ISC_EVENT_PTR(&dev),
4998 							 sock->threadid);
4999 			}
5000 
5001 			dev = next;
5002 		}
5003 	}
5004 
5005 	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
5006 	    !ISC_LIST_EMPTY(sock->connect_list))
5007 	{
5008 		isc_socket_connev_t *dev;
5009 		isc_socket_connev_t *next;
5010 		isc_task_t *current_task;
5011 
5012 		INSIST(sock->connecting);
5013 		sock->connecting = 0;
5014 
5015 		dev = ISC_LIST_HEAD(sock->connect_list);
5016 
5017 		while (dev != NULL) {
5018 			current_task = dev->ev_sender;
5019 			next = ISC_LIST_NEXT(dev, ev_link);
5020 
5021 			if ((task == NULL) || (task == current_task)) {
5022 				dev->result = ISC_R_CANCELED;
5023 				send_connectdone_event(sock, &dev);
5024 			}
5025 			dev = next;
5026 		}
5027 	}
5028 
5029 	UNLOCK(&sock->lock);
5030 }
5031 
5032 isc_sockettype_t
isc_socket_gettype(isc_socket_t * sock)5033 isc_socket_gettype(isc_socket_t *sock) {
5034 	REQUIRE(VALID_SOCKET(sock));
5035 
5036 	return (sock->type);
5037 }
5038 
/*
 * Enable or disable IPV6_V6ONLY on an AF_INET6 socket.  A no-op when
 * the platform lacks IPV6_V6ONLY or the socket is not IPv6; a
 * setsockopt() failure is logged but otherwise ignored.
 */
void
isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
#if defined(IPV6_V6ONLY)
	int onoff = yes ? 1 : 0;
#else  /* if defined(IPV6_V6ONLY) */
	UNUSED(yes);
	UNUSED(sock);
#endif /* if defined(IPV6_V6ONLY) */

	REQUIRE(VALID_SOCKET(sock));

#ifdef IPV6_V6ONLY
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
	}
#endif /* ifdef IPV6_V6ONLY */
}
5065 
5066 static void
setdscp(isc_socket_t * sock,isc_dscp_t dscp)5067 setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
5068 #if defined(IP_TOS) || defined(IPV6_TCLASS)
5069 	int value = dscp << 2;
5070 #endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
5071 
5072 	sock->dscp = dscp;
5073 
5074 #ifdef IP_TOS
5075 	if (sock->pf == AF_INET) {
5076 		if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
5077 			       sizeof(value)) < 0) {
5078 			char strbuf[ISC_STRERRORSIZE];
5079 			strerror_r(errno, strbuf, sizeof(strbuf));
5080 			UNEXPECTED_ERROR(__FILE__, __LINE__,
5081 					 "setsockopt(%d, IP_TOS, %.02x) "
5082 					 "failed: %s",
5083 					 sock->fd, value >> 2, strbuf);
5084 		}
5085 	}
5086 #endif /* ifdef IP_TOS */
5087 #ifdef IPV6_TCLASS
5088 	if (sock->pf == AF_INET6) {
5089 		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
5090 			       (void *)&value, sizeof(value)) < 0)
5091 		{
5092 			char strbuf[ISC_STRERRORSIZE];
5093 			strerror_r(errno, strbuf, sizeof(strbuf));
5094 			UNEXPECTED_ERROR(__FILE__, __LINE__,
5095 					 "setsockopt(%d, IPV6_TCLASS, %.02x) "
5096 					 "failed: %s",
5097 					 sock->fd, dscp >> 2, strbuf);
5098 		}
5099 	}
5100 #endif /* ifdef IPV6_TCLASS */
5101 }
5102 
/*
 * Public entry point for setting the DSCP value of 'sock'.
 * 'dscp' must be below 0x40 (six-bit code); a negative value is a
 * no-op on platforms that support TOS/TCLASS.
 */
void
isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40);

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else  /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

	setdscp(sock, dscp);
}
5123 
5124 isc_socketevent_t *
isc_socket_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)5125 isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
5126 		       isc_taskaction_t action, void *arg) {
5127 	return (allocate_socketevent(mctx, sender, eventtype, action, arg));
5128 }
5129 
5130 void
isc_socket_setname(isc_socket_t * sock,const char * name,void * tag)5131 isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
5132 	/*
5133 	 * Name 'sock'.
5134 	 */
5135 
5136 	REQUIRE(VALID_SOCKET(sock));
5137 
5138 	LOCK(&sock->lock);
5139 	strlcpy(sock->name, name, sizeof(sock->name));
5140 	sock->tag = tag;
5141 	UNLOCK(&sock->lock);
5142 }
5143 
5144 const char *
isc_socket_getname(isc_socket_t * sock)5145 isc_socket_getname(isc_socket_t *sock) {
5146 	return (sock->name);
5147 }
5148 
5149 void *
isc_socket_gettag(isc_socket_t * sock)5150 isc_socket_gettag(isc_socket_t *sock) {
5151 	return (sock->tag);
5152 }
5153 
5154 int
isc_socket_getfd(isc_socket_t * sock)5155 isc_socket_getfd(isc_socket_t *sock) {
5156 	return ((short)sock->fd);
5157 }
5158 
/* One-time probe state: does this platform support usable SO_REUSEPORT? */
static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
static bool hasreuseport = false;

/*
 * Probe whether SO_REUSEPORT (Linux) / SO_REUSEPORT_LB (FreeBSD) can
 * actually be set, using a throwaway UDP socket.  Sets 'hasreuseport'
 * on success; leaves it false on any failure.
 */
static void
init_hasreuseport(void) {
/*
 * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
 * We only want to use it on Linux, if it's available.
 */
#if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
	int sock, yes = 1;
	/* Try an IPv4 probe socket first, fall back to IPv6. */
	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock < 0) {
		sock = socket(AF_INET6, SOCK_DGRAM, 0);
		if (sock < 0) {
			return;
		}
	}
	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
		       sizeof(yes)) < 0) {
		close(sock);
		return;
#if defined(__FreeBSD_kernel__)
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
			      sizeof(yes)) < 0)
#else  /* if defined(__FreeBSD_kernel__) */
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
			      sizeof(yes)) < 0)
#endif /* if defined(__FreeBSD_kernel__) */
	{
		close(sock);
		return;
	}
	hasreuseport = true;
	close(sock);
#endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
}
5198 
5199 bool
isc_socket_hasreuseport(void)5200 isc_socket_hasreuseport(void) {
5201 	RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
5202 		      ISC_R_SUCCESS);
5203 	return (hasreuseport);
5204 }
5205 
5206 #if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
5207 static const char *
_socktype(isc_sockettype_t type)5208 _socktype(isc_sockettype_t type) {
5209 	switch (type) {
5210 	case isc_sockettype_udp:
5211 		return ("udp");
5212 	case isc_sockettype_tcp:
5213 		return ("tcp");
5214 	case isc_sockettype_unix:
5215 		return ("unix");
5216 	default:
5217 		return ("not-initialized");
5218 	}
5219 }
5220 #endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
5221 
#ifdef HAVE_LIBXML2
/* Jump to the error label on any negative xmlTextWriter return code. */
#define TRY0(a)                     \
	do {                        \
		xmlrc = (a);        \
		if (xmlrc < 0)      \
			goto error; \
	} while (0)
/*
 * Render the manager's socket list as XML statistics into 'writer0'
 * (an xmlTextWriterPtr).  Returns the last xmlTextWriter return code
 * (negative on failure).  Holds mgr->lock for the duration and each
 * socket's lock while that socket is being rendered.
 */
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	int xmlrc;
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		/*
		 * Lock before the first TRY0 so the error path (which
		 * unlocks when sock != NULL) stays balanced.
		 */
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR peerbuf));
		}

		/* Local address omitted if getsockname() fails. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR peerbuf));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	/* A non-NULL sock here means we bailed out with its lock held. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
#endif /* HAVE_LIBXML2 */
5326 
5327 #ifdef HAVE_JSON_C
5328 #define CHECKMEM(m)                              \
5329 	do {                                     \
5330 		if (m == NULL) {                 \
5331 			result = ISC_R_NOMEMORY; \
5332 			goto error;              \
5333 		}                                \
5334 	} while (0)
5335 
5336 isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t * mgr,void * stats0)5337 isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
5338 	isc_result_t result = ISC_R_SUCCESS;
5339 	isc_socket_t *sock = NULL;
5340 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5341 	isc_sockaddr_t addr;
5342 	socklen_t len;
5343 	json_object *obj, *array = json_object_new_array();
5344 	json_object *stats = (json_object *)stats0;
5345 
5346 	CHECKMEM(array);
5347 
5348 	LOCK(&mgr->lock);
5349 
5350 	sock = ISC_LIST_HEAD(mgr->socklist);
5351 	while (sock != NULL) {
5352 		json_object *states, *entry = json_object_new_object();
5353 		char buf[255];
5354 
5355 		CHECKMEM(entry);
5356 		json_object_array_add(array, entry);
5357 
5358 		LOCK(&sock->lock);
5359 
5360 		snprintf(buf, sizeof(buf), "%p", sock);
5361 		obj = json_object_new_string(buf);
5362 		CHECKMEM(obj);
5363 		json_object_object_add(entry, "id", obj);
5364 
5365 		if (sock->name[0] != 0) {
5366 			obj = json_object_new_string(sock->name);
5367 			CHECKMEM(obj);
5368 			json_object_object_add(entry, "name", obj);
5369 		}
5370 
5371 		obj = json_object_new_int(
5372 			(int)isc_refcount_current(&sock->references));
5373 		CHECKMEM(obj);
5374 		json_object_object_add(entry, "references", obj);
5375 
5376 		obj = json_object_new_string(_socktype(sock->type));
5377 		CHECKMEM(obj);
5378 		json_object_object_add(entry, "type", obj);
5379 
5380 		if (sock->connected) {
5381 			isc_sockaddr_format(&sock->peer_address, peerbuf,
5382 					    sizeof(peerbuf));
5383 			obj = json_object_new_string(peerbuf);
5384 			CHECKMEM(obj);
5385 			json_object_object_add(entry, "peer-address", obj);
5386 		}
5387 
5388 		len = sizeof(addr);
5389 		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5390 			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5391 			obj = json_object_new_string(peerbuf);
5392 			CHECKMEM(obj);
5393 			json_object_object_add(entry, "local-address", obj);
5394 		}
5395 
5396 		states = json_object_new_array();
5397 		CHECKMEM(states);
5398 		json_object_object_add(entry, "states", states);
5399 
5400 		if (sock->listener) {
5401 			obj = json_object_new_string("listener");
5402 			CHECKMEM(obj);
5403 			json_object_array_add(states, obj);
5404 		}
5405 
5406 		if (sock->connected) {
5407 			obj = json_object_new_string("connected");
5408 			CHECKMEM(obj);
5409 			json_object_array_add(states, obj);
5410 		}
5411 
5412 		if (sock->connecting) {
5413 			obj = json_object_new_string("connecting");
5414 			CHECKMEM(obj);
5415 			json_object_array_add(states, obj);
5416 		}
5417 
5418 		if (sock->bound) {
5419 			obj = json_object_new_string("bound");
5420 			CHECKMEM(obj);
5421 			json_object_array_add(states, obj);
5422 		}
5423 
5424 		UNLOCK(&sock->lock);
5425 		sock = ISC_LIST_NEXT(sock, link);
5426 	}
5427 
5428 	json_object_object_add(stats, "sockets", array);
5429 	array = NULL;
5430 	result = ISC_R_SUCCESS;
5431 
5432 error:
5433 	if (array != NULL) {
5434 		json_object_put(array);
5435 	}
5436 
5437 	if (sock != NULL) {
5438 		UNLOCK(&sock->lock);
5439 	}
5440 
5441 	UNLOCK(&mgr->lock);
5442 
5443 	return (result);
5444 }
5445 #endif /* HAVE_JSON_C */
5446