1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * SPDX-License-Identifier: MPL-2.0
5  *
6  * This Source Code Form is subject to the terms of the Mozilla Public
7  * License, v. 2.0.  If a copy of the MPL was not distributed with this
8  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
9  *
10  * See the COPYRIGHT file distributed with this work for additional
11  * information regarding copyright ownership.
12  */
13 
14 /*! \file */
15 
16 #include <inttypes.h>
17 #include <stdbool.h>
18 #include <sys/param.h>
19 #include <sys/socket.h>
20 #include <sys/stat.h>
21 #include <sys/types.h>
22 #if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
23 #include <sys/sysctl.h>
24 #endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
25 #include <sys/time.h>
26 #include <sys/uio.h>
27 
28 #if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
29 #include <linux/netlink.h>
30 #include <linux/rtnetlink.h>
31 #endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
32 	*/
33 
34 #include <errno.h>
35 #include <fcntl.h>
36 #include <stddef.h>
37 #include <stdlib.h>
38 #include <unistd.h>
39 
40 #include <isc/app.h>
41 #include <isc/buffer.h>
42 #include <isc/condition.h>
43 #include <isc/formatcheck.h>
44 #include <isc/list.h>
45 #include <isc/log.h>
46 #include <isc/mem.h>
47 #include <isc/mutex.h>
48 #include <isc/net.h>
49 #include <isc/once.h>
50 #include <isc/platform.h>
51 #include <isc/print.h>
52 #include <isc/refcount.h>
53 #include <isc/region.h>
54 #include <isc/resource.h>
55 #include <isc/socket.h>
56 #include <isc/stats.h>
57 #include <isc/strerr.h>
58 #include <isc/string.h>
59 #include <isc/task.h>
60 #include <isc/thread.h>
61 #include <isc/util.h>
62 
63 #ifdef ISC_PLATFORM_HAVESYSUNH
64 #include <sys/un.h>
65 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
66 #ifdef HAVE_KQUEUE
67 #include <sys/event.h>
68 #endif /* ifdef HAVE_KQUEUE */
69 #ifdef HAVE_EPOLL_CREATE1
70 #include <sys/epoll.h>
71 #endif /* ifdef HAVE_EPOLL_CREATE1 */
72 #if defined(HAVE_SYS_DEVPOLL_H)
73 #include <sys/devpoll.h>
74 #elif defined(HAVE_DEVPOLL_H)
75 #include <devpoll.h>
76 #endif /* if defined(HAVE_SYS_DEVPOLL_H) */
77 
78 #include <netinet/tcp.h>
79 
80 #include "errno2result.h"
81 
82 #ifdef ENABLE_TCP_FASTOPEN
83 #include <netinet/tcp.h>
84 #endif /* ifdef ENABLE_TCP_FASTOPEN */
85 
86 #ifdef HAVE_JSON_C
87 #include <json_object.h>
88 #endif /* HAVE_JSON_C */
89 
90 #ifdef HAVE_LIBXML2
91 #include <libxml/xmlwriter.h>
92 #define ISC_XMLCHAR (const xmlChar *)
93 #endif /* HAVE_LIBXML2 */
94 
/*%
 * Choose the most preferable multiplex method: kqueue (BSD/macOS) is
 * preferred over epoll (Linux), then /dev/poll (Solaris), with select()
 * as the portable fallback.
 */
#if defined(HAVE_KQUEUE)
#define USE_KQUEUE
#elif defined(HAVE_EPOLL_CREATE1)
#define USE_EPOLL
#elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
#define USE_DEVPOLL
/*% Per-FD record of which directions are currently polled via /dev/poll. */
typedef struct {
	unsigned int want_read : 1, want_write : 1;
} pollinfo_t;
#else /* if defined(HAVE_KQUEUE) */
#define USE_SELECT
#endif /* HAVE_KQUEUE */

/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

/*%
 * Maximum number of allowable open sockets.  This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accept more than (the system default
 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
 * the vast majority of cases.  This constant should therefore be increased only
 * when absolutely necessary and possible, i.e., the server is exhausting all
 * available file descriptors (up to FD_SETSIZE) and the select() function
 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always by true, but we keep using some of them to ensure as much
 * portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
 */
#ifndef ISC_SOCKET_MAXSOCKETS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXSOCKETS 21000
#else /* ifdef TUNE_LARGE */
#define ISC_SOCKET_MAXSOCKETS 4096
#endif /* TUNE_LARGE */
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif /* USE_KQUEUE... */
#endif /* ISC_SOCKET_MAXSOCKETS */
148 
#ifdef USE_SELECT
/*%
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
 * (_DARWIN_UNLIMITED_SELECT must be defined before <sys/select.h> takes
 * effect; it is consumed by the system headers, not by this file.)
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif /* __APPLE__ */
#endif /* USE_SELECT */

#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 * some of the specified FD.  The idea is based on the observation that it's
 * likely for a busy server to keep receiving packets.  It specifically works
 * as follows: the socket watcher is first initialized with the state of
 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
 * event occurs.  When it wakes up for a socket I/O event, it moves to the
 * poll_active state, and sets the poll timeout to a short period
 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
 * watcher goes to the poll_checking state with the same timeout period.
 * In this state, the watcher tries to detect whether this is a break
 * during intermittent events or the kernel bug is triggered.  If the next
 * polling reports an event within the short period, the previous timeout is
 * likely to be a kernel bug, and so the watcher goes back to the active state.
 * Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif /* ISC_SOCKET_USE_POLLWATCH */
187 
/*%
 * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
 * FDLOCK_ID swaps the high and low halves of the low FDLOCK_BITS bits of
 * the fd, so that consecutively numbered descriptors land in well-separated
 * buckets instead of adjacent ones.
 */
#define FDLOCK_BITS  10
#define FDLOCK_COUNT (1 << FDLOCK_BITS)
#define FDLOCK_ID(fd)                                   \
	(((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
	 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXEVENTS 2048
#else /* ifdef TUNE_LARGE */
#define ISC_SOCKET_MAXEVENTS 64
#endif /* TUNE_LARGE */
#endif /* ifndef ISC_SOCKET_MAXEVENTS */
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */

/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#ifndef socklen_t
#define socklen_t unsigned int
#endif /* ifndef socklen_t */

/*%
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 */
#define SOFT_ERROR(e)                                             \
	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
	 (e) == EINTR || (e) == 0)

/* Expands to the category/module/level triple taken by isc_log_write(). */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL	  90
#define CORRECTNESS_LEVEL 70
#define IOEVENT_LEVEL	  60
#define EVENT_LEVEL	  50
#define CREATION_LEVEL	  20

#define TRACE	    DLVL(TRACE_LEVEL)
#define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
#define IOEVENT	    DLVL(IOEVENT_LEVEL)
#define EVENT	    DLVL(EVENT_LEVEL)
#define CREATION    DLVL(CREATION_LEVEL)
252 
/* Internal-event alias used for this module's private task events. */
typedef isc_event_t intev_t;

#define SOCKET_MAGIC	ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)

/*!
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifndef USE_CMSG
#define USE_CMSG 1
#endif /* ifndef USE_CMSG */

/*%
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG 1
#endif /* ifndef USE_CMSG */
#endif /* ifdef SO_TIMESTAMP */

#if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
#define SET_RCVBUF
#endif

#if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
#define SET_SNDBUF
#endif

/*%
 * Instead of calculating the cmsgbuf lengths every time we take
 * a rule of thumb approach - sizes are taken from x86_64 linux,
 * multiplied by 2, everything should fit. Those sizes are not
 * large enough to cause any concern.
 */
#if defined(USE_CMSG)
#define CMSG_SP_IN6PKT 40
#else /* if defined(USE_CMSG) */
#define CMSG_SP_IN6PKT 0
#endif /* if defined(USE_CMSG) */

#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
#define CMSG_SP_TIMESTAMP 32
#else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
#define CMSG_SP_TIMESTAMP 0
#endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */

#if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
#define CMSG_SP_TCTOS 24
#else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
#define CMSG_SP_TCTOS 0
#endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */

#define CMSG_SP_INT 24

/* Align cmsg buffers to be safe on SPARC etc. */
#define RECVCMSGBUFLEN                                                       \
	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
			  1,                                                 \
		  sizeof(void *))
#define SENDCMSGBUFLEN                                                    \
	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
		  sizeof(void *))

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

/* Opaque here; defined below as struct isc__socketthread. */
typedef struct isc__socketthread isc__socketthread_t;

/* Convenience accessor for the accepted socket in a new-connection event. */
#define NEWCONNSOCK(ev) ((ev)->newsocket)
329 
/*%
 * One socket managed by an isc_socketmgr.  Fields in the first group are
 * set at creation and never change (or are internally synchronized, like
 * the refcount); the rest are protected by 'lock'.
 */
struct isc_socket {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MAGIC; checked by VALID_SOCKET() */
	isc_socketmgr_t *manager;
	isc_mutex_t lock;
	isc_sockettype_t type;
	/* Stats table for this socket's family/type (udp4statsindex etc.). */
	const isc_statscounter_t *statsindex;
	isc_refcount_t references;

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t) link; /* chain on manager->socklist */
	int fd;			     /* OS descriptor */
	int pf;			     /* protocol family */
	int threadid;		     /* owning watcher thread (gen_threadid) */
	char name[16];
	void *tag; /* opaque caller data — semantics set by users, not here */

	/* Queues of pending completion events, one per operation kind. */
	ISC_LIST(isc_socketevent_t) send_list;
	ISC_LIST(isc_socketevent_t) recv_list;
	ISC_LIST(isc_socket_newconnev_t) accept_list;
	ISC_LIST(isc_socket_connev_t) connect_list;

	isc_sockaddr_t peer_address; /* remote address */

	unsigned int listener : 1,	       /* listener socket */
		connected : 1, connecting : 1, /* connect pending
						* */
		bound  : 1,		       /* bound to local addr */
		dupped : 1, active : 1,	       /* currently active */
		pktdscp : 1;		       /* per packet dscp */

#ifdef ISC_PLATFORM_RECVOVERFLOW
	unsigned char overflow; /* used for MSG_TRUNC fake */
#endif				/* ifdef ISC_PLATFORM_RECVOVERFLOW */

	unsigned int dscp;
};
367 
#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)     ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

/*%
 * The socket manager: owns all sockets plus one watcher thread per
 * 'nthreads' entry in 'threads'.
 */
struct isc_socketmgr {
	/* Not locked. */
	unsigned int magic; /* SOCKET_MANAGER_MAGIC */
	isc_mem_t *mctx;
	isc_mutex_t lock;
	isc_stats_t *stats; /* may be NULL; see inc_stats()/dec_stats() */
	int nthreads;
	isc__socketthread_t *threads;
	unsigned int maxsocks; /* upper bound on fd values we accept */
	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t) socklist;
	int reserved; /* unlocked */
	isc_condition_t shutdown_ok;
	size_t maxudp;
};
386 
/*%
 * Per-watcher-thread state.  Each thread owns a poke pipe (pipe_fds) used
 * by select_poke()/select_readmsg(), an fd-indexed socket table, and the
 * backend-specific data for whichever multiplex method was compiled in.
 */
struct isc__socketthread {
	isc_socketmgr_t *manager;
	int threadid;
	isc_thread_t thread;
	int pipe_fds[2]; /* [0] read end (watcher), [1] write end (pokers) */
	isc_mutex_t *fdlock; /* FDLOCK_COUNT bucket locks; see FDLOCK_ID() */
	/* Locked by fdlock. */
	isc_socket_t **fds; /* fd -> socket, or NULL */
	int *fdstate;	    /* fd -> CLOSED/MANAGED/CLOSE_PENDING */
#ifdef USE_KQUEUE
	int kqueue_fd;
	int nevents;
	struct kevent *events;
#endif /* USE_KQUEUE */
#ifdef USE_EPOLL
	int epoll_fd;
	int nevents;
	struct epoll_event *events;
	uint32_t *epoll_events; /* fd -> currently registered EPOLLIN|EPOLLOUT */
#endif /* USE_EPOLL */
#ifdef USE_DEVPOLL
	int devpoll_fd;
	isc_resourcevalue_t open_max;
	unsigned int calls;
	int nevents;
	struct pollfd *events;
	pollinfo_t *fdpollinfo; /* fd -> which directions are being polled */
#endif /* USE_DEVPOLL */
#ifdef USE_SELECT
	int fd_bufsize;
	fd_set *read_fds;
	fd_set *read_fds_copy;
	fd_set *write_fds;
	fd_set *write_fds_copy;
	int maxfd;
#endif /* USE_SELECT */
};
424 
/*
 * Values for isc__socketthread.fdstate[].  CLOSED must be zero so a
 * freshly calloc'ed table starts with every fd closed.
 */
#define CLOSED	      0 /* this one must be zero */
#define MANAGED	      1
#define CLOSE_PENDING 2

/*
 * send() and recv() iovec counts.  Receive gets one extra slot when the
 * MSG_TRUNC emulation (ISC_PLATFORM_RECVOVERFLOW) needs an overflow byte.
 */
#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_PLATFORM_RECVOVERFLOW
#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
#else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
438 
/*
 * Forward declarations of the module's internal helpers.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket);
static void
send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
static void
send_senddone_event(isc_socket_t *, isc_socketevent_t **);
static void
send_connectdone_event(isc_socket_t *, isc_socket_connev_t **);
static void
free_socket(isc_socket_t **);
static isc_result_t
allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **);
static void
destroy(isc_socket_t **);
static void
internal_accept(isc_socket_t *);
static void
internal_connect(isc_socket_t *);
static void
internal_recv(isc_socket_t *);
static void
internal_send(isc_socket_t *);
static void
process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
static void
build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
		  struct iovec *, size_t *);
static void
build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
		  struct iovec *, size_t *);
static bool
process_ctlfd(isc__socketthread_t *thread);
static void
setdscp(isc_socket_t *sock, isc_dscp_t dscp);

/*
 * Message codes written down the poke pipe (select_poke()).  Negative
 * values are commands; a non-negative value is never used as a message.
 */
#define SELECT_POKE_SHUTDOWN (-1)
#define SELECT_POKE_NOTHING  (-2)
#define SELECT_POKE_READ     (-3)
#define SELECT_POKE_ACCEPT   (-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE    (-4)
#define SELECT_POKE_CONNECT  (-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE    (-5)
482 
/*%
 * Shortcut index arrays to get access to statistics counters.  Each table
 * below is indexed by these STATID_* values; a -1 entry means the counter
 * does not apply to that socket type (e.g. accept for UDP) and must never
 * be passed to inc_stats()/dec_stats().
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
static const isc_statscounter_t udp4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1, /* accept/acceptfail: not applicable to UDP */
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
};
static const isc_statscounter_t udp6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1, /* accept/acceptfail: not applicable to UDP */
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
};
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1, /* raw sockets track only open/close/recvfail/active */
	-1,
	-1,
	-1,
	-1,
	-1,
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
562 
563 static int
564 gen_threadid(isc_socket_t *sock);
565 
566 static int
gen_threadid(isc_socket_t * sock)567 gen_threadid(isc_socket_t *sock) {
568 	return (sock->fd % sock->manager->nthreads);
569 }
570 
571 static void
572 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
573 	    isc_logmodule_t *module, int level, const char *fmt, ...)
574 	ISC_FORMAT_PRINTF(5, 6);
575 static void
manager_log(isc_socketmgr_t * sockmgr,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)576 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
577 	    isc_logmodule_t *module, int level, const char *fmt, ...) {
578 	char msgbuf[2048];
579 	va_list ap;
580 
581 	if (!isc_log_wouldlog(isc_lctx, level)) {
582 		return;
583 	}
584 
585 	va_start(ap, fmt);
586 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
587 	va_end(ap);
588 
589 	isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
590 		      sockmgr, msgbuf);
591 }
592 
593 static void
594 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
595 	   isc_logmodule_t *module, int level, const char *fmt, ...)
596 	ISC_FORMAT_PRINTF(5, 6);
597 static void
thread_log(isc__socketthread_t * thread,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)598 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
599 	   isc_logmodule_t *module, int level, const char *fmt, ...) {
600 	char msgbuf[2048];
601 	va_list ap;
602 
603 	if (!isc_log_wouldlog(isc_lctx, level)) {
604 		return;
605 	}
606 
607 	va_start(ap, fmt);
608 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
609 	va_end(ap);
610 
611 	isc_log_write(isc_lctx, category, module, level,
612 		      "sockmgr %p thread %d: %s", thread->manager,
613 		      thread->threadid, msgbuf);
614 }
615 
616 static void
617 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
618 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
619 	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
620 static void
socket_log(isc_socket_t * sock,const isc_sockaddr_t * address,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)621 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
622 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
623 	   const char *fmt, ...) {
624 	char msgbuf[2048];
625 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
626 	va_list ap;
627 
628 	if (!isc_log_wouldlog(isc_lctx, level)) {
629 		return;
630 	}
631 
632 	va_start(ap, fmt);
633 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
634 	va_end(ap);
635 
636 	if (address == NULL) {
637 		isc_log_write(isc_lctx, category, module, level,
638 			      "socket %p: %s", sock, msgbuf);
639 	} else {
640 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
641 		isc_log_write(isc_lctx, category, module, level,
642 			      "socket %p %s: %s", sock, peerbuf, msgbuf);
643 	}
644 }
645 
646 /*%
647  * Increment socket-related statistics counters.
648  */
649 static inline void
inc_stats(isc_stats_t * stats,isc_statscounter_t counterid)650 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
651 	REQUIRE(counterid != -1);
652 
653 	if (stats != NULL) {
654 		isc_stats_increment(stats, counterid);
655 	}
656 }
657 
658 /*%
659  * Decrement socket-related statistics counters.
660  */
661 static inline void
dec_stats(isc_stats_t * stats,isc_statscounter_t counterid)662 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
663 	REQUIRE(counterid != -1);
664 
665 	if (stats != NULL) {
666 		isc_stats_decrement(stats, counterid);
667 	}
668 }
669 
/*%
 * Start watching 'fd' for the readiness event selected by 'msg'
 * (SELECT_POKE_READ or SELECT_POKE_WRITE) using whichever multiplex
 * backend was compiled in.  Returns ISC_R_SUCCESS, or an error converted
 * from errno if registration with the kernel failed.
 */
static inline isc_result_t
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	uint32_t oldevents;
	int ret;
	int op;

	/*
	 * epoll requires EPOLL_CTL_ADD for the first interest on an fd
	 * and EPOLL_CTL_MOD afterwards, so the previous mask decides 'op'.
	 */
	oldevents = thread->epoll_events[fd];
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] |= EPOLLIN;
	} else {
		thread->epoll_events[fd] |= EPOLLOUT;
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	/* Hold the socket's lock (when one exists) across epoll_ctl(). */
	if (thread->fds[fd] != NULL) {
		LOCK(&thread->fds[fd]->lock);
	}
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	if (thread->fds[fd] != NULL) {
		UNLOCK(&thread->fds[fd]->lock);
	}
	if (ret == -1) {
		if (errno == EEXIST) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d",
					 fd);
		}
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ) {
		pfd.events = POLLIN;
	} else {
		pfd.events = POLLOUT;
	}
	pfd.fd = fd;
	pfd.revents = 0;
	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
		result = isc__errno2result(errno);
	} else {
		/* Remember which direction is being polled; unwatch_fd()
		 * needs this to re-register the other direction. */
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 1;
		} else {
			thread->fdpollinfo[fd].want_write = 1;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_SET(fd, thread->read_fds);
	}
	if (msg == SELECT_POKE_WRITE) {
		FD_SET(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
761 
/*%
 * Stop watching 'fd' for the readiness event selected by 'msg'
 * (SELECT_POKE_READ or SELECT_POKE_WRITE).  Returns ISC_R_SUCCESS, or an
 * error if deregistration with the kernel failed.
 */
static inline isc_result_t
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	int ret;
	int op;

	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] &= ~(EPOLLIN);
	} else {
		thread->epoll_events[fd] &= ~(EPOLLOUT);
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	/* DEL once no interest remains, MOD while the other remains.
	 * ENOENT is tolerated: the fd may already be gone from the set. */
	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	if (ret == -1 && errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
				 fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
		result = isc__errno2result(errno);
	} else {
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 0;
		} else {
			thread->fdpollinfo[fd].want_write = 0;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_CLR(fd, thread->read_fds);
	} else if (msg == SELECT_POKE_WRITE) {
		FD_CLR(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}
854 
/*
 * A poke message was received, perform a proper watch/unwatch
 * on a fd provided.  Runs on the watcher thread after select_readmsg()
 * decodes a pipe message.
 */
static void
wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* Complete a pending close: unregister, close, mark CLOSED. */
		LOCK(&thread->fdlock[lockid]);
		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
		thread->fdstate[fd] = CLOSED;
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 *
		 * NOTE(review): the calls below actually run while the fdlock
		 * is still held, contradicting the note above — confirm
		 * whether the comment or the code is stale.
		 */
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}
	if (thread->fdstate[fd] != MANAGED) {
		/* Not a managed fd (e.g. already closed); nothing to do. */
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	/*
	 * Set requested bit.
	 */
	result = watch_fd(thread, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s", fd,
			      isc_result_totext(result));
	}
	UNLOCK(&thread->fdlock[lockid]);
}
920 
921 /*
922  * Poke the select loop when there is something for us to do.
923  * The write is required (by POSIX) to complete.  That is, we
924  * will not get partial writes.
925  */
926 static void
select_poke(isc_socketmgr_t * mgr,int threadid,int fd,int msg)927 select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
928 	int cc;
929 	int buf[2];
930 	char strbuf[ISC_STRERRORSIZE];
931 
932 	buf[0] = fd;
933 	buf[1] = msg;
934 
935 	do {
936 		cc = write(mgr->threads[threadid].pipe_fds[1], buf,
937 			   sizeof(buf));
938 #ifdef ENOSR
939 		/*
940 		 * Treat ENOSR as EAGAIN but loop slowly as it is
941 		 * unlikely to clear fast.
942 		 */
943 		if (cc < 0 && errno == ENOSR) {
944 			sleep(1);
945 			errno = EAGAIN;
946 		}
947 #endif /* ifdef ENOSR */
948 	} while (cc < 0 && SOFT_ERROR(errno));
949 
950 	if (cc < 0) {
951 		strerror_r(errno, strbuf, sizeof(strbuf));
952 		FATAL_ERROR(__FILE__, __LINE__,
953 			    "write() failed during watcher poke: %s", strbuf);
954 	}
955 
956 	INSIST(cc == sizeof(buf));
957 }
958 
959 /*
960  * Read a message on the internal fd.
961  */
962 static void
select_readmsg(isc__socketthread_t * thread,int * fd,int * msg)963 select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
964 	int buf[2];
965 	int cc;
966 	char strbuf[ISC_STRERRORSIZE];
967 
968 	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
969 	if (cc < 0) {
970 		*msg = SELECT_POKE_NOTHING;
971 		*fd = -1; /* Silence compiler. */
972 		if (SOFT_ERROR(errno)) {
973 			return;
974 		}
975 
976 		strerror_r(errno, strbuf, sizeof(strbuf));
977 		FATAL_ERROR(__FILE__, __LINE__,
978 			    "read() failed during watcher poke: %s", strbuf);
979 	}
980 	INSIST(cc == sizeof(buf));
981 
982 	*fd = buf[0];
983 	*msg = buf[1];
984 }
985 
986 /*
987  * Make a fd non-blocking.
988  */
989 static isc_result_t
make_nonblock(int fd)990 make_nonblock(int fd) {
991 	int ret;
992 	char strbuf[ISC_STRERRORSIZE];
993 #ifdef USE_FIONBIO_IOCTL
994 	int on = 1;
995 #else  /* ifdef USE_FIONBIO_IOCTL */
996 	int flags;
997 #endif /* ifdef USE_FIONBIO_IOCTL */
998 
999 #ifdef USE_FIONBIO_IOCTL
1000 	ret = ioctl(fd, FIONBIO, (char *)&on);
1001 #else  /* ifdef USE_FIONBIO_IOCTL */
1002 	flags = fcntl(fd, F_GETFL, 0);
1003 	flags |= PORT_NONBLOCK;
1004 	ret = fcntl(fd, F_SETFL, flags);
1005 #endif /* ifdef USE_FIONBIO_IOCTL */
1006 
1007 	if (ret == -1) {
1008 		strerror_r(errno, strbuf, sizeof(strbuf));
1009 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1010 #ifdef USE_FIONBIO_IOCTL
1011 				 "ioctl(%d, FIONBIO, &on): %s", fd,
1012 #else  /* ifdef USE_FIONBIO_IOCTL */
1013 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1014 #endif /* ifdef USE_FIONBIO_IOCTL */
1015 				 strbuf);
1016 
1017 		return (ISC_R_UNEXPECTED);
1018 	}
1019 
1020 	return (ISC_R_SUCCESS);
1021 }
1022 
1023 #ifdef USE_CMSG
1024 /*
1025  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1026  * In order to ensure as much portability as possible, we provide wrapper
1027  * functions of these macros.
1028  * Note that cmsg_space() could run slow on OSes that do not have
1029  * CMSG_SPACE.
1030  */
/*
 * Portable CMSG_LEN(): the length of a control message header plus
 * 'len' bytes of data, as stored in cmsg_len.
 */
static inline socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else  /* ifdef CMSG_LEN */
	socklen_t hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.  CMSG_DATA(NULL) yields the offset of the data
	 * portion, i.e. the (padded) header size.
	 */
	hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif /* ifdef CMSG_LEN */
}
1046 
/*
 * Portable CMSG_SPACE(): the number of bytes a control message with
 * 'len' bytes of data occupies in a control buffer, including padding
 * up to the start of the next header.
 */
static inline socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else  /* ifdef CMSG_SPACE */
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	/*
	 * Let CMSG_NXTHDR do the alignment math: the offset of the next
	 * header is exactly the space the first message occupies.
	 */
	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL) {
		return ((char *)cmsgp - (char *)msg.msg_control);
	} else {
		return (0);
	}
#endif /* ifdef CMSG_SPACE */
}
1075 #endif /* USE_CMSG */
1076 
1077 /*
1078  * Process control messages received on a socket.
1079  */
1080 static void
process_cmsg(isc_socket_t * sock,struct msghdr * msg,isc_socketevent_t * dev)1081 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1082 #ifdef USE_CMSG
1083 	struct cmsghdr *cmsgp;
1084 	struct in6_pktinfo *pktinfop;
1085 #ifdef SO_TIMESTAMP
1086 	void *timevalp;
1087 #endif /* ifdef SO_TIMESTAMP */
1088 #endif /* ifdef USE_CMSG */
1089 
1090 	/*
1091 	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1092 	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1093 	 * They are all here, outside of the CPP tests, because it is
1094 	 * more consistent with the usual ISC coding style.
1095 	 */
1096 	UNUSED(sock);
1097 	UNUSED(msg);
1098 	UNUSED(dev);
1099 
1100 #ifdef MSG_TRUNC
1101 	if ((msg->msg_flags & MSG_TRUNC) != 0) {
1102 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1103 	}
1104 #endif /* ifdef MSG_TRUNC */
1105 
1106 #ifdef MSG_CTRUNC
1107 	if ((msg->msg_flags & MSG_CTRUNC) != 0) {
1108 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1109 	}
1110 #endif /* ifdef MSG_CTRUNC */
1111 
1112 #ifndef USE_CMSG
1113 	return;
1114 #else /* ifndef USE_CMSG */
1115 	if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
1116 		return;
1117 	}
1118 
1119 #ifdef SO_TIMESTAMP
1120 	timevalp = NULL;
1121 #endif /* ifdef SO_TIMESTAMP */
1122 	pktinfop = NULL;
1123 
1124 	cmsgp = CMSG_FIRSTHDR(msg);
1125 	while (cmsgp != NULL) {
1126 		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
1127 
1128 		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
1129 		    cmsgp->cmsg_type == IPV6_PKTINFO) {
1130 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1131 			memmove(&dev->pktinfo, pktinfop,
1132 				sizeof(struct in6_pktinfo));
1133 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1134 			socket_log(sock, NULL, TRACE,
1135 				   "interface received on ifindex %u",
1136 				   dev->pktinfo.ipi6_ifindex);
1137 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
1138 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1139 			}
1140 			goto next;
1141 		}
1142 
1143 #ifdef SO_TIMESTAMP
1144 		if (cmsgp->cmsg_level == SOL_SOCKET &&
1145 		    cmsgp->cmsg_type == SCM_TIMESTAMP) {
1146 			struct timeval tv;
1147 			timevalp = CMSG_DATA(cmsgp);
1148 			memmove(&tv, timevalp, sizeof(tv));
1149 			dev->timestamp.seconds = tv.tv_sec;
1150 			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
1151 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1152 			goto next;
1153 		}
1154 #endif /* ifdef SO_TIMESTAMP */
1155 
1156 #ifdef IPV6_TCLASS
1157 		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
1158 		    cmsgp->cmsg_type == IPV6_TCLASS) {
1159 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
1160 			dev->dscp >>= 2;
1161 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
1162 			goto next;
1163 		}
1164 #endif /* ifdef IPV6_TCLASS */
1165 
1166 #ifdef IP_TOS
1167 		if (cmsgp->cmsg_level == IPPROTO_IP &&
1168 		    (cmsgp->cmsg_type == IP_TOS
1169 #ifdef IP_RECVTOS
1170 		     || cmsgp->cmsg_type == IP_RECVTOS
1171 #endif /* ifdef IP_RECVTOS */
1172 		     ))
1173 		{
1174 			dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
1175 			dev->dscp >>= 2;
1176 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
1177 			goto next;
1178 		}
1179 #endif /* ifdef IP_TOS */
1180 	next:
1181 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
1182 	}
1183 #endif /* USE_CMSG */
1184 }
1185 
1186 /*
1187  * Construct an iov array and attach it to the msghdr passed in.  This is
1188  * the SEND constructor, which will use the used region of the buffer
1189  * (if using a buffer list) or will use the internal region (if a single
1190  * buffer I/O is requested).
1191  *
1192  * Nothing can be NULL, and the done event must list at least one buffer
1193  * on the buffer linked list for this function to be meaningful.
1194  *
1195  * If write_countp != NULL, *write_countp will hold the number of bytes
1196  * this transaction can send.
1197  */
1198 static void
build_msghdr_send(isc_socket_t * sock,char * cmsgbuf,isc_socketevent_t * dev,struct msghdr * msg,struct iovec * iov,size_t * write_countp)1199 build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
1200 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
1201 	unsigned int iovcount;
1202 	size_t write_count;
1203 	struct cmsghdr *cmsgp;
1204 
1205 	memset(msg, 0, sizeof(*msg));
1206 
1207 	if (!sock->connected) {
1208 		msg->msg_name = (void *)&dev->address.type.sa;
1209 		msg->msg_namelen = dev->address.length;
1210 	} else {
1211 		msg->msg_name = NULL;
1212 		msg->msg_namelen = 0;
1213 	}
1214 
1215 	write_count = dev->region.length - dev->n;
1216 	iov[0].iov_base = (void *)(dev->region.base + dev->n);
1217 	iov[0].iov_len = write_count;
1218 	iovcount = 1;
1219 
1220 	msg->msg_iov = iov;
1221 	msg->msg_iovlen = iovcount;
1222 	msg->msg_control = NULL;
1223 	msg->msg_controllen = 0;
1224 	msg->msg_flags = 0;
1225 #if defined(USE_CMSG)
1226 
1227 	if ((sock->type == isc_sockettype_udp) &&
1228 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
1229 	{
1230 		struct in6_pktinfo *pktinfop;
1231 
1232 		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
1233 			   dev->pktinfo.ipi6_ifindex);
1234 
1235 		msg->msg_control = (void *)cmsgbuf;
1236 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1237 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1238 
1239 		cmsgp = (struct cmsghdr *)cmsgbuf;
1240 		cmsgp->cmsg_level = IPPROTO_IPV6;
1241 		cmsgp->cmsg_type = IPV6_PKTINFO;
1242 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1243 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1244 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1245 	}
1246 
1247 #if defined(IPV6_USE_MIN_MTU)
1248 	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
1249 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
1250 	{
1251 		int use_min_mtu = 1; /* -1, 0, 1 */
1252 
1253 		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
1254 		msg->msg_control = (void *)cmsgbuf;
1255 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1256 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1257 
1258 		cmsgp->cmsg_level = IPPROTO_IPV6;
1259 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1260 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1261 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1262 	}
1263 #endif /* if defined(IPV6_USE_MIN_MTU) */
1264 
1265 	if (isc_dscp_check_value > -1) {
1266 		if (sock->type == isc_sockettype_udp) {
1267 			INSIST((int)dev->dscp == isc_dscp_check_value);
1268 		} else if (sock->type == isc_sockettype_tcp) {
1269 			INSIST((int)sock->dscp == isc_dscp_check_value);
1270 		}
1271 	}
1272 
1273 #if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
1274 	if ((sock->type == isc_sockettype_udp) &&
1275 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
1276 	{
1277 		int dscp = (dev->dscp << 2) & 0xff;
1278 
1279 		INSIST(dev->dscp < 0x40);
1280 
1281 #ifdef IP_TOS
1282 		if (sock->pf == AF_INET && sock->pktdscp) {
1283 			cmsgp = (struct cmsghdr *)(cmsgbuf +
1284 						   msg->msg_controllen);
1285 			msg->msg_control = (void *)cmsgbuf;
1286 			msg->msg_controllen += cmsg_space(sizeof(dscp));
1287 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1288 
1289 			cmsgp->cmsg_level = IPPROTO_IP;
1290 			cmsgp->cmsg_type = IP_TOS;
1291 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
1292 			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
1293 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
1294 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
1295 				       (void *)&dscp, sizeof(int)) < 0) {
1296 				char strbuf[ISC_STRERRORSIZE];
1297 				strerror_r(errno, strbuf, sizeof(strbuf));
1298 				UNEXPECTED_ERROR(__FILE__, __LINE__,
1299 						 "setsockopt(%d, IP_TOS, %.02x)"
1300 						 " failed: %s",
1301 						 sock->fd, dscp >> 2, strbuf);
1302 			} else {
1303 				sock->dscp = dscp;
1304 			}
1305 		}
1306 #endif /* ifdef IP_TOS */
1307 #if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
1308 		if (sock->pf == AF_INET6 && sock->pktdscp) {
1309 			cmsgp = (struct cmsghdr *)(cmsgbuf +
1310 						   msg->msg_controllen);
1311 			msg->msg_control = (void *)cmsgbuf;
1312 			msg->msg_controllen += cmsg_space(sizeof(dscp));
1313 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
1314 
1315 			cmsgp->cmsg_level = IPPROTO_IPV6;
1316 			cmsgp->cmsg_type = IPV6_TCLASS;
1317 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
1318 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
1319 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
1320 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
1321 				       (void *)&dscp, sizeof(int)) < 0)
1322 			{
1323 				char strbuf[ISC_STRERRORSIZE];
1324 				strerror_r(errno, strbuf, sizeof(strbuf));
1325 				UNEXPECTED_ERROR(__FILE__, __LINE__,
1326 						 "setsockopt(%d, IPV6_TCLASS, "
1327 						 "%.02x) failed: %s",
1328 						 sock->fd, dscp >> 2, strbuf);
1329 			} else {
1330 				sock->dscp = dscp;
1331 			}
1332 		}
1333 #endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
1334 		if (msg->msg_controllen != 0 &&
1335 		    msg->msg_controllen < SENDCMSGBUFLEN) {
1336 			memset(cmsgbuf + msg->msg_controllen, 0,
1337 			       SENDCMSGBUFLEN - msg->msg_controllen);
1338 		}
1339 	}
1340 #endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
1341 	* defined(IPV6_TCLASS))                           \
1342 	* */
1343 #endif /* USE_CMSG */
1344 
1345 	if (write_countp != NULL) {
1346 		*write_countp = write_count;
1347 	}
1348 }
1349 
1350 /*
1351  * Construct an iov array and attach it to the msghdr passed in.  This is
1352  * the RECV constructor, which will use the available region of the buffer
1353  * (if using a buffer list) or will use the internal region (if a single
1354  * buffer I/O is requested).
1355  *
1356  * Nothing can be NULL, and the done event must list at least one buffer
1357  * on the buffer linked list for this function to be meaningful.
1358  *
1359  * If read_countp != NULL, *read_countp will hold the number of bytes
1360  * this transaction can receive.
1361  */
1362 static void
build_msghdr_recv(isc_socket_t * sock,char * cmsgbuf,isc_socketevent_t * dev,struct msghdr * msg,struct iovec * iov,size_t * read_countp)1363 build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
1364 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
1365 	unsigned int iovcount;
1366 	size_t read_count;
1367 
1368 	memset(msg, 0, sizeof(struct msghdr));
1369 
1370 	if (sock->type == isc_sockettype_udp) {
1371 		memset(&dev->address, 0, sizeof(dev->address));
1372 		msg->msg_name = (void *)&dev->address.type.sa;
1373 		msg->msg_namelen = sizeof(dev->address.type);
1374 	} else { /* TCP */
1375 		msg->msg_name = NULL;
1376 		msg->msg_namelen = 0;
1377 		dev->address = sock->peer_address;
1378 	}
1379 
1380 	read_count = dev->region.length - dev->n;
1381 	iov[0].iov_base = (void *)(dev->region.base + dev->n);
1382 	iov[0].iov_len = read_count;
1383 	iovcount = 1;
1384 
1385 	/*
1386 	 * If needed, set up to receive that one extra byte.
1387 	 */
1388 #ifdef ISC_PLATFORM_RECVOVERFLOW
1389 	if (sock->type == isc_sockettype_udp) {
1390 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
1391 		iov[iovcount].iov_base = (void *)(&sock->overflow);
1392 		iov[iovcount].iov_len = 1;
1393 		iovcount++;
1394 	}
1395 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
1396 
1397 	msg->msg_iov = iov;
1398 	msg->msg_iovlen = iovcount;
1399 
1400 #if defined(USE_CMSG)
1401 	msg->msg_control = cmsgbuf;
1402 	msg->msg_controllen = RECVCMSGBUFLEN;
1403 #else  /* if defined(USE_CMSG) */
1404 	msg->msg_control = NULL;
1405 	msg->msg_controllen = 0;
1406 #endif /* USE_CMSG */
1407 	msg->msg_flags = 0;
1408 
1409 	if (read_countp != NULL) {
1410 		*read_countp = read_count;
1411 	}
1412 }
1413 
1414 static void
set_dev_address(const isc_sockaddr_t * address,isc_socket_t * sock,isc_socketevent_t * dev)1415 set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
1416 		isc_socketevent_t *dev) {
1417 	if (sock->type == isc_sockettype_udp) {
1418 		if (address != NULL) {
1419 			dev->address = *address;
1420 		} else {
1421 			dev->address = sock->peer_address;
1422 		}
1423 	} else if (sock->type == isc_sockettype_tcp) {
1424 		INSIST(address == NULL);
1425 		dev->address = sock->peer_address;
1426 	}
1427 }
1428 
/*
 * isc_event destructor hook for socket events: invoke the original
 * destructor that allocate_socketevent() stashed in ev->destroy.
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *ev = (isc_socketevent_t *)event;

	(ev->destroy)(event);
}
1435 
1436 static isc_socketevent_t *
allocate_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)1437 allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
1438 		     isc_taskaction_t action, void *arg) {
1439 	isc_socketevent_t *ev;
1440 
1441 	ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
1442 						     action, arg, sizeof(*ev));
1443 
1444 	ev->result = ISC_R_UNSET;
1445 	ISC_LINK_INIT(ev, ev_link);
1446 	ev->region.base = NULL;
1447 	ev->n = 0;
1448 	ev->offset = 0;
1449 	ev->attributes = 0;
1450 	ev->destroy = ev->ev_destroy;
1451 	ev->ev_destroy = destroy_socketevent;
1452 	ev->dscp = 0;
1453 
1454 	return (ev);
1455 }
1456 
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print a msghdr's name, iovec and control fields to
 * stdout.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int idx;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (idx = 0; idx < (unsigned int)msg->msg_iovlen; idx++) {
		printf("\t\t%u\tbase %p, len %ld\n", idx,
		       msg->msg_iov[idx].iov_base,
		       (long)msg->msg_iov[idx].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
#endif /* if defined(ISC_SOCKET_DEBUG) */
1473 
1474 #define DOIO_SUCCESS 0 /* i/o ok, event sent */
1475 #define DOIO_SOFT    1 /* i/o ok, soft error, no event sent */
1476 #define DOIO_HARD    2 /* i/o error, event sent */
1477 #define DOIO_EOF     3 /* EOF, no event sent */
1478 
/*
 * Attempt a single recvmsg() on 'sock' into the region described by
 * 'dev'.  Returns one of the DOIO_* codes defined above; on DOIO_HARD
 * dev->result holds the mapped error, on DOIO_SUCCESS it is
 * ISC_R_SUCCESS.
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later calls may clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * SOFT_OR_HARD: the error is hard only on a connected socket (where it
 * refers to our one peer); unconnected sockets treat it as retryable.
 * ALWAYS_HARD: the error always terminates the operation.
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (recv_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (recv_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * Older operating systems may still return EPROTO in some
		 * situations, for example when receiving ICMP/ICMPv6 errors.
		 * A real life scenario is when ICMPv6 returns code 5 or 6.
		 * These codes are introduced in RFC 4443 from March 2006,
		 * and the document obsoletes RFC 1885. But unfortunately not
		 * all operating systems have caught up with the new standard
		 * (in 2020) and thus a generic protocol error is returned.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Anything not mapped above: translate errno generically. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Drop forged-looking packets from source port 0. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp) {
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1640 
1641 /*
1642  * Returns:
1643  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1644  *			ISC_R_SUCCESS.
1645  *
1646  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1647  *			dev->result contains the appropriate error.
1648  *
1649  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1650  *			event was sent.  The operation should be retried.
1651  *
1652  *	No other return values are possible.
1653  */
1654 static int
doio_send(isc_socket_t * sock,isc_socketevent_t * dev)1655 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1656 	int cc;
1657 	struct iovec iov[MAXSCATTERGATHER_SEND];
1658 	size_t write_count;
1659 	struct msghdr msghdr;
1660 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1661 	int attempts = 0;
1662 	int send_errno;
1663 	char strbuf[ISC_STRERRORSIZE];
1664 	char cmsgbuf[SENDCMSGBUFLEN] = { 0 };
1665 
1666 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1667 
1668 resend:
1669 	if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
1670 	    write_count > sock->manager->maxudp)
1671 	{
1672 		cc = write_count;
1673 	} else {
1674 		cc = sendmsg(sock->fd, &msghdr, 0);
1675 	}
1676 	send_errno = errno;
1677 
1678 	/*
1679 	 * Check for error or block condition.
1680 	 */
1681 	if (cc < 0) {
1682 		if (send_errno == EINTR && ++attempts < NRETRIES) {
1683 			goto resend;
1684 		}
1685 
1686 		if (SOFT_ERROR(send_errno)) {
1687 			if (errno == EWOULDBLOCK || errno == EAGAIN) {
1688 				dev->result = ISC_R_WOULDBLOCK;
1689 			}
1690 			return (DOIO_SOFT);
1691 		}
1692 
1693 #define SOFT_OR_HARD(_system, _isc)                                   \
1694 	if (send_errno == _system) {                                  \
1695 		if (sock->connected) {                                \
1696 			dev->result = _isc;                           \
1697 			inc_stats(sock->manager->stats,               \
1698 				  sock->statsindex[STATID_SENDFAIL]); \
1699 			return (DOIO_HARD);                           \
1700 		}                                                     \
1701 		return (DOIO_SOFT);                                   \
1702 	}
1703 #define ALWAYS_HARD(_system, _isc)                            \
1704 	if (send_errno == _system) {                          \
1705 		dev->result = _isc;                           \
1706 		inc_stats(sock->manager->stats,               \
1707 			  sock->statsindex[STATID_SENDFAIL]); \
1708 		return (DOIO_HARD);                           \
1709 	}
1710 
1711 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1712 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1713 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1714 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1715 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1716 #ifdef EHOSTDOWN
1717 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1718 #endif /* ifdef EHOSTDOWN */
1719 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1720 		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
1721 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1722 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1723 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1724 
1725 #undef SOFT_OR_HARD
1726 #undef ALWAYS_HARD
1727 
1728 		/*
1729 		 * The other error types depend on whether or not the
1730 		 * socket is UDP or TCP.  If it is UDP, some errors
1731 		 * that we expect to be fatal under TCP are merely
1732 		 * annoying, and are really soft errors.
1733 		 *
1734 		 * However, these soft errors are still returned as
1735 		 * a status.
1736 		 */
1737 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1738 		strerror_r(send_errno, strbuf, sizeof(strbuf));
1739 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1740 				 addrbuf, strbuf);
1741 		dev->result = isc__errno2result(send_errno);
1742 		inc_stats(sock->manager->stats,
1743 			  sock->statsindex[STATID_SENDFAIL]);
1744 		return (DOIO_HARD);
1745 	}
1746 
1747 	if (cc == 0) {
1748 		inc_stats(sock->manager->stats,
1749 			  sock->statsindex[STATID_SENDFAIL]);
1750 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1751 				 "doio_send: send() returned 0");
1752 	}
1753 
1754 	/*
1755 	 * If we write less than we expected, update counters, poke.
1756 	 */
1757 	dev->n += cc;
1758 	if ((size_t)cc != write_count) {
1759 		return (DOIO_SOFT);
1760 	}
1761 
1762 	/*
1763 	 * Exactly what we wanted to write.  We're done with this
1764 	 * entry.  Post its completion event.
1765 	 */
1766 	dev->result = ISC_R_SUCCESS;
1767 	return (DOIO_SUCCESS);
1768 }
1769 
1770 /*
1771  * Kill.
1772  *
1773  * Caller must ensure that the socket is not locked and no external
1774  * references exist.
1775  */
1776 static void
socketclose(isc__socketthread_t * thread,isc_socket_t * sock,int fd)1777 socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) {
1778 	int lockid = FDLOCK_ID(fd);
1779 	/*
1780 	 * No one has this socket open, so the watcher doesn't have to be
1781 	 * poked, and the socket doesn't have to be locked.
1782 	 */
1783 	LOCK(&thread->fdlock[lockid]);
1784 	thread->fds[fd] = NULL;
1785 	thread->fdstate[fd] = CLOSE_PENDING;
1786 	UNLOCK(&thread->fdlock[lockid]);
1787 	select_poke(thread->manager, thread->threadid, fd, SELECT_POKE_CLOSE);
1788 
1789 	inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);
1790 
1791 	LOCK(&sock->lock);
1792 	if (sock->active == 1) {
1793 		dec_stats(thread->manager->stats,
1794 			  sock->statsindex[STATID_ACTIVE]);
1795 		sock->active = 0;
1796 	}
1797 	UNLOCK(&sock->lock);
1798 
1799 	/*
1800 	 * update manager->maxfd here (XXX: this should be implemented more
1801 	 * efficiently)
1802 	 */
1803 #ifdef USE_SELECT
1804 	LOCK(&thread->manager->lock);
1805 	if (thread->maxfd == fd) {
1806 		int i;
1807 
1808 		thread->maxfd = 0;
1809 		for (i = fd - 1; i >= 0; i--) {
1810 			lockid = FDLOCK_ID(i);
1811 
1812 			LOCK(&thread->fdlock[lockid]);
1813 			if (thread->fdstate[i] == MANAGED) {
1814 				thread->maxfd = i;
1815 				UNLOCK(&thread->fdlock[lockid]);
1816 				break;
1817 			}
1818 			UNLOCK(&thread->fdlock[lockid]);
1819 		}
1820 		if (thread->maxfd < thread->pipe_fds[0]) {
1821 			thread->maxfd = thread->pipe_fds[0];
1822 		}
1823 	}
1824 
1825 	UNLOCK(&thread->manager->lock);
1826 #endif /* USE_SELECT */
1827 }
1828 
1829 static void
destroy(isc_socket_t ** sockp)1830 destroy(isc_socket_t **sockp) {
1831 	int fd = 0;
1832 	isc_socket_t *sock = *sockp;
1833 	isc_socketmgr_t *manager = sock->manager;
1834 	isc__socketthread_t *thread = NULL;
1835 
1836 	socket_log(sock, NULL, CREATION, "destroying");
1837 
1838 	isc_refcount_destroy(&sock->references);
1839 
1840 	LOCK(&sock->lock);
1841 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
1842 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
1843 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1844 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1845 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1846 
1847 	if (sock->fd >= 0) {
1848 		fd = sock->fd;
1849 		thread = &manager->threads[sock->threadid];
1850 		sock->fd = -1;
1851 		sock->threadid = -1;
1852 	}
1853 	UNLOCK(&sock->lock);
1854 
1855 	if (fd > 0) {
1856 		socketclose(thread, sock, fd);
1857 	}
1858 
1859 	LOCK(&manager->lock);
1860 
1861 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1862 
1863 	if (ISC_LIST_EMPTY(manager->socklist)) {
1864 		SIGNAL(&manager->shutdown_ok);
1865 	}
1866 
1867 	/* can't unlock manager as its memory context is still used */
1868 	free_socket(sockp);
1869 
1870 	UNLOCK(&manager->lock);
1871 }
1872 
1873 static isc_result_t
allocate_socket(isc_socketmgr_t * manager,isc_sockettype_t type,isc_socket_t ** socketp)1874 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1875 		isc_socket_t **socketp) {
1876 	isc_socket_t *sock;
1877 
1878 	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1879 
1880 	sock->magic = 0;
1881 	isc_refcount_init(&sock->references, 0);
1882 
1883 	sock->manager = manager;
1884 	sock->type = type;
1885 	sock->fd = -1;
1886 	sock->threadid = -1;
1887 	sock->dscp = 0; /* TOS/TCLASS is zero until set. */
1888 	sock->dupped = 0;
1889 	sock->statsindex = NULL;
1890 	sock->active = 0;
1891 
1892 	ISC_LINK_INIT(sock, link);
1893 
1894 	memset(sock->name, 0, sizeof(sock->name));
1895 	sock->tag = NULL;
1896 
1897 	/*
1898 	 * Set up list of readers and writers to be initially empty.
1899 	 */
1900 	ISC_LIST_INIT(sock->recv_list);
1901 	ISC_LIST_INIT(sock->send_list);
1902 	ISC_LIST_INIT(sock->accept_list);
1903 	ISC_LIST_INIT(sock->connect_list);
1904 
1905 	sock->listener = 0;
1906 	sock->connected = 0;
1907 	sock->connecting = 0;
1908 	sock->bound = 0;
1909 	sock->pktdscp = 0;
1910 
1911 	/*
1912 	 * Initialize the lock.
1913 	 */
1914 	isc_mutex_init(&sock->lock);
1915 
1916 	sock->magic = SOCKET_MAGIC;
1917 	*socketp = sock;
1918 
1919 	return (ISC_R_SUCCESS);
1920 }
1921 
1922 /*
1923  * This event requires that the various lists be empty, that the reference
1924  * count be 1, and that the magic number is valid.  The other socket bits,
1925  * like the lock, must be initialized as well.  The fd associated must be
1926  * marked as closed, by setting it to -1 on close, or this routine will
1927  * also close the socket.
1928  */
static void
free_socket(isc_socket_t **socketp) {
	isc_socket_t *sock = *socketp;
	*socketp = NULL;

	INSIST(VALID_SOCKET(sock));
	/* All references must already be gone. */
	isc_refcount_destroy(&sock->references);
	LOCK(&sock->lock);
	/* No operation may still be pending or queued on the socket. */
	INSIST(!sock->connecting);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->connect_list));
	INSIST(!ISC_LINK_LINKED(sock, link));
	UNLOCK(&sock->lock);

	/* Invalidate the magic so stale pointers fail VALID_SOCKET(). */
	sock->magic = 0;

	isc_mutex_destroy(&sock->lock);

	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
}
1951 
#if defined(SET_RCVBUF)
static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
/* Largest usable SO_RCVBUF value, computed once by set_rcvbuf(). */
static int rcvbuf = ISC_RECV_BUFFER_SIZE;

/*
 * Determine the largest SO_RCVBUF value (bounded above by
 * ISC_RECV_BUFFER_SIZE) that setsockopt() will accept, by binary search
 * on a throwaway UDP socket, and leave the result in 'rcvbuf'.
 * Run exactly once via isc_once_do(&rcvbuf_once, ...).
 */
static void
set_rcvbuf(void) {
	int fd;
	int max = rcvbuf, min;
	socklen_t len;

	/* Probe socket: try IPv4 first, fall back to IPv6. */
	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	/* 'min' starts as the kernel's current (known-good) size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
	    min < rcvbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
			       sizeof(rcvbuf)) == -1)
		{
			if (errno == ENOBUFS && rcvbuf > min) {
				/* Too big: bisect between min and max. */
				max = rcvbuf - 1;
				rcvbuf = (rcvbuf + min) / 2;
				goto again;
			} else {
				/* Hard failure: keep the known-good size. */
				rcvbuf = min;
				goto cleanup;
			}
		} else {
			/* 'rcvbuf' was accepted; raise the floor. */
			min = rcvbuf;
		}
		if (min != max) {
			rcvbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_RCVBUF) */
2009 
#if defined(SET_SNDBUF)
static isc_once_t sndbuf_once = ISC_ONCE_INIT;
/* Largest usable SO_SNDBUF value, computed once by set_sndbuf(). */
static int sndbuf = ISC_SEND_BUFFER_SIZE;

/*
 * Determine the largest SO_SNDBUF value (bounded above by
 * ISC_SEND_BUFFER_SIZE) that setsockopt() will accept, by binary search
 * on a throwaway UDP socket, and leave the result in 'sndbuf'.
 * Run exactly once via isc_once_do(&sndbuf_once, ...).
 */
static void
set_sndbuf(void) {
	int fd;
	int max = sndbuf, min;
	socklen_t len;

	/* Probe socket: try IPv4 first, fall back to IPv6. */
	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	/* 'min' starts as the kernel's current (known-good) size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
	    min < sndbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno == ENOBUFS && sndbuf > min) {
				/* Too big: bisect between min and max. */
				max = sndbuf - 1;
				sndbuf = (sndbuf + min) / 2;
				goto again;
			} else {
				/* Hard failure: keep the known-good size. */
				sndbuf = min;
				goto cleanup;
			}
		} else {
			/* 'sndbuf' was accepted; raise the floor. */
			min = sndbuf;
		}
		if (min != max) {
			sndbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_SNDBUF) */
2067 
2068 static void
use_min_mtu(isc_socket_t * sock)2069 use_min_mtu(isc_socket_t *sock) {
2070 #if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
2071 	UNUSED(sock);
2072 #endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
2073 #ifdef IPV6_USE_MIN_MTU
2074 	/* use minimum MTU */
2075 	if (sock->pf == AF_INET6) {
2076 		int on = 1;
2077 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2078 				 (void *)&on, sizeof(on));
2079 	}
2080 #endif /* ifdef IPV6_USE_MIN_MTU */
2081 #if defined(IPV6_MTU)
2082 	/*
2083 	 * Use minimum MTU on IPv6 sockets.
2084 	 */
2085 	if (sock->pf == AF_INET6) {
2086 		int mtu = 1280;
2087 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
2088 				 sizeof(mtu));
2089 	}
2090 #endif /* if defined(IPV6_MTU) */
2091 }
2092 
2093 static void
set_tcp_maxseg(isc_socket_t * sock,int size)2094 set_tcp_maxseg(isc_socket_t *sock, int size) {
2095 #ifdef TCP_MAXSEG
2096 	if (sock->type == isc_sockettype_tcp) {
2097 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
2098 				 (void *)&size, sizeof(size));
2099 	}
2100 #endif /* ifdef TCP_MAXSEG */
2101 }
2102 
2103 static void
set_ip_disable_pmtud(isc_socket_t * sock)2104 set_ip_disable_pmtud(isc_socket_t *sock) {
2105 	/*
2106 	 * Disable Path MTU Discover on IP packets
2107 	 */
2108 	if (sock->pf == AF_INET6) {
2109 #if defined(IPV6_DONTFRAG)
2110 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
2111 				 &(int){ 0 }, sizeof(int));
2112 #endif
2113 #if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
2114 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
2115 				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
2116 #endif
2117 	} else if (sock->pf == AF_INET) {
2118 #if defined(IP_DONTFRAG)
2119 		(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, &(int){ 0 },
2120 				 sizeof(int));
2121 #endif
2122 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
2123 		(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2124 				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
2125 #endif
2126 	}
2127 }
2128 
/*
 * Open (or duplicate) the underlying file descriptor for 'sock' and
 * apply the standard per-type socket options.  When 'dup_socket' is
 * non-NULL the descriptor is dup()ed from it and option setup is
 * skipped (dups inherit the original's options).  On failure the
 * appropriate OPENFAIL statistic is bumped and a result code returned.
 */
static isc_result_t
opensocket(isc_socketmgr_t *manager, isc_socket_t *sock,
	   isc_socket_t *dup_socket) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
	int on = 1;
#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
	socklen_t optlen;
	int size = 0;
#endif

again:
	if (dup_socket == NULL) {
		switch (sock->type) {
		case isc_sockettype_udp:
			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		case isc_sockettype_unix:
			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
			break;
		case isc_sockettype_raw:
			errno = EPFNOSUPPORT;
			/*
			 * PF_ROUTE is an alias for PF_NETLINK on linux.
			 */
#if defined(PF_ROUTE)
			if (sock->fd == -1 && sock->pf == PF_ROUTE) {
#ifdef NETLINK_ROUTE
				sock->fd = socket(sock->pf, SOCK_RAW,
						  NETLINK_ROUTE);
#else  /* ifdef NETLINK_ROUTE */
				sock->fd = socket(sock->pf, SOCK_RAW, 0);
#endif /* ifdef NETLINK_ROUTE */
				if (sock->fd != -1) {
#ifdef NETLINK_ROUTE
					struct sockaddr_nl sa;
					int n;

					/*
					 * Do an implicit bind, subscribing to
					 * IPv4/IPv6 address-change messages.
					 */
					memset(&sa, 0, sizeof(sa));
					sa.nl_family = AF_NETLINK;
					sa.nl_groups = RTMGRP_IPV4_IFADDR |
						       RTMGRP_IPV6_IFADDR;
					n = bind(sock->fd,
						 (struct sockaddr *)&sa,
						 sizeof(sa));
					if (n < 0) {
						close(sock->fd);
						sock->fd = -1;
					}
#endif /* ifdef NETLINK_ROUTE */
					sock->bound = 1;
				}
			}
#endif /* if defined(PF_ROUTE) */
			break;
		}
	} else {
		/* Duplicate an existing descriptor instead of creating one. */
		sock->fd = dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
	/* Retry a bounded number of times if interrupted by a signal. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
		goto again;
	}

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved)
	{
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
		/* Preserve fcntl()'s errno across the close(). */
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "socket: file descriptor exceeds limit (%d/%u)",
			      sock->fd, manager->maxsocks);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		/* Map errno to a result; log only the noteworthy cases. */
		switch (errno) {
		case EMFILE:
		case ENFILE:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strbuf);
		/* fallthrough */
		case ENOBUFS:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
					 err, strbuf);
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* A dup inherits all options from the original descriptor. */
	if (dup_socket != NULL) {
		goto setup_done;
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (result);
	}

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
		       sizeof(on)) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
				 sock->fd, strbuf);
		/* Press on... */
	}
#endif /* ifdef SO_NOSIGPIPE */

	/*
	 * Use minimum mtu if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
	if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		/* Kernel timestamps on received datagrams. */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
			       sizeof(on)) < 0 &&
		    errno != ENOPROTOOPT)
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) failed: "
					 "%s",
					 sock->fd, strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "failed: %s",
					 sock->fd, strbuf);
		}
#else  /* ifdef IPV6_RECVPKTINFO */
		/* RFC 2292 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* defined(USE_CMSG) */

#if defined(SET_RCVBUF)
		/* Grow the receive buffer up to the probed maximum. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < rcvbuf)
		{
			RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_RCVBUF, "
						 "%d) failed: %s",
						 sock->fd, rcvbuf, strbuf);
			}
		}
#endif /* if defined(SET_RCVBUF) */

#if defined(SET_SNDBUF)
		/* Grow the send buffer up to the probed maximum. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < sndbuf)
		{
			RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
				       (void *)&sndbuf, sizeof(sndbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_SNDBUF, "
						 "%d) failed: %s",
						 sock->fd, sndbuf, strbuf);
			}
		}
#endif /* if defined(SET_SNDBUF) */
	}
#ifdef IPV6_RECVTCLASS
	if ((sock->pf == AF_INET6) &&
	    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVTCLASS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IPV6_RECVTCLASS */
#ifdef IP_RECVTOS
	if ((sock->pf == AF_INET) &&
	    (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IP_RECVTOS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IP_RECVTOS */
#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */

	set_ip_disable_pmtud(sock);

setup_done:
	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
	if (sock->active == 0) {
		inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}
2426 
2427 /*
2428  * Create a 'type' socket or duplicate an existing socket, managed
2429  * by 'manager'.  Events will be posted to 'task' and when dispatched
2430  * 'action' will be called with 'arg' as the arg value.  The new
2431  * socket is returned in 'socketp'.
2432  */
/*
 * Shared implementation for isc_socket_create() and isc_socket_dup():
 * allocate the isc_socket_t, pick the statistics counter set by
 * type/family, open (or dup) the descriptor, register it with a worker
 * thread, and append it to the manager's socket list.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket) {
	isc_socket_t *sock = NULL;
	isc__socketthread_t *thread;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/* Select the statistics counter set for this type/family. */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex = (pf == AF_INET) ? udp4statsindex
						   : udp6statsindex;
#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
		/* Record whether DSCP can be set per-packet on this family. */
		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
		break;
	case isc_sockettype_tcp:
		sock->statsindex = (pf == AF_INET) ? tcp4statsindex
						   : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	case isc_sockettype_raw:
		sock->statsindex = rawstatsindex;
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	sock->pf = pf;

	result = opensocket(manager, sock, dup_socket);
	if (result != ISC_R_SUCCESS) {
		free_socket(&sock);
		return (result);
	}

	if (sock->fd == -1) {
		abort();
	}
	sock->threadid = gen_threadid(sock);
	isc_refcount_increment0(&sock->references);
	thread = &manager->threads[sock->threadid];
	*socketp = sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the descriptor with its worker thread. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
	thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (thread->maxfd < sock->fd) {
		thread->maxfd = sock->fd;
	}
#endif /* ifdef USE_SELECT */
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION,
		   dup_socket != NULL ? "dupped" : "created");

	return (ISC_R_SUCCESS);
}
2519 
2520 /*%
2521  * Create a new 'type' socket managed by 'manager'.  Events
2522  * will be posted to 'task' and when dispatched 'action' will be
2523  * called with 'arg' as the arg value.  The new socket is returned
2524  * in 'socketp'.
2525  */
2526 isc_result_t
isc_socket_create(isc_socketmgr_t * manager0,int pf,isc_sockettype_t type,isc_socket_t ** socketp)2527 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2528 		  isc_socket_t **socketp) {
2529 	return (socket_create(manager0, pf, type, socketp, NULL));
2530 }
2531 
2532 /*%
2533  * Duplicate an existing socket.  The new socket is returned
2534  * in 'socketp'.
2535  */
2536 isc_result_t
isc_socket_dup(isc_socket_t * sock,isc_socket_t ** socketp)2537 isc_socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
2538 	REQUIRE(VALID_SOCKET(sock));
2539 	REQUIRE(socketp != NULL && *socketp == NULL);
2540 
2541 	return (socket_create(sock->manager, sock->pf, sock->type, socketp,
2542 			      sock));
2543 }
2544 
/*
 * Reopen the descriptor of a previously closed (fd == -1) socket and
 * re-register it with a worker thread.
 */
isc_result_t
isc_socket_open(isc_socket_t *sock) {
	isc_result_t result;
	isc__socketthread_t *thread;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	/* The socket must be referenced and currently closed. */
	REQUIRE(isc_refcount_current(&sock->references) >= 1);
	REQUIRE(sock->fd == -1);
	REQUIRE(sock->threadid == -1);

	result = opensocket(sock->manager, sock, NULL);

	/*
	 * NOTE(review): sock->fd and sock->threadid are written below
	 * after this unlock; presumably safe because the caller holds
	 * the only active reference while reopening -- confirm.
	 */
	UNLOCK(&sock->lock);

	if (result != ISC_R_SUCCESS) {
		sock->fd = -1;
	} else {
		/* Assign a worker thread and register the new fd with it. */
		sock->threadid = gen_threadid(sock);
		thread = &sock->manager->threads[sock->threadid];
		int lockid = FDLOCK_ID(sock->fd);

		LOCK(&thread->fdlock[lockid]);
		thread->fds[sock->fd] = sock;
		thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
		thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
		INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
		       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
		UNLOCK(&thread->fdlock[lockid]);

#ifdef USE_SELECT
		LOCK(&sock->manager->lock);
		if (thread->maxfd < sock->fd) {
			thread->maxfd = sock->fd;
		}
		UNLOCK(&sock->manager->lock);
#endif /* ifdef USE_SELECT */
	}

	return (result);
}
2592 
2593 /*
2594  * Attach to a socket.  Caller must explicitly detach when it is done.
2595  */
2596 void
isc_socket_attach(isc_socket_t * sock,isc_socket_t ** socketp)2597 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2598 	REQUIRE(VALID_SOCKET(sock));
2599 	REQUIRE(socketp != NULL && *socketp == NULL);
2600 
2601 	int old_refs = isc_refcount_increment(&sock->references);
2602 	REQUIRE(old_refs > 0);
2603 
2604 	*socketp = sock;
2605 }
2606 
2607 /*
2608  * Dereference a socket.  If this is the last reference to it, clean things
2609  * up by destroying the socket.
2610  */
2611 void
isc_socket_detach(isc_socket_t ** socketp)2612 isc_socket_detach(isc_socket_t **socketp) {
2613 	isc_socket_t *sock;
2614 
2615 	REQUIRE(socketp != NULL);
2616 	sock = *socketp;
2617 	REQUIRE(VALID_SOCKET(sock));
2618 	if (isc_refcount_decrement(&sock->references) == 1) {
2619 		destroy(&sock);
2620 	}
2621 
2622 	*socketp = NULL;
2623 }
2624 
2625 isc_result_t
isc_socket_close(isc_socket_t * sock)2626 isc_socket_close(isc_socket_t *sock) {
2627 	int fd;
2628 	isc_socketmgr_t *manager;
2629 	isc__socketthread_t *thread;
2630 	fflush(stdout);
2631 	REQUIRE(VALID_SOCKET(sock));
2632 
2633 	LOCK(&sock->lock);
2634 
2635 	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2636 
2637 	INSIST(!sock->connecting);
2638 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2639 	INSIST(ISC_LIST_EMPTY(sock->send_list));
2640 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2641 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
2642 
2643 	manager = sock->manager;
2644 	thread = &manager->threads[sock->threadid];
2645 	fd = sock->fd;
2646 	sock->fd = -1;
2647 	sock->threadid = -1;
2648 
2649 	sock->dupped = 0;
2650 	memset(sock->name, 0, sizeof(sock->name));
2651 	sock->tag = NULL;
2652 	sock->listener = 0;
2653 	sock->connected = 0;
2654 	sock->connecting = 0;
2655 	sock->bound = 0;
2656 	isc_sockaddr_any(&sock->peer_address);
2657 
2658 	UNLOCK(&sock->lock);
2659 
2660 	socketclose(thread, sock, fd);
2661 
2662 	return (ISC_R_SUCCESS);
2663 }
2664 
2665 /*
2666  * Dequeue an item off the given socket's read queue, set the result code
2667  * in the done event to the one provided, and send it to the task it was
2668  * destined for.
2669  *
2670  * If the event to be sent is on a list, remove it before sending.  If
2671  * asked to, send and detach from the socket as well.
2672  *
2673  * Caller must have the socket locked if the event is attached to the socket.
2674  */
2675 static void
send_recvdone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2676 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2677 	isc_task_t *task;
2678 
2679 	task = (*dev)->ev_sender;
2680 
2681 	(*dev)->ev_sender = sock;
2682 
2683 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2684 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2685 	}
2686 
2687 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2688 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2689 					 sock->threadid);
2690 	} else {
2691 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2692 	}
2693 }
2694 
2695 /*
2696  * See comments for send_recvdone_event() above.
2697  *
2698  * Caller must have the socket locked if the event is attached to the socket.
2699  */
2700 static void
send_senddone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2701 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2702 	isc_task_t *task;
2703 
2704 	INSIST(dev != NULL && *dev != NULL);
2705 
2706 	task = (*dev)->ev_sender;
2707 	(*dev)->ev_sender = sock;
2708 
2709 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2710 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2711 	}
2712 
2713 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2714 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2715 					 sock->threadid);
2716 	} else {
2717 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2718 	}
2719 }
2720 
2721 /*
2722  * See comments for send_recvdone_event() above.
2723  *
2724  * Caller must have the socket locked if the event is attached to the socket.
2725  */
2726 static void
send_connectdone_event(isc_socket_t * sock,isc_socket_connev_t ** dev)2727 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) {
2728 	isc_task_t *task;
2729 
2730 	INSIST(dev != NULL && *dev != NULL);
2731 
2732 	task = (*dev)->ev_sender;
2733 	(*dev)->ev_sender = sock;
2734 
2735 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2736 		ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
2737 	}
2738 
2739 	isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
2740 }
2741 
2742 /*
2743  * Call accept() on a socket, to get the new file descriptor.  The listen
2744  * socket is used as a prototype to create a new isc_socket_t.  The new
2745  * socket has one outstanding reference.  The task receiving the event
2746  * will be detached from just after the event is delivered.
2747  *
2748  * On entry to this function, the event delivered is the internal
2749  * readable event, and the first item on the accept_list should be
2750  * the done event we want to send.  If the list is empty, this is a no-op,
2751  * so just unlock and return.
2752  */
/*
 * Called with sock->lock held (see the socket_log() message below and
 * the UNLOCK-without-LOCK pattern); the lock is released on all paths
 * before returning.
 */
static void
internal_accept(isc_socket_t *sock) {
	isc_socketmgr_t *manager;
	isc__socketthread_t *thread, *nthread;
	isc_socket_newconnev_t *dev;
	isc_task_t *task;
	socklen_t addrlen;
	int fd;
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "accept";

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");

	manager = sock->manager;
	INSIST(VALID_MANAGER(manager));
	thread = &manager->threads[sock->threadid];

	INSIST(sock->listener);

	/*
	 * Get the first item off the accept list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->accept_list);
	if (dev == NULL) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
		UNLOCK(&sock->lock);
		return;
	}

	/*
	 * Try to accept the new connection.  If the accept fails with
	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
	 * again.  Also ignore ECONNRESET, which has been reported to
	 * be spuriously returned on Linux 2.2.19 although it is not
	 * a documented error for accept().  ECONNABORTED has been
	 * reported for Solaris 8.  The rest are thrown in not because
	 * we have seen them but because they are ignored by other
	 * daemons such as BIND 8 and Apache.
	 */

	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
		    (void *)&addrlen);

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio to work in.
	 */
	if (fd >= 0 && fd < 20) {
		int newfd, tmp;
		newfd = fcntl(fd, F_DUPFD, 20);
		/* Preserve fcntl()'s errno across the close(). */
		tmp = errno;
		(void)close(fd);
		errno = tmp;
		fd = newfd;
		err = "accept/fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (fd < 0) {
		if (SOFT_ERROR(errno)) {
			goto soft_error;
		}
		switch (errno) {
		case ENFILE:
		case EMFILE:
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: too many open file descriptors",
				      err);
			goto soft_error;

		case ENOBUFS:
		case ENOMEM:
		case ECONNRESET:
		case ECONNABORTED:
		case EHOSTUNREACH:
		case EHOSTDOWN:
		case ENETUNREACH:
		case ENETDOWN:
		case ECONNREFUSED:
#ifdef EPROTO
		case EPROTO:
#endif /* ifdef EPROTO */
#ifdef ENONET
		case ENONET:
#endif /* ifdef ENONET */
			goto soft_error;
		default:
			break;
		}
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: %s() failed: %s", err,
				 strbuf);
		fd = -1;
		result = ISC_R_UNEXPECTED;
	} else {
		/* Sanity-check the peer address accept() returned. */
		if (addrlen == 0U) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() failed to return "
					 "remote address");

			(void)close(fd);
			goto soft_error;
		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
			   sock->pf) {
			UNEXPECTED_ERROR(
				__FILE__, __LINE__,
				"internal_accept(): "
				"accept() returned peer address "
				"family %u (expected %u)",
				NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
				sock->pf);
			(void)close(fd);
			goto soft_error;
		} else if (fd >= (int)manager->maxsocks) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "accept: file descriptor exceeds limit "
				      "(%d/%u)",
				      fd, manager->maxsocks);
			(void)close(fd);
			goto soft_error;
		}
	}

	if (fd != -1) {
		NEWCONNSOCK(dev)->peer_address.length = addrlen;
		NEWCONNSOCK(dev)->pf = sock->pf;
	}

	/*
	 * Pull off the done event.
	 */
	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);

	/*
	 * Poke watcher if there are more pending accepts.
	 */
	if (ISC_LIST_EMPTY(sock->accept_list)) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	}

	if (fd != -1) {
		result = make_nonblock(fd);
		if (result != ISC_R_SUCCESS) {
			(void)close(fd);
			fd = -1;
		}
	}

	/*
	 * We need to unlock sock->lock now to be able to lock manager->lock
	 * without risking a deadlock with xmlstats.
	 */
	UNLOCK(&sock->lock);

	/*
	 * -1 means the new socket didn't happen.
	 */
	if (fd != -1) {
		int lockid = FDLOCK_ID(fd);

		NEWCONNSOCK(dev)->fd = fd;
		NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
		NEWCONNSOCK(dev)->bound = 1;
		NEWCONNSOCK(dev)->connected = 1;
		nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];

		/*
		 * We already hold a lock on one fdlock in accepting thread,
		 * we need to make sure that we don't double lock.
		 */
		bool same_bucket = (sock->threadid ==
				    NEWCONNSOCK(dev)->threadid) &&
				   (FDLOCK_ID(sock->fd) == lockid);

		/*
		 * Use minimum mtu if possible.
		 */
		use_min_mtu(NEWCONNSOCK(dev));
		set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);

		/*
		 * Ensure DSCP settings are inherited across accept.
		 */
		setdscp(NEWCONNSOCK(dev), sock->dscp);

		/*
		 * Save away the remote address
		 */
		dev->address = NEWCONNSOCK(dev)->peer_address;

		if (NEWCONNSOCK(dev)->active == 0) {
			inc_stats(manager->stats,
				  NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
			NEWCONNSOCK(dev)->active = 1;
		}

		/* Register the accepted fd with its worker thread. */
		if (!same_bucket) {
			LOCK(&nthread->fdlock[lockid]);
		}
		nthread->fds[fd] = NEWCONNSOCK(dev);
		nthread->fdstate[fd] = MANAGED;
#if defined(USE_EPOLL)
		nthread->epoll_events[fd] = 0;
#endif /* if defined(USE_EPOLL) */
		if (!same_bucket) {
			UNLOCK(&nthread->fdlock[lockid]);
		}

		LOCK(&manager->lock);

#ifdef USE_SELECT
		if (nthread->maxfd < fd) {
			nthread->maxfd = fd;
		}
#endif /* ifdef USE_SELECT */

		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
			   "accepted connection, new socket %p",
			   dev->newsocket);

		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);

		UNLOCK(&manager->lock);

		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
	} else {
		/* The accept failed: discard the pre-allocated socket. */
		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
		isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
		free_socket((isc_socket_t **)&dev->newsocket);
	}

	/*
	 * Fill in the done event details and send it off.
	 */
	dev->result = result;
	task = dev->ev_sender;
	dev->ev_sender = sock;

	isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
	return;

soft_error:
	/* Transient failure: keep watching the listener and try again. */
	watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	UNLOCK(&sock->lock);

	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
	return;
}
3012 
3013 static void
internal_recv(isc_socket_t * sock)3014 internal_recv(isc_socket_t *sock) {
3015 	isc_socketevent_t *dev;
3016 
3017 	INSIST(VALID_SOCKET(sock));
3018 	REQUIRE(sock->fd >= 0);
3019 
3020 	dev = ISC_LIST_HEAD(sock->recv_list);
3021 	if (dev == NULL) {
3022 		goto finish;
3023 	}
3024 
3025 	socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
3026 		   dev, dev->ev_sender);
3027 
3028 	/*
3029 	 * Try to do as much I/O as possible on this socket.  There are no
3030 	 * limits here, currently.
3031 	 */
3032 	while (dev != NULL) {
3033 		switch (doio_recv(sock, dev)) {
3034 		case DOIO_SOFT:
3035 			goto finish;
3036 
3037 		case DOIO_EOF:
3038 			/*
3039 			 * read of 0 means the remote end was closed.
3040 			 * Run through the event queue and dispatch all
3041 			 * the events with an EOF result code.
3042 			 */
3043 			do {
3044 				dev->result = ISC_R_EOF;
3045 				send_recvdone_event(sock, &dev);
3046 				dev = ISC_LIST_HEAD(sock->recv_list);
3047 			} while (dev != NULL);
3048 			goto finish;
3049 
3050 		case DOIO_SUCCESS:
3051 		case DOIO_HARD:
3052 			send_recvdone_event(sock, &dev);
3053 			break;
3054 		}
3055 
3056 		dev = ISC_LIST_HEAD(sock->recv_list);
3057 	}
3058 
3059 finish:
3060 	if (ISC_LIST_EMPTY(sock->recv_list)) {
3061 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3062 			   SELECT_POKE_READ);
3063 	}
3064 }
3065 
3066 static void
internal_send(isc_socket_t * sock)3067 internal_send(isc_socket_t *sock) {
3068 	isc_socketevent_t *dev;
3069 
3070 	INSIST(VALID_SOCKET(sock));
3071 	REQUIRE(sock->fd >= 0);
3072 
3073 	dev = ISC_LIST_HEAD(sock->send_list);
3074 	if (dev == NULL) {
3075 		goto finish;
3076 	}
3077 	socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev,
3078 		   dev->ev_sender);
3079 
3080 	/*
3081 	 * Try to do as much I/O as possible on this socket.  There are no
3082 	 * limits here, currently.
3083 	 */
3084 	while (dev != NULL) {
3085 		switch (doio_send(sock, dev)) {
3086 		case DOIO_SOFT:
3087 			goto finish;
3088 
3089 		case DOIO_HARD:
3090 		case DOIO_SUCCESS:
3091 			send_senddone_event(sock, &dev);
3092 			break;
3093 		}
3094 
3095 		dev = ISC_LIST_HEAD(sock->send_list);
3096 	}
3097 
3098 finish:
3099 	if (ISC_LIST_EMPTY(sock->send_list)) {
3100 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3101 			   SELECT_POKE_WRITE);
3102 	}
3103 }
3104 
/*
 * Process read/writes on each fd here.  Avoid locking
 * and unlocking twice if both reads and writes are possible.
 *
 * Lock order is thread->fdlock[lockid] first, then sock->lock; both
 * are held while dispatching I/O.  Note the asymmetry on the accept
 * path: internal_accept() releases sock->lock itself.
 */
static void
process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
	isc_socket_t *sock;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&thread->fdlock[lockid]);

		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		return;
	}

	/* No socket registered for this fd on this thread: spurious event. */
	sock = thread->fds[fd];
	if (sock == NULL) {
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&sock->lock);

	if (sock->fd < 0) {
		/*
		 * Sock is being closed - the final external reference
		 * is gone but it was not yet removed from event loop
		 * and fdstate[]/fds[] as destroy() is waiting on
		 * thread->fdlock[lockid] or sock->lock that we're holding.
		 * Just release the locks and bail.
		 */
		UNLOCK(&sock->lock);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	REQUIRE(readable || writeable);
	if (writeable) {
		/*
		 * A writable event on a connecting socket signals that the
		 * connect() has completed; otherwise flush pending sends.
		 */
		if (sock->connecting) {
			internal_connect(sock);
		} else {
			internal_send(sock);
		}
	}

	if (readable) {
		if (sock->listener) {
			internal_accept(sock); /* unlocks sock */
		} else {
			internal_recv(sock);
			UNLOCK(&sock->lock);
		}
	} else {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&thread->fdlock[lockid]);

	/*
	 * Socket destruction might be pending, it will resume
	 * after releasing fdlock and sock->lock.
	 */
}
3174 
/*
 * process_fds is different for different event loops
 * it takes the events from event loops and for each FD
 * launches process_fd
 *
 * Each backend variant skips the control-pipe fd during the scan and
 * handles it last via process_ctlfd(), whose return value ("shutdown
 * requested") is propagated to the caller (except in the select()
 * variant, where the caller checks the control fd itself).
 */
#ifdef USE_KQUEUE
static bool
process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
	int i;
	bool readable, writable;
	bool done = false;
	bool have_ctlevent = false;
	if (nevents == thread->nevents) {
		/*
		 * This is not an error, but something unexpected.  If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < thread->manager->maxsocks);
		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		/* kqueue reports read and write readiness as separate
		 * filters, so each event is one or the other. */
		readable = (events[i].filter == EVFILT_READ);
		writable = (events[i].filter == EVFILT_WRITE);
		process_fd(thread, events[i].ident, readable, writable);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_EPOLL)
static bool
process_fds(isc__socketthread_t *thread, struct epoll_event *events,
	    int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	if (nevents == thread->nevents) {
		/* Event buffer filled; ISC_SOCKET_MAXEVENTS may be low. */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
		if (events[i].data.fd == thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		if ((events[i].events & EPOLLERR) != 0 ||
		    (events[i].events & EPOLLHUP) != 0) {
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway.  This is a
			 * bit inefficient, but should be okay for such rare
			 * events.  Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			int fd = events[i].data.fd;
			events[i].events |= thread->epoll_events[fd];
		}
		process_fd(thread, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_DEVPOLL)
static bool
process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	if (nevents == thread->nevents) {
		/* Event buffer filled; ISC_SOCKET_MAXEVENTS may be low. */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
		if (events[i].fd == thread->pipe_fds[0]) {
			have_ctlevent = true;
			continue;
		}
		process_fd(thread, events[i].fd,
			   (events[i].events & POLLIN) != 0,
			   (events[i].events & POLLOUT) != 0);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
#elif defined(USE_SELECT)
static void
process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
	    fd_set *writefds) {
	int i;

	REQUIRE(maxfd <= (int)thread->manager->maxsocks);

	/* Scan every fd up to maxfd; the pipe fds are handled by the
	 * caller (netthread), not here. */
	for (i = 0; i < maxfd; i++) {
		if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
			continue;
		}
		process_fd(thread, i, FD_ISSET(i, readfds),
			   FD_ISSET(i, writefds));
	}
}
#endif /* ifdef USE_KQUEUE */
3308 
3309 static bool
process_ctlfd(isc__socketthread_t * thread)3310 process_ctlfd(isc__socketthread_t *thread) {
3311 	int msg, fd;
3312 
3313 	for (;;) {
3314 		select_readmsg(thread, &fd, &msg);
3315 
3316 		thread_log(thread, IOEVENT,
3317 			   "watcher got message %d for socket %d", msg, fd);
3318 
3319 		/*
3320 		 * Nothing to read?
3321 		 */
3322 		if (msg == SELECT_POKE_NOTHING) {
3323 			break;
3324 		}
3325 
3326 		/*
3327 		 * Handle shutdown message.  We really should
3328 		 * jump out of this loop right away, but
3329 		 * it doesn't matter if we have to do a little
3330 		 * more work first.
3331 		 */
3332 		if (msg == SELECT_POKE_SHUTDOWN) {
3333 			return (true);
3334 		}
3335 
3336 		/*
3337 		 * This is a wakeup on a socket.  Look
3338 		 * at the event queue for both read and write,
3339 		 * and decide if we need to watch on it now
3340 		 * or not.
3341 		 */
3342 		wakeup_socket(thread, fd, msg);
3343 	}
3344 
3345 	return (false);
3346 }
3347 
/*
 * This is the thread that will loop forever, always in a select or poll
 * call.
 *
 * When select returns something to do, do whatever's necessary and post
 * an event to the task that was requesting the action.
 */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc_socketmgr_t *manager = thread->manager;
	(void)manager; /* referenced only in the USE_SELECT build */
	bool done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	while (!done) {
		/* Block for events, retrying on soft (retryable) errors. */
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					/* fall back to a conservative limit */
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			/* Two passes: retry once after an EINVAL re-probe. */
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else  /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped.  Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			/* Hard failure of the wait primitive is fatal. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			/*
			 * Track idle/active/checking state to detect a
			 * kernel bug where DP_POLL times out spuriously.
			 */
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}
3511 
3512 void
isc_socketmgr_setreserved(isc_socketmgr_t * manager,uint32_t reserved)3513 isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
3514 	REQUIRE(VALID_MANAGER(manager));
3515 
3516 	manager->reserved = reserved;
3517 }
3518 
3519 void
isc_socketmgr_maxudp(isc_socketmgr_t * manager,unsigned int maxudp)3520 isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
3521 	REQUIRE(VALID_MANAGER(manager));
3522 
3523 	manager->maxudp = maxudp;
3524 }
3525 
3526 /*
3527  * Setup socket thread, thread->manager and thread->threadid must be filled.
3528  */
3529 
3530 static isc_result_t
setup_thread(isc__socketthread_t * thread)3531 setup_thread(isc__socketthread_t *thread) {
3532 	isc_result_t result = ISC_R_SUCCESS;
3533 	int i;
3534 	char strbuf[ISC_STRERRORSIZE];
3535 
3536 	REQUIRE(thread != NULL);
3537 	REQUIRE(VALID_MANAGER(thread->manager));
3538 	REQUIRE(thread->threadid >= 0 &&
3539 		thread->threadid < thread->manager->nthreads);
3540 
3541 	thread->fds =
3542 		isc_mem_get(thread->manager->mctx,
3543 			    thread->manager->maxsocks * sizeof(isc_socket_t *));
3544 
3545 	memset(thread->fds, 0,
3546 	       thread->manager->maxsocks * sizeof(isc_socket_t *));
3547 
3548 	thread->fdstate = isc_mem_get(thread->manager->mctx,
3549 				      thread->manager->maxsocks * sizeof(int));
3550 
3551 	memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
3552 
3553 	thread->fdlock = isc_mem_get(thread->manager->mctx,
3554 				     FDLOCK_COUNT * sizeof(isc_mutex_t));
3555 
3556 	for (i = 0; i < FDLOCK_COUNT; i++) {
3557 		isc_mutex_init(&thread->fdlock[i]);
3558 	}
3559 
3560 	if (pipe(thread->pipe_fds) != 0) {
3561 		strerror_r(errno, strbuf, sizeof(strbuf));
3562 		UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
3563 				 strbuf);
3564 		return (ISC_R_UNEXPECTED);
3565 	}
3566 	RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
3567 
3568 #ifdef USE_KQUEUE
3569 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3570 	thread->events = isc_mem_get(thread->manager->mctx,
3571 				     sizeof(struct kevent) * thread->nevents);
3572 
3573 	thread->kqueue_fd = kqueue();
3574 	if (thread->kqueue_fd == -1) {
3575 		result = isc__errno2result(errno);
3576 		strerror_r(errno, strbuf, sizeof(strbuf));
3577 		UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
3578 				 strbuf);
3579 		isc_mem_put(thread->manager->mctx, thread->events,
3580 			    sizeof(struct kevent) * thread->nevents);
3581 		return (result);
3582 	}
3583 
3584 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3585 	if (result != ISC_R_SUCCESS) {
3586 		close(thread->kqueue_fd);
3587 		isc_mem_put(thread->manager->mctx, thread->events,
3588 			    sizeof(struct kevent) * thread->nevents);
3589 	}
3590 	return (result);
3591 
3592 #elif defined(USE_EPOLL)
3593 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3594 	thread->epoll_events =
3595 		isc_mem_get(thread->manager->mctx,
3596 			    (thread->manager->maxsocks * sizeof(uint32_t)));
3597 
3598 	memset(thread->epoll_events, 0,
3599 	       thread->manager->maxsocks * sizeof(uint32_t));
3600 
3601 	thread->events =
3602 		isc_mem_get(thread->manager->mctx,
3603 			    sizeof(struct epoll_event) * thread->nevents);
3604 
3605 	thread->epoll_fd = epoll_create(thread->nevents);
3606 	if (thread->epoll_fd == -1) {
3607 		result = isc__errno2result(errno);
3608 		strerror_r(errno, strbuf, sizeof(strbuf));
3609 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
3610 				 strbuf);
3611 		return (result);
3612 	}
3613 
3614 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3615 	return (result);
3616 
3617 #elif defined(USE_DEVPOLL)
3618 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3619 	result = isc_resource_getcurlimit(isc_resource_openfiles,
3620 					  &thread->open_max);
3621 	if (result != ISC_R_SUCCESS) {
3622 		thread->open_max = 64;
3623 	}
3624 	thread->calls = 0;
3625 	thread->events = isc_mem_get(thread->manager->mctx,
3626 				     sizeof(struct pollfd) * thread->nevents);
3627 
3628 	/*
3629 	 * Note: fdpollinfo should be able to support all possible FDs, so
3630 	 * it must have maxsocks entries (not nevents).
3631 	 */
3632 	thread->fdpollinfo =
3633 		isc_mem_get(thread->manager->mctx,
3634 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3635 	memset(thread->fdpollinfo, 0,
3636 	       sizeof(pollinfo_t) * thread->manager->maxsocks);
3637 	thread->devpoll_fd = open("/dev/poll", O_RDWR);
3638 	if (thread->devpoll_fd == -1) {
3639 		result = isc__errno2result(errno);
3640 		strerror_r(errno, strbuf, sizeof(strbuf));
3641 		UNEXPECTED_ERROR(__FILE__, __LINE__,
3642 				 "open(/dev/poll) failed: %s", strbuf);
3643 		isc_mem_put(thread->manager->mctx, thread->events,
3644 			    sizeof(struct pollfd) * thread->nevents);
3645 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3646 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3647 		return (result);
3648 	}
3649 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3650 	if (result != ISC_R_SUCCESS) {
3651 		close(thread->devpoll_fd);
3652 		isc_mem_put(thread->manager->mctx, thread->events,
3653 			    sizeof(struct pollfd) * thread->nevents);
3654 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3655 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3656 		return (result);
3657 	}
3658 
3659 	return (ISC_R_SUCCESS);
3660 #elif defined(USE_SELECT)
3661 	UNUSED(result);
3662 
3663 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3664 	/*
3665 	 * Note: this code should also cover the case of MAXSOCKETS <=
3666 	 * FD_SETSIZE, but we separate the cases to avoid possible portability
3667 	 * issues regarding howmany() and the actual representation of fd_set.
3668 	 */
3669 	thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3670 			     sizeof(fd_mask);
3671 #else  /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3672 	thread->fd_bufsize = sizeof(fd_set);
3673 #endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3674 
3675 	thread->read_fds = isc_mem_get(thread->manager->mctx,
3676 				       thread->fd_bufsize);
3677 	thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
3678 					    thread->fd_bufsize);
3679 	thread->write_fds = isc_mem_get(thread->manager->mctx,
3680 					thread->fd_bufsize);
3681 	thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
3682 					     thread->fd_bufsize);
3683 	memset(thread->read_fds, 0, thread->fd_bufsize);
3684 	memset(thread->write_fds, 0, thread->fd_bufsize);
3685 
3686 	(void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3687 	thread->maxfd = thread->pipe_fds[0];
3688 
3689 	return (ISC_R_SUCCESS);
3690 #endif /* USE_KQUEUE */
3691 }
3692 
/*
 * Tear down one watcher thread's state: unwatch the control pipe, close
 * the backend event descriptor, free event buffers, close any fds still
 * marked CLOSE_PENDING, and destroy the fd tables and locks.
 *
 * NOTE(review): thread->pipe_fds are not closed here — presumably they
 * are closed elsewhere; verify against the shutdown path.
 */
static void
cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
	isc_result_t result;
	int i;

	result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		/*
		 * NOTE(review): this message names epoll_ctl() even on
		 * kqueue/devpoll/select builds.
		 */
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
	}
#ifdef USE_KQUEUE
	close(thread->kqueue_fd);
	isc_mem_put(mctx, thread->events,
		    sizeof(struct kevent) * thread->nevents);
#elif defined(USE_EPOLL)
	close(thread->epoll_fd);

	isc_mem_put(mctx, thread->events,
		    sizeof(struct epoll_event) * thread->nevents);
#elif defined(USE_DEVPOLL)
	close(thread->devpoll_fd);
	isc_mem_put(mctx, thread->events,
		    sizeof(struct pollfd) * thread->nevents);
	isc_mem_put(mctx, thread->fdpollinfo,
		    sizeof(pollinfo_t) * thread->manager->maxsocks);
#elif defined(USE_SELECT)
	if (thread->read_fds != NULL) {
		isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
	}
	if (thread->read_fds_copy != NULL) {
		isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
	}
	if (thread->write_fds != NULL) {
		isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
	}
	if (thread->write_fds_copy != NULL) {
		isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
	}
#endif /* USE_KQUEUE */
	/* Close any fds whose close was deferred until shutdown. */
	for (i = 0; i < (int)thread->manager->maxsocks; i++) {
		if (thread->fdstate[i] == CLOSE_PENDING) {
			/* no need to lock */
			(void)close(i);
		}
	}

#if defined(USE_EPOLL)
	isc_mem_put(thread->manager->mctx, thread->epoll_events,
		    thread->manager->maxsocks * sizeof(uint32_t));
#endif /* if defined(USE_EPOLL) */
	isc_mem_put(thread->manager->mctx, thread->fds,
		    thread->manager->maxsocks * sizeof(isc_socket_t *));
	isc_mem_put(thread->manager->mctx, thread->fdstate,
		    thread->manager->maxsocks * sizeof(int));

	for (i = 0; i < FDLOCK_COUNT; i++) {
		isc_mutex_destroy(&thread->fdlock[i]);
	}
	isc_mem_put(thread->manager->mctx, thread->fdlock,
		    FDLOCK_COUNT * sizeof(isc_mutex_t));
}
3753 
3754 isc_result_t
isc_socketmgr_create(isc_mem_t * mctx,isc_socketmgr_t ** managerp)3755 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3756 	return (isc_socketmgr_create2(mctx, managerp, 0, 1));
3757 }
3758 
3759 isc_result_t
isc_socketmgr_create2(isc_mem_t * mctx,isc_socketmgr_t ** managerp,unsigned int maxsocks,int nthreads)3760 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3761 		      unsigned int maxsocks, int nthreads) {
3762 	int i;
3763 	isc_socketmgr_t *manager;
3764 
3765 	REQUIRE(managerp != NULL && *managerp == NULL);
3766 
3767 	if (maxsocks == 0) {
3768 		maxsocks = ISC_SOCKET_MAXSOCKETS;
3769 	}
3770 
3771 	manager = isc_mem_get(mctx, sizeof(*manager));
3772 
3773 	/* zero-clear so that necessary cleanup on failure will be easy */
3774 	memset(manager, 0, sizeof(*manager));
3775 	manager->maxsocks = maxsocks;
3776 	manager->reserved = 0;
3777 	manager->maxudp = 0;
3778 	manager->nthreads = nthreads;
3779 	manager->stats = NULL;
3780 
3781 	manager->magic = SOCKET_MANAGER_MAGIC;
3782 	manager->mctx = NULL;
3783 	ISC_LIST_INIT(manager->socklist);
3784 	isc_mutex_init(&manager->lock);
3785 	isc_condition_init(&manager->shutdown_ok);
3786 
3787 	/*
3788 	 * Start up the select/poll thread.
3789 	 */
3790 	manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
3791 						     manager->nthreads);
3792 	isc_mem_attach(mctx, &manager->mctx);
3793 
3794 	for (i = 0; i < manager->nthreads; i++) {
3795 		manager->threads[i].manager = manager;
3796 		manager->threads[i].threadid = i;
3797 		setup_thread(&manager->threads[i]);
3798 		isc_thread_create(netthread, &manager->threads[i],
3799 				  &manager->threads[i].thread);
3800 		char tname[1024];
3801 		sprintf(tname, "isc-socket-%d", i);
3802 		isc_thread_setname(manager->threads[i].thread, tname);
3803 	}
3804 
3805 	*managerp = manager;
3806 
3807 	return (ISC_R_SUCCESS);
3808 }
3809 
3810 isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t * manager,unsigned int * nsockp)3811 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3812 	REQUIRE(VALID_MANAGER(manager));
3813 	REQUIRE(nsockp != NULL);
3814 
3815 	*nsockp = manager->maxsocks;
3816 
3817 	return (ISC_R_SUCCESS);
3818 }
3819 
3820 void
isc_socketmgr_setstats(isc_socketmgr_t * manager,isc_stats_t * stats)3821 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
3822 	REQUIRE(VALID_MANAGER(manager));
3823 	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
3824 	REQUIRE(manager->stats == NULL);
3825 	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
3826 
3827 	isc_stats_attach(stats, &manager->stats);
3828 }
3829 
void
isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc_socketmgr_t *manager;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = *managerp;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "sockets exist");
		WAIT(&manager->shutdown_ok, &manager->lock);
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 *
	 * NOTE(review): despite the comment above, shutdown is actually
	 * signalled by sending SELECT_POKE_SHUTDOWN control messages,
	 * not by closing the pipe.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
	}

	/*
	 * Wait for thread to exit.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		isc_thread_join(manager->threads[i].thread, NULL);
		cleanup_thread(manager->mctx, &manager->threads[i]);
	}
	/*
	 * Clean up.
	 */
	isc_mem_put(manager->mctx, manager->threads,
		    sizeof(isc__socketthread_t) * manager->nthreads);
	(void)isc_condition_destroy(&manager->shutdown_ok);

	if (manager->stats != NULL) {
		isc_stats_detach(&manager->stats);
	}
	isc_mutex_destroy(&manager->lock);
	manager->magic = 0;
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));

	*managerp = NULL;
}
3886 
/*
 * Start (or queue) a receive on 'sock', delivering completion event
 * 'dev' to 'task'.
 *
 * For UDP sockets the read is attempted immediately without taking
 * sock->lock; for other types the lock is taken and the read is only
 * attempted when no earlier request is already queued.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_INPROGRESS when ISC_SOCKFLAG_IMMEDIATE
 * was requested but the data could not be read right away.
 */
static isc_result_t
socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		/* A non-empty queue forces queuing (DOIO_SOFT below). */
		if (ISC_LIST_EMPTY(sock->recv_list)) {
			io_state = doio_recv(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/*
		 * No completion event is posted for IMMEDIATE requests —
		 * presumably the caller inspects 'dev' directly; verify
		 * against callers.
		 */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
3963 
3964 isc_result_t
isc_socket_recv(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,void * arg)3965 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3966 		isc_task_t *task, isc_taskaction_t action, void *arg) {
3967 	isc_socketevent_t *dev;
3968 	isc_socketmgr_t *manager;
3969 
3970 	REQUIRE(VALID_SOCKET(sock));
3971 	REQUIRE(action != NULL);
3972 
3973 	manager = sock->manager;
3974 	REQUIRE(VALID_MANAGER(manager));
3975 
3976 	INSIST(sock->bound);
3977 
3978 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
3979 				   action, arg);
3980 	if (dev == NULL) {
3981 		return (ISC_R_NOMEMORY);
3982 	}
3983 
3984 	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
3985 }
3986 
3987 isc_result_t
isc_socket_recv2(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_socketevent_t * event,unsigned int flags)3988 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3989 		 isc_task_t *task, isc_socketevent_t *event,
3990 		 unsigned int flags) {
3991 	event->ev_sender = sock;
3992 	event->result = ISC_R_UNSET;
3993 	event->region = *region;
3994 	event->n = 0;
3995 	event->offset = 0;
3996 	event->attributes = 0;
3997 
3998 	/*
3999 	 * UDP sockets are always partial read.
4000 	 */
4001 	if (sock->type == isc_sockettype_udp) {
4002 		event->minimum = 1;
4003 	} else {
4004 		if (minimum == 0) {
4005 			event->minimum = region->length;
4006 		} else {
4007 			event->minimum = minimum;
4008 		}
4009 	}
4010 
4011 	return (socket_recv(sock, event, task, flags));
4012 }
4013 
/*
 * Start (or queue) a send on 'sock', delivering completion event 'dev'
 * to 'task' when the operation finishes.
 *
 * 'address' supplies the destination for unconnected sockets (see
 * set_dev_address()).  'pktinfo', if non-NULL, carries IPv6 source
 * address/interface selection; the interface index is zeroed for
 * non-link-local, non-site-local destinations so the kernel chooses.
 *
 * For UDP sockets the write is attempted immediately; for other socket
 * types it is attempted only when no earlier sends are queued
 * (preserving ordering).
 *
 * Returns ISC_R_SUCCESS, or ISC_R_INPROGRESS when ISC_SOCKFLAG_IMMEDIATE
 * was requested but the data could not be sent right away.
 */
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		/* Only write now if nothing is already queued ahead of us. */
		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			/* Tell an IMMEDIATE-mode caller that it must wait. */
			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/* NORETRY: treat a soft failure like a completed send. */
		/* FALLTHROUGH */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4115 
4116 isc_result_t
isc_socket_send(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg)4117 isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4118 		isc_taskaction_t action, void *arg) {
4119 	/*
4120 	 * REQUIRE() checking is performed in isc_socket_sendto().
4121 	 */
4122 	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
4123 }
4124 
4125 isc_result_t
isc_socket_sendto(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo)4126 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4127 		  isc_taskaction_t action, void *arg,
4128 		  const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
4129 	isc_socketevent_t *dev;
4130 	isc_socketmgr_t *manager;
4131 
4132 	REQUIRE(VALID_SOCKET(sock));
4133 	REQUIRE(region != NULL);
4134 	REQUIRE(task != NULL);
4135 	REQUIRE(action != NULL);
4136 
4137 	manager = sock->manager;
4138 	REQUIRE(VALID_MANAGER(manager));
4139 
4140 	INSIST(sock->bound);
4141 
4142 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
4143 				   action, arg);
4144 	if (dev == NULL) {
4145 		return (ISC_R_NOMEMORY);
4146 	}
4147 
4148 	dev->region = *region;
4149 
4150 	return (socket_send(sock, dev, task, address, pktinfo, 0));
4151 }
4152 
4153 isc_result_t
isc_socket_sendto2(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo,isc_socketevent_t * event,unsigned int flags)4154 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4155 		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4156 		   isc_socketevent_t *event, unsigned int flags) {
4157 	REQUIRE(VALID_SOCKET(sock));
4158 	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
4159 		0);
4160 	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
4161 		REQUIRE(sock->type == isc_sockettype_udp);
4162 	}
4163 	event->ev_sender = sock;
4164 	event->result = ISC_R_UNSET;
4165 	event->region = *region;
4166 	event->n = 0;
4167 	event->offset = 0;
4168 	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
4169 
4170 	return (socket_send(sock, event, task, address, pktinfo, flags));
4171 }
4172 
/*
 * Clean up a (possibly stale) UNIX-domain socket file at the path in
 * 'sockaddr'.  Does nothing for non-AF_UNIX addresses.
 *
 * If 'active' is true, the caller owns the path and it is simply
 * unlinked.  If 'active' is false, the path is only unlinked after a
 * connect() probe indicates no live server is listening on it
 * (ECONNREFUSED/ECONNRESET); paths that are not sockets or FIFOs are
 * left alone.  All failures are logged, never fatal.
 */
void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		return;
	}

/*
 * Fallback definitions of S_ISSOCK/S_ISFIFO for platforms whose
 * <sys/stat.h> lacks them.
 */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:
			if (active) { /* We exited cleanly last time */
				break;
			}
			/* intentional fallthrough */
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
	} else {
		/* Refuse to touch anything that isn't a socket or FIFO. */
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
	}

	if (active) {
		/* We own this path: remove it unconditionally. */
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/*
	 * Not active: probe the socket with connect() to see whether a
	 * server is still listening before removing the path.
	 */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			/* Nobody is listening: the path is stale; remove it. */
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
	close(s);
#else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
	UNUSED(sockaddr);
	UNUSED(active);
#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
}
4293 
4294 isc_result_t
isc_socket_permunix(const isc_sockaddr_t * sockaddr,uint32_t perm,uint32_t owner,uint32_t group)4295 isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
4296 		    uint32_t owner, uint32_t group) {
4297 #ifdef ISC_PLATFORM_HAVESYSUNH
4298 	isc_result_t result = ISC_R_SUCCESS;
4299 	char strbuf[ISC_STRERRORSIZE];
4300 	char path[sizeof(sockaddr->type.sunix.sun_path)];
4301 #ifdef NEED_SECURE_DIRECTORY
4302 	char *slash;
4303 #endif /* ifdef NEED_SECURE_DIRECTORY */
4304 
4305 	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4306 	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4307 	strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
4308 
4309 #ifdef NEED_SECURE_DIRECTORY
4310 	slash = strrchr(path, '/');
4311 	if (slash != NULL) {
4312 		if (slash != path) {
4313 			*slash = '\0';
4314 		} else {
4315 			strlcpy(path, "/", sizeof(path));
4316 		}
4317 	} else {
4318 		strlcpy(path, ".", sizeof(path));
4319 	}
4320 #endif /* ifdef NEED_SECURE_DIRECTORY */
4321 
4322 	if (chmod(path, perm) < 0) {
4323 		strerror_r(errno, strbuf, sizeof(strbuf));
4324 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4325 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4326 			      "isc_socket_permunix: chmod(%s, %d): %s", path,
4327 			      perm, strbuf);
4328 		result = ISC_R_FAILURE;
4329 	}
4330 	if (chown(path, owner, group) < 0) {
4331 		strerror_r(errno, strbuf, sizeof(strbuf));
4332 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4333 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4334 			      "isc_socket_permunix: chown(%s, %d, %d): %s",
4335 			      path, owner, group, strbuf);
4336 		result = ISC_R_FAILURE;
4337 	}
4338 	return (result);
4339 #else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
4340 	UNUSED(sockaddr);
4341 	UNUSED(perm);
4342 	UNUSED(owner);
4343 	UNUSED(group);
4344 	return (ISC_R_NOTIMPLEMENTED);
4345 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
4346 }
4347 
/*
 * Bind 'sock' to 'sockaddr'.
 *
 * For non-AF_UNIX sockets, when ISC_SOCKET_REUSEADDRESS is requested
 * and a specific (non-zero) port is given, SO_REUSEADDR — and, where
 * available, SO_REUSEPORT / SO_REUSEPORT_LB for kernel receive load
 * balancing — is enabled first; failures there are logged but not
 * fatal.  AF_UNIX sockets skip the socket-option step entirely.
 *
 * Returns ISC_R_SUCCESS, ISC_R_FAMILYMISMATCH, or a result code
 * translated from the bind() errno (ISC_R_NOPERM, ISC_R_ADDRNOTAVAIL,
 * ISC_R_ADDRINUSE, ISC_R_BOUND, or ISC_R_UNEXPECTED).
 */
isc_result_t
isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
		isc_socket_options_t options) {
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	INSIST(!sock->bound);
	INSIST(!sock->dupped);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	if (sock->pf == AF_UNIX) {
		goto bind_socket;
	}
#endif /* ifdef AF_UNIX */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
	{
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
			       (void *)&on, sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#elif defined(__linux__) && defined(SO_REUSEPORT)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#endif		/* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
		/* Press on... */
	}
#ifdef AF_UNIX
bind_socket:
#endif /* ifdef AF_UNIX */
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		/* Map common bind() errors to specific result codes. */
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4429 
4430 /*
4431  * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
4433  * disabled by default.  See RT22589 for details.
4434  */
4435 #undef ENABLE_ACCEPTFILTER
4436 
/*
 * Install a BSD accept filter named 'filter' (e.g. "dataready") on a
 * listening socket, so accept() only returns connections that already
 * have data available.
 *
 * Compiled out unless ENABLE_ACCEPTFILTER is defined (see the note
 * above); returns ISC_R_NOTIMPLEMENTED in that case, ISC_R_FAILURE if
 * setsockopt() rejects the filter, ISC_R_SUCCESS otherwise.
 */
isc_result_t
isc_socket_filter(isc_socket_t *sock, const char *filter) {
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	UNUSED(sock);
	UNUSED(filter);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	bzero(&afa, sizeof(afa));
	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
		       sizeof(afa)) == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION,
			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	return (ISC_R_NOTIMPLEMENTED);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
}
4464 
4465 /*
4466  * Try enabling TCP Fast Open for a given socket if the OS supports it.
4467  */
4468 static void
set_tcp_fastopen(isc_socket_t * sock,unsigned int backlog)4469 set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
4470 #if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
4471 	char strbuf[ISC_STRERRORSIZE];
4472 
4473 /*
4474  * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while also
4475  * shipping a default kernel without TFO support, so we special-case it by
4476  * performing an additional runtime check for TFO support using sysctl to
4477  * prevent setsockopt() errors from being logged.
4478  */
4479 #if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
4480 #define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
4481 	unsigned int enabled;
4482 	size_t enabledlen = sizeof(enabled);
4483 	static bool tfo_notice_logged = false;
4484 
4485 	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
4486 		/*
4487 		 * This kernel does not support TCP Fast Open.  There is
4488 		 * nothing more we can do.
4489 		 */
4490 		return;
4491 	} else if (enabled == 0) {
4492 		/*
4493 		 * This kernel does support TCP Fast Open, but it is disabled
4494 		 * by sysctl.  Notify the user, but do not nag.
4495 		 */
4496 		if (!tfo_notice_logged) {
4497 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4498 				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
4499 				      "TCP_FASTOPEN support is disabled by "
4500 				      "sysctl (" SYSCTL_TFO " = 0)");
4501 			tfo_notice_logged = true;
4502 		}
4503 		return;
4504 	}
4505 #endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */
4506 
4507 #ifdef __APPLE__
4508 	backlog = 1;
4509 #else  /* ifdef __APPLE__ */
4510 	backlog = backlog / 2;
4511 	if (backlog == 0) {
4512 		backlog = 1;
4513 	}
4514 #endif /* ifdef __APPLE__ */
4515 	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
4516 		       sizeof(backlog)) < 0)
4517 	{
4518 		strerror_r(errno, strbuf, sizeof(strbuf));
4519 		UNEXPECTED_ERROR(__FILE__, __LINE__,
4520 				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
4521 				 sock->fd, strbuf);
4522 		/* TCP_FASTOPEN is experimental so ignore failures */
4523 	}
4524 #else  /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
4525 	UNUSED(sock);
4526 	UNUSED(backlog);
4527 #endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
4528 }
4529 
4530 /*
4531  * Set up to listen on a given socket.  We do this by creating an internal
4532  * event that will be dispatched when the socket has read activity.  The
4533  * watcher will send the internal event to the task when there is a new
4534  * connection.
4535  *
4536  * Unlike in read, we don't preallocate a done event here.  Every time there
4537  * is a new connection we'll have to allocate a new one anyway, so we might
4538  * as well keep things simple rather than having to track them.
4539  */
4540 isc_result_t
isc_socket_listen(isc_socket_t * sock,unsigned int backlog)4541 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4542 	char strbuf[ISC_STRERRORSIZE];
4543 
4544 	REQUIRE(VALID_SOCKET(sock));
4545 
4546 	LOCK(&sock->lock);
4547 
4548 	REQUIRE(!sock->listener);
4549 	REQUIRE(sock->bound);
4550 	REQUIRE(sock->type == isc_sockettype_tcp ||
4551 		sock->type == isc_sockettype_unix);
4552 
4553 	if (backlog == 0) {
4554 		backlog = SOMAXCONN;
4555 	}
4556 
4557 	if (listen(sock->fd, (int)backlog) < 0) {
4558 		UNLOCK(&sock->lock);
4559 		strerror_r(errno, strbuf, sizeof(strbuf));
4560 
4561 		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4562 
4563 		return (ISC_R_UNEXPECTED);
4564 	}
4565 
4566 	set_tcp_fastopen(sock, backlog);
4567 
4568 	sock->listener = 1;
4569 
4570 	UNLOCK(&sock->lock);
4571 	return (ISC_R_SUCCESS);
4572 }
4573 
4574 /*
4575  * This should try to do aggressive accept() XXXMLG
4576  */
4577 isc_result_t
isc_socket_accept(isc_socket_t * sock,isc_task_t * task,isc_taskaction_t action,void * arg)4578 isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
4579 		  void *arg) {
4580 	isc_socket_newconnev_t *dev;
4581 	isc_socketmgr_t *manager;
4582 	isc_task_t *ntask = NULL;
4583 	isc_socket_t *nsock;
4584 	isc_result_t result;
4585 	bool do_poke = false;
4586 
4587 	REQUIRE(VALID_SOCKET(sock));
4588 	manager = sock->manager;
4589 	REQUIRE(VALID_MANAGER(manager));
4590 
4591 	LOCK(&sock->lock);
4592 
4593 	REQUIRE(sock->listener);
4594 
4595 	/*
4596 	 * Sender field is overloaded here with the task we will be sending
4597 	 * this event to.  Just before the actual event is delivered the
4598 	 * actual ev_sender will be touched up to be the socket.
4599 	 */
4600 	dev = (isc_socket_newconnev_t *)isc_event_allocate(
4601 		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
4602 		sizeof(*dev));
4603 	ISC_LINK_INIT(dev, ev_link);
4604 
4605 	result = allocate_socket(manager, sock->type, &nsock);
4606 	if (result != ISC_R_SUCCESS) {
4607 		isc_event_free(ISC_EVENT_PTR(&dev));
4608 		UNLOCK(&sock->lock);
4609 		return (result);
4610 	}
4611 
4612 	/*
4613 	 * Attach to socket and to task.
4614 	 */
4615 	isc_task_attach(task, &ntask);
4616 	if (isc_task_exiting(ntask)) {
4617 		free_socket(&nsock);
4618 		isc_task_detach(&ntask);
4619 		isc_event_free(ISC_EVENT_PTR(&dev));
4620 		UNLOCK(&sock->lock);
4621 		return (ISC_R_SHUTTINGDOWN);
4622 	}
4623 	isc_refcount_increment0(&nsock->references);
4624 	nsock->statsindex = sock->statsindex;
4625 
4626 	dev->ev_sender = ntask;
4627 	dev->newsocket = nsock;
4628 
4629 	/*
4630 	 * Poke watcher here.  We still have the socket locked, so there
4631 	 * is no race condition.  We will keep the lock for such a short
4632 	 * bit of time waking it up now or later won't matter all that much.
4633 	 */
4634 	do_poke = ISC_LIST_EMPTY(sock->accept_list);
4635 	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4636 	if (do_poke) {
4637 		select_poke(manager, sock->threadid, sock->fd,
4638 			    SELECT_POKE_ACCEPT);
4639 	}
4640 	UNLOCK(&sock->lock);
4641 	return (ISC_R_SUCCESS);
4642 }
4643 
/*
 * Start a connect() on 'sock' to 'addr', delivering an
 * ISC_SOCKEVENT_CONNECT event via 'action' on 'task' when the attempt
 * resolves.
 *
 * The connect is attempted immediately; if it completes (or the socket
 * was already connected to the same peer) the event is posted right
 * away.  If the kernel reports the operation is in progress, the event
 * is queued on connect_list and the watcher is poked to wait for
 * writability.  Hard errors are translated to ISC_R_* codes and
 * delivered through the event.
 */
isc_result_t
isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, void *arg) {
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc_socketmgr_t *manager;
	int cc;
	char strbuf[ISC_STRERRORSIZE];
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));
	/* NOTE(review): duplicate of the REQUIRE(addr != NULL) above. */
	REQUIRE(addr != NULL);

	if (isc_sockaddr_ismulticast(addr)) {
		return (ISC_R_MULTICAST);
	}

	LOCK(&sock->lock);

	dev = (isc_socket_connev_t *)isc_event_allocate(
		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/* A connect to the same peer is already in flight: just queue. */
	if (sock->connecting) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		goto queue;
	}

	/* Already connected to this peer: report success immediately. */
	if (sock->connected) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		return (ISC_R_SUCCESS);
	}

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * The socket is nonblocking and the connection cannot be
		 * completed immediately.  It is possible to select(2) or
		 * poll(2) for completion by selecting the socket for writing.
		 * After select(2) indicates writability, use getsockopt(2) to
		 * read the SO_ERROR option at level SOL_SOCKET to determine
		 * whether connect() completed successfully (SO_ERROR is zero)
		 * or unsuccessfully (SO_ERROR is one of the usual error codes
		 * listed here, explaining the reason for the failure).
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			cc = 0;
			goto success;
		}
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			goto queue;
		}

		/* Translate known connect() errnos into ISC_R_* results. */
		switch (errno) {
#define ERROR_MATCH(a, b)        \
	case a:                  \
		dev->result = b; \
		goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		/* Unrecognized errno: log it and fail the call itself. */
		sock->connected = 0;

		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
				 addrbuf, errno, strbuf);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		/*
		 * Known failure: deliver the translated result through the
		 * event and report ISC_R_SUCCESS for the call itself.
		 */
		sock->connected = 0;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);

		return (ISC_R_SUCCESS);
	}

queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
	if (do_poke && !sock->connecting) {
		sock->connecting = 1;
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_CONNECT);
	}

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4802 
4803 /*
4804  * Called when a socket with a pending connect() finishes.
4805  */
4806 static void
internal_connect(isc_socket_t * sock)4807 internal_connect(isc_socket_t *sock) {
4808 	isc_socket_connev_t *dev;
4809 	int cc;
4810 	isc_result_t result;
4811 	socklen_t optlen;
4812 	char strbuf[ISC_STRERRORSIZE];
4813 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
4814 
4815 	INSIST(VALID_SOCKET(sock));
4816 	REQUIRE(sock->fd >= 0);
4817 
4818 	/*
4819 	 * Get the first item off the connect list.
4820 	 * If it is empty, unlock the socket and return.
4821 	 */
4822 	dev = ISC_LIST_HEAD(sock->connect_list);
4823 	if (dev == NULL) {
4824 		INSIST(!sock->connecting);
4825 		goto finish;
4826 	}
4827 
4828 	INSIST(sock->connecting);
4829 	sock->connecting = 0;
4830 
4831 	/*
4832 	 * Get any possible error status here.
4833 	 */
4834 	optlen = sizeof(cc);
4835 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
4836 		       (void *)&optlen) != 0)
4837 	{
4838 		cc = errno;
4839 	} else {
4840 		errno = cc;
4841 	}
4842 
4843 	if (errno != 0) {
4844 		/*
4845 		 * If the error is EAGAIN, just re-select on this
4846 		 * fd and pretend nothing strange happened.
4847 		 */
4848 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4849 			sock->connecting = 1;
4850 			return;
4851 		}
4852 
4853 		inc_stats(sock->manager->stats,
4854 			  sock->statsindex[STATID_CONNECTFAIL]);
4855 
4856 		/*
4857 		 * Translate other errors into ISC_R_* flavors.
4858 		 */
4859 		switch (errno) {
4860 #define ERROR_MATCH(a, b)   \
4861 	case a:             \
4862 		result = b; \
4863 		break;
4864 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
4865 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4866 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4867 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4868 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4869 #ifdef EHOSTDOWN
4870 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4871 #endif /* ifdef EHOSTDOWN */
4872 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4873 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4874 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4875 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4876 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4877 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4878 #undef ERROR_MATCH
4879 		default:
4880 			result = ISC_R_UNEXPECTED;
4881 			isc_sockaddr_format(&sock->peer_address, peerbuf,
4882 					    sizeof(peerbuf));
4883 			strerror_r(errno, strbuf, sizeof(strbuf));
4884 			UNEXPECTED_ERROR(__FILE__, __LINE__,
4885 					 "internal_connect: connect(%s) %s",
4886 					 peerbuf, strbuf);
4887 		}
4888 	} else {
4889 		inc_stats(sock->manager->stats,
4890 			  sock->statsindex[STATID_CONNECT]);
4891 		result = ISC_R_SUCCESS;
4892 		sock->connected = 1;
4893 		sock->bound = 1;
4894 	}
4895 
4896 	do {
4897 		dev->result = result;
4898 		send_connectdone_event(sock, &dev);
4899 		dev = ISC_LIST_HEAD(sock->connect_list);
4900 	} while (dev != NULL);
4901 
4902 finish:
4903 	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
4904 		   SELECT_POKE_CONNECT);
4905 }
4906 
4907 isc_result_t
isc_socket_getpeername(isc_socket_t * sock,isc_sockaddr_t * addressp)4908 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4909 	isc_result_t result;
4910 
4911 	REQUIRE(VALID_SOCKET(sock));
4912 	REQUIRE(addressp != NULL);
4913 
4914 	LOCK(&sock->lock);
4915 
4916 	if (sock->connected) {
4917 		*addressp = sock->peer_address;
4918 		result = ISC_R_SUCCESS;
4919 	} else {
4920 		result = ISC_R_NOTCONNECTED;
4921 	}
4922 
4923 	UNLOCK(&sock->lock);
4924 
4925 	return (result);
4926 }
4927 
4928 isc_result_t
isc_socket_getsockname(isc_socket_t * sock,isc_sockaddr_t * addressp)4929 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4930 	socklen_t len;
4931 	isc_result_t result;
4932 	char strbuf[ISC_STRERRORSIZE];
4933 
4934 	REQUIRE(VALID_SOCKET(sock));
4935 	REQUIRE(addressp != NULL);
4936 
4937 	LOCK(&sock->lock);
4938 
4939 	if (!sock->bound) {
4940 		result = ISC_R_NOTBOUND;
4941 		goto out;
4942 	}
4943 
4944 	result = ISC_R_SUCCESS;
4945 
4946 	len = sizeof(addressp->type);
4947 	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
4948 		strerror_r(errno, strbuf, sizeof(strbuf));
4949 		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
4950 		result = ISC_R_UNEXPECTED;
4951 		goto out;
4952 	}
4953 	addressp->length = (unsigned int)len;
4954 
4955 out:
4956 	UNLOCK(&sock->lock);
4957 
4958 	return (result);
4959 }
4960 
/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask of
 * ISC_SOCKCANCEL_* flags; a NULL "task" cancels matching events for
 * every task.
 */
void
isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0) {
		return;
	}

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
	    !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			/*
			 * Save the successor first: posting the done event
			 * unlinks 'dev' from recv_list.
			 */
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
	    !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			/* As above: 'dev' is unlinked by the send below. */
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
	    !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				/*
				 * The pre-allocated child socket was never
				 * delivered to the application, so drop its
				 * reference (asserting it reaches zero) and
				 * destroy it before posting the event.
				 */
				isc_refcount_decrementz(
					&NEWCONNSOCK(dev)->references);
				free_socket((isc_socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendtoanddetach(&current_task,
							 ISC_EVENT_PTR(&dev),
							 sock->threadid);
			}

			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
	    !ISC_LIST_EMPTY(sock->connect_list))
	{
		isc_socket_connev_t *dev;
		isc_socket_connev_t *next;
		isc_task_t *current_task;

		/*
		 * A non-empty connect_list implies a connect is in
		 * progress; clear the flag since all pending connect
		 * events for the cancelled task(s) are being retired.
		 */
		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = ISC_LIST_HEAD(sock->connect_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			/* 'dev' is unlinked by send_connectdone_event(). */
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_connectdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	UNLOCK(&sock->lock);
}
5085 
5086 isc_sockettype_t
isc_socket_gettype(isc_socket_t * sock)5087 isc_socket_gettype(isc_socket_t *sock) {
5088 	REQUIRE(VALID_SOCKET(sock));
5089 
5090 	return (sock->type);
5091 }
5092 
5093 void
isc_socket_ipv6only(isc_socket_t * sock,bool yes)5094 isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
5095 #if defined(IPV6_V6ONLY)
5096 	int onoff = yes ? 1 : 0;
5097 #else  /* if defined(IPV6_V6ONLY) */
5098 	UNUSED(yes);
5099 	UNUSED(sock);
5100 #endif /* if defined(IPV6_V6ONLY) */
5101 
5102 	REQUIRE(VALID_SOCKET(sock));
5103 	INSIST(!sock->dupped);
5104 
5105 #ifdef IPV6_V6ONLY
5106 	if (sock->pf == AF_INET6) {
5107 		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5108 			       (void *)&onoff, sizeof(int)) < 0)
5109 		{
5110 			char strbuf[ISC_STRERRORSIZE];
5111 			strerror_r(errno, strbuf, sizeof(strbuf));
5112 			UNEXPECTED_ERROR(__FILE__, __LINE__,
5113 					 "setsockopt(%d, IPV6_V6ONLY) failed: "
5114 					 "%s",
5115 					 sock->fd, strbuf);
5116 		}
5117 	}
5118 #endif /* ifdef IPV6_V6ONLY */
5119 }
5120 
5121 static void
setdscp(isc_socket_t * sock,isc_dscp_t dscp)5122 setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
5123 #if defined(IP_TOS) || defined(IPV6_TCLASS)
5124 	int value = dscp << 2;
5125 #endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
5126 
5127 	sock->dscp = dscp;
5128 
5129 #ifdef IP_TOS
5130 	if (sock->pf == AF_INET) {
5131 		if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
5132 			       sizeof(value)) < 0) {
5133 			char strbuf[ISC_STRERRORSIZE];
5134 			strerror_r(errno, strbuf, sizeof(strbuf));
5135 			UNEXPECTED_ERROR(__FILE__, __LINE__,
5136 					 "setsockopt(%d, IP_TOS, %.02x) "
5137 					 "failed: %s",
5138 					 sock->fd, value >> 2, strbuf);
5139 		}
5140 	}
5141 #endif /* ifdef IP_TOS */
5142 #ifdef IPV6_TCLASS
5143 	if (sock->pf == AF_INET6) {
5144 		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
5145 			       (void *)&value, sizeof(value)) < 0)
5146 		{
5147 			char strbuf[ISC_STRERRORSIZE];
5148 			strerror_r(errno, strbuf, sizeof(strbuf));
5149 			UNEXPECTED_ERROR(__FILE__, __LINE__,
5150 					 "setsockopt(%d, IPV6_TCLASS, %.02x) "
5151 					 "failed: %s",
5152 					 sock->fd, dscp >> 2, strbuf);
5153 		}
5154 	}
5155 #endif /* ifdef IPV6_TCLASS */
5156 }
5157 
void
isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
	/*
	 * Set the DSCP (differentiated services code point) for 'sock'.
	 */
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40); /* DSCP is a 6-bit value. */

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else  /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	/* A negative value means "no DSCP configured": nothing to do. */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

#ifdef notyet
	REQUIRE(!sock->dupped);
#endif /* ifdef notyet */

	setdscp(sock, dscp);
}
5182 
5183 isc_socketevent_t *
isc_socket_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)5184 isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
5185 		       isc_taskaction_t action, void *arg) {
5186 	return (allocate_socketevent(mctx, sender, eventtype, action, arg));
5187 }
5188 
5189 void
isc_socket_setname(isc_socket_t * sock,const char * name,void * tag)5190 isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
5191 	/*
5192 	 * Name 'sock'.
5193 	 */
5194 
5195 	REQUIRE(VALID_SOCKET(sock));
5196 
5197 	LOCK(&sock->lock);
5198 	strlcpy(sock->name, name, sizeof(sock->name));
5199 	sock->tag = tag;
5200 	UNLOCK(&sock->lock);
5201 }
5202 
5203 const char *
isc_socket_getname(isc_socket_t * sock)5204 isc_socket_getname(isc_socket_t *sock) {
5205 	return (sock->name);
5206 }
5207 
5208 void *
isc_socket_gettag(isc_socket_t * sock)5209 isc_socket_gettag(isc_socket_t *sock) {
5210 	return (sock->tag);
5211 }
5212 
5213 int
isc_socket_getfd(isc_socket_t * sock)5214 isc_socket_getfd(isc_socket_t *sock) {
5215 	return ((short)sock->fd);
5216 }
5217 
5218 static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
5219 static bool hasreuseport = false;
5220 
5221 static void
init_hasreuseport()5222 init_hasreuseport() {
5223 /*
5224  * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
5225  * We only want to use it on Linux, if it's available. On BSD we want to dup()
5226  * sockets instead of re-binding them.
5227  */
5228 #if (defined(SO_REUSEPORT) && defined(__linux__)) || \
5229 	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
5230 	int sock, yes = 1;
5231 	sock = socket(AF_INET, SOCK_DGRAM, 0);
5232 	if (sock < 0) {
5233 		sock = socket(AF_INET6, SOCK_DGRAM, 0);
5234 		if (sock < 0) {
5235 			return;
5236 		}
5237 	}
5238 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
5239 		       sizeof(yes)) < 0) {
5240 		close(sock);
5241 		return;
5242 #if defined(__FreeBSD_kernel__)
5243 	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
5244 			      sizeof(yes)) < 0)
5245 #else  /* if defined(__FreeBSD_kernel__) */
5246 	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
5247 			      sizeof(yes)) < 0)
5248 #endif /* if defined(__FreeBSD_kernel__) */
5249 	{
5250 		close(sock);
5251 		return;
5252 	}
5253 	hasreuseport = true;
5254 	close(sock);
5255 #endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
5256 	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
5257 }
5258 
5259 bool
isc_socket_hasreuseport()5260 isc_socket_hasreuseport() {
5261 	RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
5262 		      ISC_R_SUCCESS);
5263 	return (hasreuseport);
5264 }
5265 
#if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
/*
 * Map a socket type to the label used in statistics-channel output.
 */
static const char *
_socktype(isc_sockettype_t type) {
	if (type == isc_sockettype_udp) {
		return ("udp");
	} else if (type == isc_sockettype_tcp) {
		return ("tcp");
	} else if (type == isc_sockettype_unix) {
		return ("unix");
	} else {
		return ("not-initialized");
	}
}
#endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
5281 
#ifdef HAVE_LIBXML2
/*
 * Evaluate an xmlTextWriter call; on a negative (error) return code,
 * jump to the cleanup label with the code left in 'xmlrc'.
 */
#define TRY0(a)                     \
	do {                        \
		xmlrc = (a);        \
		if (xmlrc < 0)      \
			goto error; \
	} while (0)
/*
 * Render the socket manager's socket list as XML for the statistics
 * channel.  Returns the last xmlTextWriter return code; negative on
 * error.  'writer0' is an xmlTextWriterPtr passed as void* to keep
 * libxml2 types out of the public header.
 */
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	/* Assigned by the first TRY0 below before any jump to 'error',
	 * so it is never read uninitialized. */
	int xmlrc;
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		/* Only emit <name> if one was set via isc_socket_setname(). */
		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR peerbuf));
		}

		/* Local address comes from the kernel; silently skipped
		 * if getsockname() fails (e.g. unbound socket). */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR peerbuf));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	/*
	 * Reached on success (sock == NULL after the loop) and on TRY0
	 * failure; in the latter case a mid-loop failure arrives with
	 * 'sock' still locked, so release it before the manager lock.
	 */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
#endif /* HAVE_LIBXML2 */
5386 
#ifdef HAVE_JSON_C
/*
 * Bail out to the cleanup label with ISC_R_NOMEMORY when a json-c
 * allocation returns NULL.
 */
#define CHECKMEM(m)                              \
	do {                                     \
		if (m == NULL) {                 \
			result = ISC_R_NOMEMORY; \
			goto error;              \
		}                                \
	} while (0)

/*
 * Render the socket manager's socket list as a JSON array attached to
 * 'stats0' (a json_object *, passed as void* to keep json-c types out
 * of the public header) under the key "sockets".  Returns
 * ISC_R_SUCCESS or ISC_R_NOMEMORY.
 */
isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
	isc_result_t result = ISC_R_SUCCESS;
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	json_object *obj, *array = json_object_new_array();
	json_object *stats = (json_object *)stats0;

	CHECKMEM(array);

	LOCK(&mgr->lock);

	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		json_object *states, *entry = json_object_new_object();
		char buf[255];

		CHECKMEM(entry);
		/* The array takes ownership of 'entry' here, so freeing
		 * 'array' on the error path also frees every entry. */
		json_object_array_add(array, entry);

		LOCK(&sock->lock);

		/* Socket identity: its pointer value formatted as text. */
		snprintf(buf, sizeof(buf), "%p", sock);
		obj = json_object_new_string(buf);
		CHECKMEM(obj);
		json_object_object_add(entry, "id", obj);

		/* Only emit "name" if one was set via isc_socket_setname(). */
		if (sock->name[0] != 0) {
			obj = json_object_new_string(sock->name);
			CHECKMEM(obj);
			json_object_object_add(entry, "name", obj);
		}

		obj = json_object_new_int(
			(int)isc_refcount_current(&sock->references));
		CHECKMEM(obj);
		json_object_object_add(entry, "references", obj);

		obj = json_object_new_string(_socktype(sock->type));
		CHECKMEM(obj);
		json_object_object_add(entry, "type", obj);

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "peer-address", obj);
		}

		/* Local address comes from the kernel; silently skipped
		 * if getsockname() fails (e.g. unbound socket). */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "local-address", obj);
		}

		states = json_object_new_array();
		CHECKMEM(states);
		json_object_object_add(entry, "states", states);

		if (sock->listener) {
			obj = json_object_new_string("listener");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connected) {
			obj = json_object_new_string("connected");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connecting) {
			obj = json_object_new_string("connecting");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->bound) {
			obj = json_object_new_string("bound");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}

	/* Ownership of 'array' passes to 'stats'; clear the local so the
	 * error-path cleanup below does not double-free it. */
	json_object_object_add(stats, "sockets", array);
	array = NULL;
	result = ISC_R_SUCCESS;

error:
	if (array != NULL) {
		json_object_put(array);
	}

	/*
	 * A mid-loop CHECKMEM failure arrives here with 'sock' still
	 * locked; release it before the manager lock.
	 */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (result);
}
#endif /* HAVE_JSON_C */
5506