1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  *
8  * See the COPYRIGHT file distributed with this work for additional
9  * information regarding copyright ownership.
10  */
11 
12 /*! \file */
13 
14 #include <inttypes.h>
15 #include <stdbool.h>
16 #include <sys/param.h>
17 #include <sys/socket.h>
18 #include <sys/stat.h>
19 #include <sys/types.h>
20 #if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
21 #include <sys/sysctl.h>
22 #endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
23 #include <sys/time.h>
24 #include <sys/uio.h>
25 
26 #if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
27 #include <linux/netlink.h>
28 #include <linux/rtnetlink.h>
29 #endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
30 	*/
31 
32 #include <errno.h>
33 #include <fcntl.h>
34 #include <stddef.h>
35 #include <stdlib.h>
36 #include <unistd.h>
37 
38 #include <isc/app.h>
39 #include <isc/buffer.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/mutex.h>
46 #include <isc/net.h>
47 #include <isc/once.h>
48 #include <isc/platform.h>
49 #include <isc/print.h>
50 #include <isc/refcount.h>
51 #include <isc/region.h>
52 #include <isc/resource.h>
53 #include <isc/socket.h>
54 #include <isc/stats.h>
55 #include <isc/strerr.h>
56 #include <isc/string.h>
57 #include <isc/task.h>
58 #include <isc/thread.h>
59 #include <isc/util.h>
60 
61 #ifdef ISC_PLATFORM_HAVESYSUNH
62 #include <sys/un.h>
63 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
64 #ifdef HAVE_KQUEUE
65 #include <sys/event.h>
66 #endif /* ifdef HAVE_KQUEUE */
67 #ifdef HAVE_EPOLL_CREATE1
68 #include <sys/epoll.h>
69 #endif /* ifdef HAVE_EPOLL_CREATE1 */
70 #if defined(HAVE_SYS_DEVPOLL_H)
71 #include <sys/devpoll.h>
72 #elif defined(HAVE_DEVPOLL_H)
73 #include <devpoll.h>
74 #endif /* if defined(HAVE_SYS_DEVPOLL_H) */
75 
76 #include <netinet/tcp.h>
77 
78 #include "errno2result.h"
79 
80 #ifdef ENABLE_TCP_FASTOPEN
81 #include <netinet/tcp.h>
82 #endif /* ifdef ENABLE_TCP_FASTOPEN */
83 
84 #ifdef HAVE_JSON_C
85 #include <json_object.h>
86 #endif /* HAVE_JSON_C */
87 
88 #ifdef HAVE_LIBXML2
89 #include <libxml/xmlwriter.h>
90 #define ISC_XMLCHAR (const xmlChar *)
91 #endif /* HAVE_LIBXML2 */
92 
93 /*%
94  * Choose the most preferable multiplex method.
95  */
96 #if defined(HAVE_KQUEUE)
97 #define USE_KQUEUE
98 #elif defined(HAVE_EPOLL_CREATE1)
99 #define USE_EPOLL
100 #elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
101 #define USE_DEVPOLL
102 typedef struct {
103 	unsigned int want_read : 1, want_write : 1;
104 } pollinfo_t;
105 #else /* if defined(HAVE_KQUEUE) */
106 #define USE_SELECT
107 #endif /* HAVE_KQUEUE */
108 
109 /*
110  * Set by the -T dscp option on the command line. If set to a value
111  * other than -1, we check to make sure DSCP values match it, and
112  * assert if not.
113  */
114 int isc_dscp_check_value = -1;
115 
116 /*%
117  * Maximum number of allowable open sockets.  This is also the maximum
118  * allowable socket file descriptor.
119  *
120  * Care should be taken before modifying this value for select():
 121  * The API standard doesn't ensure select() accepts more than (the system default
122  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
123  * the vast majority of cases.  This constant should therefore be increased only
124  * when absolutely necessary and possible, i.e., the server is exhausting all
125  * available file descriptors (up to FD_SETSIZE) and the select() function
126  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 127  * always be true, but we keep using some of them to ensure as much
128  * portability as possible).  Note also that overall server performance
129  * may be rather worsened with a larger value of this constant due to
130  * inherent scalability problems of select().
131  *
132  * As a special note, this value shouldn't have to be touched if
133  * this is a build for an authoritative only DNS server.
134  */
135 #ifndef ISC_SOCKET_MAXSOCKETS
136 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
137 #ifdef TUNE_LARGE
138 #define ISC_SOCKET_MAXSOCKETS 21000
139 #else /* ifdef TUNE_LARGE */
140 #define ISC_SOCKET_MAXSOCKETS 4096
141 #endif /* TUNE_LARGE */
142 #elif defined(USE_SELECT)
143 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
144 #endif /* USE_KQUEUE... */
145 #endif /* ISC_SOCKET_MAXSOCKETS */
146 
147 #ifdef USE_SELECT
148 /*%
149  * Mac OS X needs a special definition to support larger values in select().
150  * We always define this because a larger value can be specified run-time.
151  */
152 #ifdef __APPLE__
153 #define _DARWIN_UNLIMITED_SELECT
154 #endif /* __APPLE__ */
155 #endif /* USE_SELECT */
156 
157 #ifdef ISC_SOCKET_USE_POLLWATCH
158 /*%
159  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
160  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
161  * some of the specified FD.  The idea is based on the observation that it's
162  * likely for a busy server to keep receiving packets.  It specifically works
163  * as follows: the socket watcher is first initialized with the state of
164  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
165  * event occurs.  When it wakes up for a socket I/O event, it moves to the
166  * poll_active state, and sets the poll timeout to a short period
167  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
168  * watcher goes to the poll_checking state with the same timeout period.
169  * In this state, the watcher tries to detect whether this is a break
170  * during intermittent events or the kernel bug is triggered.  If the next
171  * polling reports an event within the short period, the previous timeout is
172  * likely to be a kernel bug, and so the watcher goes back to the active state.
173  * Otherwise, it moves to the idle state again.
174  *
175  * It's not clear whether this is a thread-related bug, but since we've only
176  * seen this with threads, this workaround is used only when enabling threads.
177  */
178 
179 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
180 
181 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
182 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
183 #endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
184 #endif /* ISC_SOCKET_USE_POLLWATCH */
185 
186 /*%
187  * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
188  */
189 #define FDLOCK_BITS  10
190 #define FDLOCK_COUNT (1 << FDLOCK_BITS)
191 #define FDLOCK_ID(fd)                                   \
192 	(((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
193 	 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
194 
195 /*%
196  * Maximum number of events communicated with the kernel.  There should normally
197  * be no need for having a large number.
198  */
199 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
200 #ifndef ISC_SOCKET_MAXEVENTS
201 #ifdef TUNE_LARGE
202 #define ISC_SOCKET_MAXEVENTS 2048
203 #else /* ifdef TUNE_LARGE */
204 #define ISC_SOCKET_MAXEVENTS 64
205 #endif /* TUNE_LARGE */
206 #endif /* ifndef ISC_SOCKET_MAXEVENTS */
207 #endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
208 	* */
209 
210 /*%
211  * Some systems define the socket length argument as an int, some as size_t,
212  * some as socklen_t.  This is here so it can be easily changed if needed.
213  */
214 #ifndef socklen_t
215 #define socklen_t unsigned int
216 #endif /* ifndef socklen_t */
217 
218 /*%
219  * Define what the possible "soft" errors can be.  These are non-fatal returns
220  * of various network related functions, like recv() and so on.
221  *
222  * For some reason, BSDI (and perhaps others) will sometimes return <0
223  * from recv() but will have errno==0.  This is broken, but we have to
224  * work around it here.
225  */
226 #define SOFT_ERROR(e)                                             \
227 	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
228 	 (e) == EINTR || (e) == 0)
229 
230 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
231 
232 /*!<
233  * DLVL(90)  --  Function entry/exit and other tracing.
234  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
235  * DLVL(60)  --  Socket data send/receive
236  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
237  * DLVL(20)  --  Socket creation/destruction.
238  */
239 #define TRACE_LEVEL	  90
240 #define CORRECTNESS_LEVEL 70
241 #define IOEVENT_LEVEL	  60
242 #define EVENT_LEVEL	  50
243 #define CREATION_LEVEL	  20
244 
245 #define TRACE	    DLVL(TRACE_LEVEL)
246 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
247 #define IOEVENT	    DLVL(IOEVENT_LEVEL)
248 #define EVENT	    DLVL(EVENT_LEVEL)
249 #define CREATION    DLVL(CREATION_LEVEL)
250 
251 typedef isc_event_t intev_t;
252 
253 #define SOCKET_MAGIC	ISC_MAGIC('I', 'O', 'i', 'o')
254 #define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
255 
256 /*!
257  * IPv6 control information.  If the socket is an IPv6 socket we want
258  * to collect the destination address and interface so the client can
259  * set them on outgoing packets.
260  */
261 #ifndef USE_CMSG
262 #define USE_CMSG 1
263 #endif /* ifndef USE_CMSG */
264 
265 /*%
266  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
267  * a setsockopt() like interface to request timestamps, and if the OS
268  * doesn't do it for us, call gettimeofday() on every UDP receive?
269  */
270 #ifdef SO_TIMESTAMP
271 #ifndef USE_CMSG
272 #define USE_CMSG 1
273 #endif /* ifndef USE_CMSG */
274 #endif /* ifdef SO_TIMESTAMP */
275 
276 #if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
277 #define SET_RCVBUF
278 #endif
279 
280 #if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
281 #define SET_SNDBUF
282 #endif
283 
284 /*%
285  * Instead of calculating the cmsgbuf lengths every time we take
286  * a rule of thumb approach - sizes are taken from x86_64 linux,
287  * multiplied by 2, everything should fit. Those sizes are not
288  * large enough to cause any concern.
289  */
290 #if defined(USE_CMSG)
291 #define CMSG_SP_IN6PKT 40
292 #else /* if defined(USE_CMSG) */
293 #define CMSG_SP_IN6PKT 0
294 #endif /* if defined(USE_CMSG) */
295 
296 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
297 #define CMSG_SP_TIMESTAMP 32
298 #else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
299 #define CMSG_SP_TIMESTAMP 0
300 #endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
301 
302 #if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
303 #define CMSG_SP_TCTOS 24
304 #else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
305 #define CMSG_SP_TCTOS 0
306 #endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
307 
308 #define CMSG_SP_INT 24
309 
310 /* Align cmsg buffers to be safe on SPARC etc. */
311 #define RECVCMSGBUFLEN                                                       \
312 	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
313 			  1,                                                 \
314 		  sizeof(void *))
315 #define SENDCMSGBUFLEN                                                    \
316 	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
317 		  sizeof(void *))
318 
319 /*%
320  * The number of times a send operation is repeated if the result is EINTR.
321  */
322 #define NRETRIES 10
323 
324 typedef struct isc__socket isc__socket_t;
325 typedef struct isc__socketmgr isc__socketmgr_t;
326 typedef struct isc__socketthread isc__socketthread_t;
327 
328 #define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
329 
330 struct isc__socket {
331 	/* Not locked. */
332 	isc_socket_t common;
333 	isc__socketmgr_t *manager;
334 	isc_mutex_t lock;
335 	isc_sockettype_t type;
336 	const isc_statscounter_t *statsindex;
337 	isc_refcount_t references;
338 
339 	/* Locked by socket lock. */
340 	ISC_LINK(isc__socket_t) link;
341 	int fd;
342 	int pf;
343 	int threadid;
344 	char name[16];
345 	void *tag;
346 
347 	ISC_LIST(isc_socketevent_t) send_list;
348 	ISC_LIST(isc_socketevent_t) recv_list;
349 	ISC_LIST(isc_socket_newconnev_t) accept_list;
350 	ISC_LIST(isc_socket_connev_t) connect_list;
351 
352 	isc_sockaddr_t peer_address; /* remote address */
353 
354 	unsigned int listener : 1,	       /* listener socket */
355 		connected : 1, connecting : 1, /* connect pending
356 						* */
357 		bound : 1,		       /* bound to local addr */
358 		dupped : 1, active : 1,	       /* currently active */
359 		pktdscp : 1;		       /* per packet dscp */
360 
361 #ifdef ISC_PLATFORM_RECVOVERFLOW
362 	unsigned char overflow; /* used for MSG_TRUNC fake */
363 #endif				/* ifdef ISC_PLATFORM_RECVOVERFLOW */
364 
365 	unsigned int dscp;
366 };
367 
368 #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
369 #define VALID_MANAGER(m)     ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
370 
371 struct isc__socketmgr {
372 	/* Not locked. */
373 	isc_socketmgr_t common;
374 	isc_mem_t *mctx;
375 	isc_mutex_t lock;
376 	isc_stats_t *stats;
377 	int nthreads;
378 	isc__socketthread_t *threads;
379 	unsigned int maxsocks;
380 	/* Locked by manager lock. */
381 	ISC_LIST(isc__socket_t) socklist;
382 	int reserved; /* unlocked */
383 	isc_condition_t shutdown_ok;
384 	size_t maxudp;
385 };
386 
387 struct isc__socketthread {
388 	isc__socketmgr_t *manager;
389 	int threadid;
390 	isc_thread_t thread;
391 	int pipe_fds[2];
392 	isc_mutex_t *fdlock;
393 	/* Locked by fdlock. */
394 	isc__socket_t **fds;
395 	int *fdstate;
396 #ifdef USE_KQUEUE
397 	int kqueue_fd;
398 	int nevents;
399 	struct kevent *events;
400 #endif /* USE_KQUEUE */
401 #ifdef USE_EPOLL
402 	int epoll_fd;
403 	int nevents;
404 	struct epoll_event *events;
405 	uint32_t *epoll_events;
406 #endif /* USE_EPOLL */
407 #ifdef USE_DEVPOLL
408 	int devpoll_fd;
409 	isc_resourcevalue_t open_max;
410 	unsigned int calls;
411 	int nevents;
412 	struct pollfd *events;
413 	pollinfo_t *fdpollinfo;
414 #endif /* USE_DEVPOLL */
415 #ifdef USE_SELECT
416 	int fd_bufsize;
417 	fd_set *read_fds;
418 	fd_set *read_fds_copy;
419 	fd_set *write_fds;
420 	fd_set *write_fds_copy;
421 	int maxfd;
422 #endif /* USE_SELECT */
423 };
424 
425 #define CLOSED	      0 /* this one must be zero */
426 #define MANAGED	      1
427 #define CLOSE_PENDING 2
428 
429 /*
430  * send() and recv() iovec counts
431  */
432 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
433 #ifdef ISC_PLATFORM_RECVOVERFLOW
434 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
435 #else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
436 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
437 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
438 
439 static isc_result_t
440 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
441 	      isc_socket_t **socketp, isc_socket_t *dup_socket);
442 static void
443 send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
444 static void
445 send_senddone_event(isc__socket_t *, isc_socketevent_t **);
446 static void
447 send_connectdone_event(isc__socket_t *, isc_socket_connev_t **);
448 static void
449 free_socket(isc__socket_t **);
450 static isc_result_t
451 allocate_socket(isc__socketmgr_t *, isc_sockettype_t, isc__socket_t **);
452 static void
453 destroy(isc__socket_t **);
454 static void
455 internal_accept(isc__socket_t *);
456 static void
457 internal_connect(isc__socket_t *);
458 static void
459 internal_recv(isc__socket_t *);
460 static void
461 internal_send(isc__socket_t *);
462 static void
463 process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
464 static void
465 build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *, struct msghdr *,
466 		  struct iovec *, size_t *);
467 static void
468 build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *, struct msghdr *,
469 		  struct iovec *, size_t *);
470 static bool
471 process_ctlfd(isc__socketthread_t *thread);
472 static void
473 setdscp(isc__socket_t *sock, isc_dscp_t dscp);
474 
475 #define SELECT_POKE_SHUTDOWN (-1)
476 #define SELECT_POKE_NOTHING  (-2)
477 #define SELECT_POKE_READ     (-3)
478 #define SELECT_POKE_ACCEPT   (-3) /*%< Same as _READ */
479 #define SELECT_POKE_WRITE    (-4)
480 #define SELECT_POKE_CONNECT  (-4) /*%< Same as _WRITE */
481 #define SELECT_POKE_CLOSE    (-5)
482 
483 /*%
484  * Shortcut index arrays to get access to statistics counters.
485  */
486 enum { STATID_OPEN = 0,
487        STATID_OPENFAIL = 1,
488        STATID_CLOSE = 2,
489        STATID_BINDFAIL = 3,
490        STATID_CONNECTFAIL = 4,
491        STATID_CONNECT = 5,
492        STATID_ACCEPTFAIL = 6,
493        STATID_ACCEPT = 7,
494        STATID_SENDFAIL = 8,
495        STATID_RECVFAIL = 9,
496        STATID_ACTIVE = 10 };
497 static const isc_statscounter_t udp4statsindex[] = {
498 	isc_sockstatscounter_udp4open,
499 	isc_sockstatscounter_udp4openfail,
500 	isc_sockstatscounter_udp4close,
501 	isc_sockstatscounter_udp4bindfail,
502 	isc_sockstatscounter_udp4connectfail,
503 	isc_sockstatscounter_udp4connect,
504 	-1,
505 	-1,
506 	isc_sockstatscounter_udp4sendfail,
507 	isc_sockstatscounter_udp4recvfail,
508 	isc_sockstatscounter_udp4active
509 };
510 static const isc_statscounter_t udp6statsindex[] = {
511 	isc_sockstatscounter_udp6open,
512 	isc_sockstatscounter_udp6openfail,
513 	isc_sockstatscounter_udp6close,
514 	isc_sockstatscounter_udp6bindfail,
515 	isc_sockstatscounter_udp6connectfail,
516 	isc_sockstatscounter_udp6connect,
517 	-1,
518 	-1,
519 	isc_sockstatscounter_udp6sendfail,
520 	isc_sockstatscounter_udp6recvfail,
521 	isc_sockstatscounter_udp6active
522 };
523 static const isc_statscounter_t tcp4statsindex[] = {
524 	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
525 	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
526 	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
527 	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
528 	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
529 	isc_sockstatscounter_tcp4active
530 };
531 static const isc_statscounter_t tcp6statsindex[] = {
532 	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
533 	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
534 	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
535 	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
536 	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
537 	isc_sockstatscounter_tcp6active
538 };
539 static const isc_statscounter_t unixstatsindex[] = {
540 	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
541 	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
542 	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
543 	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
544 	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
545 	isc_sockstatscounter_unixactive
546 };
547 static const isc_statscounter_t rawstatsindex[] = {
548 	isc_sockstatscounter_rawopen,
549 	isc_sockstatscounter_rawopenfail,
550 	isc_sockstatscounter_rawclose,
551 	-1,
552 	-1,
553 	-1,
554 	-1,
555 	-1,
556 	-1,
557 	isc_sockstatscounter_rawrecvfail,
558 	isc_sockstatscounter_rawactive
559 };
560 
561 static int
562 gen_threadid(isc__socket_t *sock);
563 
564 static int
gen_threadid(isc__socket_t * sock)565 gen_threadid(isc__socket_t *sock) {
566 	return (sock->fd % sock->manager->nthreads);
567 }
568 
569 static void
570 manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
571 	    isc_logmodule_t *module, int level, const char *fmt, ...)
572 	ISC_FORMAT_PRINTF(5, 6);
573 static void
manager_log(isc__socketmgr_t * sockmgr,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)574 manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
575 	    isc_logmodule_t *module, int level, const char *fmt, ...) {
576 	char msgbuf[2048];
577 	va_list ap;
578 
579 	if (!isc_log_wouldlog(isc_lctx, level)) {
580 		return;
581 	}
582 
583 	va_start(ap, fmt);
584 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
585 	va_end(ap);
586 
587 	isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
588 		      sockmgr, msgbuf);
589 }
590 
591 static void
592 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
593 	   isc_logmodule_t *module, int level, const char *fmt, ...)
594 	ISC_FORMAT_PRINTF(5, 6);
595 static void
thread_log(isc__socketthread_t * thread,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)596 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
597 	   isc_logmodule_t *module, int level, const char *fmt, ...) {
598 	char msgbuf[2048];
599 	va_list ap;
600 
601 	if (!isc_log_wouldlog(isc_lctx, level)) {
602 		return;
603 	}
604 
605 	va_start(ap, fmt);
606 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
607 	va_end(ap);
608 
609 	isc_log_write(isc_lctx, category, module, level,
610 		      "sockmgr %p thread %d: %s", thread->manager,
611 		      thread->threadid, msgbuf);
612 }
613 
614 static void
615 socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
616 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
617 	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
618 static void
socket_log(isc__socket_t * sock,const isc_sockaddr_t * address,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)619 socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
620 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
621 	   const char *fmt, ...) {
622 	char msgbuf[2048];
623 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
624 	va_list ap;
625 
626 	if (!isc_log_wouldlog(isc_lctx, level)) {
627 		return;
628 	}
629 
630 	va_start(ap, fmt);
631 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
632 	va_end(ap);
633 
634 	if (address == NULL) {
635 		isc_log_write(isc_lctx, category, module, level,
636 			      "socket %p: %s", sock, msgbuf);
637 	} else {
638 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
639 		isc_log_write(isc_lctx, category, module, level,
640 			      "socket %p %s: %s", sock, peerbuf, msgbuf);
641 	}
642 }
643 
644 /*%
645  * Increment socket-related statistics counters.
646  */
647 static inline void
inc_stats(isc_stats_t * stats,isc_statscounter_t counterid)648 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
649 	REQUIRE(counterid != -1);
650 
651 	if (stats != NULL) {
652 		isc_stats_increment(stats, counterid);
653 	}
654 }
655 
656 /*%
657  * Decrement socket-related statistics counters.
658  */
659 static inline void
dec_stats(isc_stats_t * stats,isc_statscounter_t counterid)660 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
661 	REQUIRE(counterid != -1);
662 
663 	if (stats != NULL) {
664 		isc_stats_decrement(stats, counterid);
665 	}
666 }
667 
668 static inline isc_result_t
watch_fd(isc__socketthread_t * thread,int fd,int msg)669 watch_fd(isc__socketthread_t *thread, int fd, int msg) {
670 	isc_result_t result = ISC_R_SUCCESS;
671 
672 #ifdef USE_KQUEUE
673 	struct kevent evchange;
674 
675 	memset(&evchange, 0, sizeof(evchange));
676 	if (msg == SELECT_POKE_READ) {
677 		evchange.filter = EVFILT_READ;
678 	} else {
679 		evchange.filter = EVFILT_WRITE;
680 	}
681 	evchange.flags = EV_ADD;
682 	evchange.ident = fd;
683 	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
684 		result = isc__errno2result(errno);
685 	}
686 
687 	return (result);
688 #elif defined(USE_EPOLL)
689 	struct epoll_event event;
690 	uint32_t oldevents;
691 	int ret;
692 	int op;
693 
694 	oldevents = thread->epoll_events[fd];
695 	if (msg == SELECT_POKE_READ) {
696 		thread->epoll_events[fd] |= EPOLLIN;
697 	} else {
698 		thread->epoll_events[fd] |= EPOLLOUT;
699 	}
700 
701 	event.events = thread->epoll_events[fd];
702 	memset(&event.data, 0, sizeof(event.data));
703 	event.data.fd = fd;
704 
705 	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
706 	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
707 	if (ret == -1) {
708 		if (errno == EEXIST) {
709 			UNEXPECTED_ERROR(__FILE__, __LINE__,
710 					 "epoll_ctl(ADD/MOD) returned "
711 					 "EEXIST for fd %d",
712 					 fd);
713 		}
714 		result = isc__errno2result(errno);
715 	}
716 
717 	return (result);
718 #elif defined(USE_DEVPOLL)
719 	struct pollfd pfd;
720 	int lockid = FDLOCK_ID(fd);
721 
722 	memset(&pfd, 0, sizeof(pfd));
723 	if (msg == SELECT_POKE_READ) {
724 		pfd.events = POLLIN;
725 	} else {
726 		pfd.events = POLLOUT;
727 	}
728 	pfd.fd = fd;
729 	pfd.revents = 0;
730 	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
731 		result = isc__errno2result(errno);
732 	} else {
733 		if (msg == SELECT_POKE_READ) {
734 			thread->fdpollinfo[fd].want_read = 1;
735 		} else {
736 			thread->fdpollinfo[fd].want_write = 1;
737 		}
738 	}
739 
740 	return (result);
741 #elif defined(USE_SELECT)
742 	LOCK(&thread->manager->lock);
743 	if (msg == SELECT_POKE_READ) {
744 		FD_SET(fd, thread->read_fds);
745 	}
746 	if (msg == SELECT_POKE_WRITE) {
747 		FD_SET(fd, thread->write_fds);
748 	}
749 	UNLOCK(&thread->manager->lock);
750 
751 	return (result);
752 #endif /* ifdef USE_KQUEUE */
753 }
754 
755 static inline isc_result_t
unwatch_fd(isc__socketthread_t * thread,int fd,int msg)756 unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
757 	isc_result_t result = ISC_R_SUCCESS;
758 
759 #ifdef USE_KQUEUE
760 	struct kevent evchange;
761 
762 	memset(&evchange, 0, sizeof(evchange));
763 	if (msg == SELECT_POKE_READ) {
764 		evchange.filter = EVFILT_READ;
765 	} else {
766 		evchange.filter = EVFILT_WRITE;
767 	}
768 	evchange.flags = EV_DELETE;
769 	evchange.ident = fd;
770 	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
771 		result = isc__errno2result(errno);
772 	}
773 
774 	return (result);
775 #elif defined(USE_EPOLL)
776 	struct epoll_event event;
777 	int ret;
778 	int op;
779 
780 	if (msg == SELECT_POKE_READ) {
781 		thread->epoll_events[fd] &= ~(EPOLLIN);
782 	} else {
783 		thread->epoll_events[fd] &= ~(EPOLLOUT);
784 	}
785 
786 	event.events = thread->epoll_events[fd];
787 	memset(&event.data, 0, sizeof(event.data));
788 	event.data.fd = fd;
789 
790 	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
791 	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
792 	if (ret == -1 && errno != ENOENT) {
793 		char strbuf[ISC_STRERRORSIZE];
794 		strerror_r(errno, strbuf, sizeof(strbuf));
795 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
796 				 fd, strbuf);
797 		result = ISC_R_UNEXPECTED;
798 	}
799 	return (result);
800 #elif defined(USE_DEVPOLL)
801 	struct pollfd pfds[2];
802 	size_t writelen = sizeof(pfds[0]);
803 	int lockid = FDLOCK_ID(fd);
804 
805 	memset(pfds, 0, sizeof(pfds));
806 	pfds[0].events = POLLREMOVE;
807 	pfds[0].fd = fd;
808 
809 	/*
810 	 * Canceling read or write polling via /dev/poll is tricky.  Since it
811 	 * only provides a way of canceling per FD, we may need to re-poll the
812 	 * socket for the other operation.
813 	 */
814 	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
815 		pfds[1].events = POLLOUT;
816 		pfds[1].fd = fd;
817 		writelen += sizeof(pfds[1]);
818 	}
819 	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
820 		pfds[1].events = POLLIN;
821 		pfds[1].fd = fd;
822 		writelen += sizeof(pfds[1]);
823 	}
824 
825 	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
826 		result = isc__errno2result(errno);
827 	} else {
828 		if (msg == SELECT_POKE_READ) {
829 			thread->fdpollinfo[fd].want_read = 0;
830 		} else {
831 			thread->fdpollinfo[fd].want_write = 0;
832 		}
833 	}
834 
835 	return (result);
836 #elif defined(USE_SELECT)
837 	LOCK(&thread->manager->lock);
838 	if (msg == SELECT_POKE_READ) {
839 		FD_CLR(fd, thread->read_fds);
840 	} else if (msg == SELECT_POKE_WRITE) {
841 		FD_CLR(fd, thread->write_fds);
842 	}
843 	UNLOCK(&thread->manager->lock);
844 
845 	return (result);
846 #endif /* ifdef USE_KQUEUE */
847 }
848 
849 /*
850  * A poke message was received, perform a proper watch/unwatch
851  * on a fd provided
852  */
853 static void
wakeup_socket(isc__socketthread_t * thread,int fd,int msg)854 wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
855 	isc_result_t result;
856 	int lockid = FDLOCK_ID(fd);
857 
858 	/*
859 	 * This is a wakeup on a socket.  If the socket is not in the
860 	 * process of being closed, start watching it for either reads
861 	 * or writes.
862 	 */
863 
864 	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);
865 
866 	if (msg == SELECT_POKE_CLOSE) {
867 		LOCK(&thread->fdlock[lockid]);
868 		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
869 		thread->fdstate[fd] = CLOSED;
870 		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
871 		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
872 		(void)close(fd);
873 		UNLOCK(&thread->fdlock[lockid]);
874 		return;
875 	}
876 
877 	LOCK(&thread->fdlock[lockid]);
878 	if (thread->fdstate[fd] == CLOSE_PENDING) {
879 		/*
880 		 * We accept (and ignore) any error from unwatch_fd() as we are
881 		 * closing the socket, hoping it doesn't leave dangling state in
882 		 * the kernel.
883 		 * Note that unwatch_fd() must be called after releasing the
884 		 * fdlock; otherwise it could cause deadlock due to a lock order
885 		 * reversal.
886 		 */
887 		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
888 		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
889 		UNLOCK(&thread->fdlock[lockid]);
890 		return;
891 	}
892 	if (thread->fdstate[fd] != MANAGED) {
893 		UNLOCK(&thread->fdlock[lockid]);
894 		return;
895 	}
896 	UNLOCK(&thread->fdlock[lockid]);
897 
898 	/*
899 	 * Set requested bit.
900 	 */
901 	result = watch_fd(thread, fd, msg);
902 	if (result != ISC_R_SUCCESS) {
903 		/*
904 		 * XXXJT: what should we do?  Ignoring the failure of watching
905 		 * a socket will make the application dysfunctional, but there
906 		 * seems to be no reasonable recovery process.
907 		 */
908 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
909 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
910 			      "failed to start watching FD (%d): %s", fd,
911 			      isc_result_totext(result));
912 	}
913 }
914 
915 /*
916  * Poke the select loop when there is something for us to do.
917  * The write is required (by POSIX) to complete.  That is, we
918  * will not get partial writes.
919  */
920 static void
select_poke(isc__socketmgr_t * mgr,int threadid,int fd,int msg)921 select_poke(isc__socketmgr_t *mgr, int threadid, int fd, int msg) {
922 	int cc;
923 	int buf[2];
924 	char strbuf[ISC_STRERRORSIZE];
925 
926 	buf[0] = fd;
927 	buf[1] = msg;
928 
929 	do {
930 		cc = write(mgr->threads[threadid].pipe_fds[1], buf,
931 			   sizeof(buf));
932 #ifdef ENOSR
933 		/*
934 		 * Treat ENOSR as EAGAIN but loop slowly as it is
935 		 * unlikely to clear fast.
936 		 */
937 		if (cc < 0 && errno == ENOSR) {
938 			sleep(1);
939 			errno = EAGAIN;
940 		}
941 #endif /* ifdef ENOSR */
942 	} while (cc < 0 && SOFT_ERROR(errno));
943 
944 	if (cc < 0) {
945 		strerror_r(errno, strbuf, sizeof(strbuf));
946 		FATAL_ERROR(__FILE__, __LINE__,
947 			    "write() failed during watcher poke: %s", strbuf);
948 	}
949 
950 	INSIST(cc == sizeof(buf));
951 }
952 
953 /*
954  * Read a message on the internal fd.
955  */
956 static void
select_readmsg(isc__socketthread_t * thread,int * fd,int * msg)957 select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
958 	int buf[2];
959 	int cc;
960 	char strbuf[ISC_STRERRORSIZE];
961 
962 	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
963 	if (cc < 0) {
964 		*msg = SELECT_POKE_NOTHING;
965 		*fd = -1; /* Silence compiler. */
966 		if (SOFT_ERROR(errno)) {
967 			return;
968 		}
969 
970 		strerror_r(errno, strbuf, sizeof(strbuf));
971 		FATAL_ERROR(__FILE__, __LINE__,
972 			    "read() failed during watcher poke: %s", strbuf);
973 	}
974 	INSIST(cc == sizeof(buf));
975 
976 	*fd = buf[0];
977 	*msg = buf[1];
978 }
979 
980 /*
981  * Make a fd non-blocking.
982  */
983 static isc_result_t
make_nonblock(int fd)984 make_nonblock(int fd) {
985 	int ret;
986 	char strbuf[ISC_STRERRORSIZE];
987 #ifdef USE_FIONBIO_IOCTL
988 	int on = 1;
989 #else  /* ifdef USE_FIONBIO_IOCTL */
990 	int flags;
991 #endif /* ifdef USE_FIONBIO_IOCTL */
992 
993 #ifdef USE_FIONBIO_IOCTL
994 	ret = ioctl(fd, FIONBIO, (char *)&on);
995 #else  /* ifdef USE_FIONBIO_IOCTL */
996 	flags = fcntl(fd, F_GETFL, 0);
997 	flags |= PORT_NONBLOCK;
998 	ret = fcntl(fd, F_SETFL, flags);
999 #endif /* ifdef USE_FIONBIO_IOCTL */
1000 
1001 	if (ret == -1) {
1002 		strerror_r(errno, strbuf, sizeof(strbuf));
1003 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1004 #ifdef USE_FIONBIO_IOCTL
1005 				 "ioctl(%d, FIONBIO, &on): %s", fd,
1006 #else  /* ifdef USE_FIONBIO_IOCTL */
1007 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1008 #endif /* ifdef USE_FIONBIO_IOCTL */
1009 				 strbuf);
1010 
1011 		return (ISC_R_UNEXPECTED);
1012 	}
1013 
1014 	return (ISC_R_SUCCESS);
1015 }
1016 
1017 #ifdef USE_CMSG
1018 /*
1019  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1020  * In order to ensure as much portability as possible, we provide wrapper
1021  * functions of these macros.
1022  * Note that cmsg_space() could run slow on OSes that do not have
1023  * CMSG_SPACE.
1024  */
static inline socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else  /* ifdef CMSG_LEN */
	/*
	 * No CMSG_LEN: compute the header overhead by hand as the offset
	 * that CMSG_DATA produces on a null header (cast NULL so any
	 * pointer arithmetic inside the macro is correct), then add the
	 * payload length.
	 */
	socklen_t overhead = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));

	return (overhead + len);
#endif /* ifdef CMSG_LEN */
}
1040 
static inline socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else  /* ifdef CMSG_SPACE */
	struct msghdr mhdr;
	struct cmsghdr *hdrp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char scratch[sizeof(struct cmsghdr) + 1024];

	/*
	 * No CMSG_SPACE: derive the padded size by letting CMSG_NXTHDR
	 * compute where a following header would start inside a dummy
	 * control buffer.
	 */
	memset(&mhdr, 0, sizeof(mhdr));
	mhdr.msg_control = scratch;
	mhdr.msg_controllen = sizeof(scratch);

	hdrp = (struct cmsghdr *)scratch;
	hdrp->cmsg_len = cmsg_len(len);

	hdrp = CMSG_NXTHDR(&mhdr, hdrp);
	if (hdrp == NULL) {
		return (0);
	}
	return ((char *)hdrp - (char *)mhdr.msg_control);
#endif /* ifdef CMSG_SPACE */
}
1069 #endif /* USE_CMSG */
1070 
1071 /*
1072  * Process control messages received on a socket.
1073  */
static void
process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
	struct in6_pktinfo *pktinfop;
#ifdef SO_TIMESTAMP
	void *timevalp;
#endif /* ifdef SO_TIMESTAMP */
#endif /* ifdef USE_CMSG */

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

	/* Record datagram / control-data truncation flags on the event. */
#ifdef MSG_TRUNC
	if ((msg->msg_flags & MSG_TRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
	}
#endif /* ifdef MSG_TRUNC */

#ifdef MSG_CTRUNC
	if ((msg->msg_flags & MSG_CTRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
	}
#endif /* ifdef MSG_CTRUNC */

#ifndef USE_CMSG
	return;
#else /* ifndef USE_CMSG */
	if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
		return;
	}

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif /* ifdef SO_TIMESTAMP */
	pktinfop = NULL;

	/*
	 * Walk every control message attached to the received packet and
	 * pull out the ones we care about: IPv6 packet info, the kernel
	 * receive timestamp, and the DSCP bits of TCLASS/TOS.
	 */
	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);

		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_PKTINFO) {
			/* Destination interface/address of the datagram. */
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memmove(&dev->pktinfo, pktinfop,
				sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			}
			goto next;
		}

#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET &&
		    cmsgp->cmsg_type == SCM_TIMESTAMP) {
			/* Kernel receive timestamp, copied into the event. */
			struct timeval tv;
			timevalp = CMSG_DATA(cmsgp);
			memmove(&tv, timevalp, sizeof(tv));
			dev->timestamp.seconds = tv.tv_sec;
			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif /* ifdef SO_TIMESTAMP */

#ifdef IPV6_TCLASS
		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_TCLASS) {
			/* Shift out the two low (ECN) bits, keep DSCP. */
			dev->dscp = *(int *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IPV6_TCLASS */

#ifdef IP_TOS
		if (cmsgp->cmsg_level == IPPROTO_IP &&
		    (cmsgp->cmsg_type == IP_TOS
#ifdef IP_RECVTOS
		     || cmsgp->cmsg_type == IP_RECVTOS
#endif /* ifdef IP_RECVTOS */
		     ))
		{
			/* IPv4 TOS octet; again drop the two low bits. */
			dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IP_TOS */
	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */
}
1179 
1180 /*
1181  * Construct an iov array and attach it to the msghdr passed in.  This is
1182  * the SEND constructor, which will use the used region of the buffer
1183  * (if using a buffer list) or will use the internal region (if a single
1184  * buffer I/O is requested).
1185  *
1186  * Nothing can be NULL, and the done event must list at least one buffer
1187  * on the buffer linked list for this function to be meaningful.
1188  *
1189  * If write_countp != NULL, *write_countp will hold the number of bytes
1190  * this transaction can send.
1191  */
static void
build_msghdr_send(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
	unsigned int iovcount;
	size_t write_count;
	struct cmsghdr *cmsgp;

	memset(msg, 0, sizeof(*msg));

	/*
	 * An unconnected socket sends to the event's address; a connected
	 * one must not pass a destination to sendmsg().
	 */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	/* One iovec covering the not-yet-sent tail of the region. */
	write_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = write_count;
	iovcount = 1;

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG)

	/*
	 * IPV6_PKTINFO control message: pin the source address/interface
	 * the datagram leaves on.
	 */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
	{
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp = (struct cmsghdr *)cmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}

#if defined(IPV6_USE_MIN_MTU)
	/* Per-packet IPV6_USE_MIN_MTU option (RFC 3542). */
	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
	{
		int use_min_mtu = 1; /* -1, 0, 1 */

		/* Append after any cmsg already placed in cmsgbuf. */
		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
	}
#endif /* if defined(IPV6_USE_MIN_MTU) */

	/* Debug aid: verify the DSCP in use matches the expected value. */
	if (isc_dscp_check_value > -1) {
		if (sock->type == isc_sockettype_udp) {
			INSIST((int)dev->dscp == isc_dscp_check_value);
		} else if (sock->type == isc_sockettype_tcp) {
			INSIST((int)sock->dscp == isc_dscp_check_value);
		}
	}

#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
	{
		/* Shift the 6 DSCP bits back into TOS/TCLASS position. */
		int dscp = (dev->dscp << 2) & 0xff;

		INSIST(dev->dscp < 0x40);

#ifdef IP_TOS
		if (sock->pf == AF_INET && sock->pktdscp) {
			/* Per-packet TOS via a control message. */
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IP;
			cmsgp->cmsg_type = IP_TOS;
			cmsgp->cmsg_len = cmsg_len(sizeof(char));
			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
			/* No pktdscp support: set it per-socket instead. */
			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
				       (void *)&dscp, sizeof(int)) < 0) {
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IP_TOS, %.02x)"
						 " failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* ifdef IP_TOS */
#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
		if (sock->pf == AF_INET6 && sock->pktdscp) {
			/* Per-packet traffic class via a control message. */
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IPV6;
			cmsgp->cmsg_type = IPV6_TCLASS;
			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
			/* No pktdscp support: set it per-socket instead. */
			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IPV6_TCLASS, "
						 "%.02x) failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
		/* Zero the unused tail of the control buffer. */
		if (msg->msg_controllen != 0 &&
		    msg->msg_controllen < SENDCMSGBUFLEN) {
			memset(cmsgbuf + msg->msg_controllen, 0,
			       SENDCMSGBUFLEN - msg->msg_controllen);
		}
	}
#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
	* defined(IPV6_TCLASS))                           \
	* */
#endif /* USE_CMSG */

	if (write_countp != NULL) {
		*write_countp = write_count;
	}
}
1343 
1344 /*
1345  * Construct an iov array and attach it to the msghdr passed in.  This is
1346  * the RECV constructor, which will use the available region of the buffer
1347  * (if using a buffer list) or will use the internal region (if a single
1348  * buffer I/O is requested).
1349  *
1350  * Nothing can be NULL, and the done event must list at least one buffer
1351  * on the buffer linked list for this function to be meaningful.
1352  *
1353  * If read_countp != NULL, *read_countp will hold the number of bytes
1354  * this transaction can receive.
1355  */
static void
build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
	unsigned int iovcount;
	size_t read_count;

	memset(msg, 0, sizeof(struct msghdr));

	/*
	 * UDP: let recvmsg() fill in the peer address.  TCP: the peer is
	 * already known from the connection; copy it to the event here.
	 */
	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->peer_address;
	}

	/* One iovec covering the not-yet-filled tail of the region. */
	read_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = read_count;
	iovcount = 1;

	/*
	 * If needed, set up to receive that one extra byte.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

	/* Receive ancillary data (pktinfo/timestamp/DSCP) when possible. */
#if defined(USE_CMSG)
	msg->msg_control = cmsgbuf;
	msg->msg_controllen = RECVCMSGBUFLEN;
#else  /* if defined(USE_CMSG) */
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
#endif /* USE_CMSG */
	msg->msg_flags = 0;

	if (read_countp != NULL) {
		*read_countp = read_count;
	}
}
1407 
1408 static void
set_dev_address(const isc_sockaddr_t * address,isc__socket_t * sock,isc_socketevent_t * dev)1409 set_dev_address(const isc_sockaddr_t *address, isc__socket_t *sock,
1410 		isc_socketevent_t *dev) {
1411 	if (sock->type == isc_sockettype_udp) {
1412 		if (address != NULL) {
1413 			dev->address = *address;
1414 		} else {
1415 			dev->address = sock->peer_address;
1416 		}
1417 	} else if (sock->type == isc_sockettype_tcp) {
1418 		INSIST(address == NULL);
1419 		dev->address = sock->peer_address;
1420 	}
1421 }
1422 
1423 static void
destroy_socketevent(isc_event_t * event)1424 destroy_socketevent(isc_event_t *event) {
1425 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1426 
1427 	(ev->destroy)(event);
1428 }
1429 
1430 static isc_socketevent_t *
allocate_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)1431 allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
1432 		     isc_taskaction_t action, void *arg) {
1433 	isc_socketevent_t *ev;
1434 
1435 	ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
1436 						     action, arg, sizeof(*ev));
1437 
1438 	ev->result = ISC_R_UNSET;
1439 	ISC_LINK_INIT(ev, ev_link);
1440 	ev->region.base = NULL;
1441 	ev->n = 0;
1442 	ev->offset = 0;
1443 	ev->attributes = 0;
1444 	ev->destroy = ev->ev_destroy;
1445 	ev->ev_destroy = destroy_socketevent;
1446 	ev->dscp = 0;
1447 
1448 	return (ev);
1449 }
1450 
1451 #if defined(ISC_SOCKET_DEBUG)
static void
dump_msg(struct msghdr *msg) {
	unsigned int idx;

	/* Debug dump of a msghdr: name, iovecs, and control buffer. */
	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (idx = 0; idx < (unsigned int)msg->msg_iovlen; idx++) {
		printf("\t\t%u\tbase %p, len %ld\n", idx,
		       msg->msg_iov[idx].iov_base,
		       (long)msg->msg_iov[idx].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
1466 #endif /* if defined(ISC_SOCKET_DEBUG) */
1467 
1468 #define DOIO_SUCCESS 0 /* i/o ok, event sent */
1469 #define DOIO_SOFT    1 /* i/o ok, soft error, no event sent */
1470 #define DOIO_HARD    2 /* i/o error, event sent */
1471 #define DOIO_EOF     3 /* EOF, no event sent */
1472 
static int
doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later library calls may clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * On a connected socket these errors end the request (hard); on an
 * unconnected one they concern only a single peer, so retry (soft).
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (recv_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (recv_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Anything else: map errno generically and fail hard. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	default:
		INSIST(0);
		ISC_UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Port 0 is never a legitimate source; drop silently. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp) {
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1624 
1625 /*
1626  * Returns:
1627  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1628  *			ISC_R_SUCCESS.
1629  *
1630  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1631  *			dev->result contains the appropriate error.
1632  *
1633  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1634  *			event was sent.  The operation should be retried.
1635  *
1636  *	No other return values are possible.
1637  */
static int
doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_SEND];
	size_t write_count;
	struct msghdr msghdr;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
	int attempts = 0;
	int send_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[SENDCMSGBUFLEN] = { 0 };

	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);

resend:
	/*
	 * Simulated-firewall mode: a UDP datagram over 'maxudp' is
	 * silently dropped but reported as fully sent.
	 */
	if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
	    write_count > sock->manager->maxudp)
	{
		cc = write_count;
	} else {
		cc = sendmsg(sock->fd, &msghdr, 0);
	}
	/* Save errno immediately; later calls may clobber it. */
	send_errno = errno;

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		/* Retry a bounded number of times when interrupted. */
		if (send_errno == EINTR && ++attempts < NRETRIES) {
			goto resend;
		}

		if (SOFT_ERROR(send_errno)) {
			if (errno == EWOULDBLOCK || errno == EAGAIN) {
				dev->result = ISC_R_WOULDBLOCK;
			}
			return (DOIO_SOFT);
		}

/*
 * On a connected socket these errors end the request (hard); on an
 * unconnected one they concern only a single peer, so retry (soft).
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (send_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_SENDFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (send_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_SENDFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		strerror_r(send_errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strbuf);
		dev->result = isc__errno2result(send_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		return (DOIO_HARD);
	}

	/* A zero-byte send is unexpected; log it but fall through. */
	if (cc == 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() returned 0");
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count) {
		return (DOIO_SOFT);
	}

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1753 
1754 /*
1755  * Kill.
1756  *
1757  * Caller must ensure that the socket is not locked and no external
1758  * references exist.
1759  */
static void
socketclose(isc__socketthread_t *thread, isc__socket_t *sock, int fd) {
	int lockid = FDLOCK_ID(fd);
	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&thread->fdlock[lockid]);
	thread->fds[fd] = NULL;
	thread->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&thread->fdlock[lockid]);
	/* The watcher thread performs the actual close(). */
	select_poke(thread->manager, thread->threadid, fd, SELECT_POKE_CLOSE);

	inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);

	/* Account the socket as no longer active (idempotent). */
	LOCK(&sock->lock);
	if (sock->active == 1) {
		dec_stats(thread->manager->stats,
			  sock->statsindex[STATID_ACTIVE]);
		sock->active = 0;
	}
	UNLOCK(&sock->lock);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&thread->manager->lock);
	if (thread->maxfd == fd) {
		int i;

		/*
		 * Scan downwards for the next highest managed fd, taking
		 * each fd's bucket lock in turn (never while holding
		 * another bucket's lock, preserving lock order).
		 */
		thread->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&thread->fdlock[lockid]);
			if (thread->fdstate[i] == MANAGED) {
				thread->maxfd = i;
				UNLOCK(&thread->fdlock[lockid]);
				break;
			}
			UNLOCK(&thread->fdlock[lockid]);
		}
		/* The poke pipe must always stay within the select range. */
		if (thread->maxfd < thread->pipe_fds[0]) {
			thread->maxfd = thread->pipe_fds[0];
		}
	}

	UNLOCK(&thread->manager->lock);
#endif /* USE_SELECT */
}
1812 
static void
destroy(isc__socket_t **sockp) {
	int fd = 0;
	isc__socket_t *sock = *sockp;
	isc__socketmgr_t *manager = sock->manager;
	isc__socketthread_t *thread = NULL;

	socket_log(sock, NULL, CREATION, "destroying");

	isc_refcount_destroy(&sock->references);

	LOCK(&sock->lock);
	/* A socket may only be destroyed once all I/O queues are empty. */
	INSIST(ISC_LIST_EMPTY(sock->connect_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);

	/* Detach the descriptor under the socket lock... */
	if (sock->fd >= 0) {
		fd = sock->fd;
		thread = &manager->threads[sock->threadid];
		sock->fd = -1;
		sock->threadid = -1;
	}
	UNLOCK(&sock->lock);

	/*
	 * ...but close it outside the lock.
	 * NOTE(review): because fd starts at 0, a socket whose descriptor
	 * happened to be 0 would never reach socketclose() here; fd 0 is
	 * normally occupied by stdin, but confirm that assumption holds.
	 */
	if (fd > 0) {
		socketclose(thread, sock, fd);
	}

	LOCK(&manager->lock);

	ISC_LIST_UNLINK(manager->socklist, sock, link);

	/* Wake a manager-shutdown waiter once the last socket is gone. */
	if (ISC_LIST_EMPTY(manager->socklist)) {
		SIGNAL(&manager->shutdown_ok);
	}

	/* can't unlock manager as its memory context is still used */
	free_socket(sockp);

	UNLOCK(&manager->lock);
}
1856 
1857 static isc_result_t
allocate_socket(isc__socketmgr_t * manager,isc_sockettype_t type,isc__socket_t ** socketp)1858 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1859 		isc__socket_t **socketp) {
1860 	isc__socket_t *sock;
1861 
1862 	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1863 
1864 	sock->common.magic = 0;
1865 	sock->common.impmagic = 0;
1866 	isc_refcount_init(&sock->references, 0);
1867 
1868 	sock->manager = manager;
1869 	sock->type = type;
1870 	sock->fd = -1;
1871 	sock->threadid = -1;
1872 	sock->dscp = 0; /* TOS/TCLASS is zero until set. */
1873 	sock->dupped = 0;
1874 	sock->statsindex = NULL;
1875 	sock->active = 0;
1876 
1877 	ISC_LINK_INIT(sock, link);
1878 
1879 	memset(sock->name, 0, sizeof(sock->name));
1880 	sock->tag = NULL;
1881 
1882 	/*
1883 	 * Set up list of readers and writers to be initially empty.
1884 	 */
1885 	ISC_LIST_INIT(sock->recv_list);
1886 	ISC_LIST_INIT(sock->send_list);
1887 	ISC_LIST_INIT(sock->accept_list);
1888 	ISC_LIST_INIT(sock->connect_list);
1889 
1890 	sock->listener = 0;
1891 	sock->connected = 0;
1892 	sock->connecting = 0;
1893 	sock->bound = 0;
1894 	sock->pktdscp = 0;
1895 
1896 	/*
1897 	 * Initialize the lock.
1898 	 */
1899 	isc_mutex_init(&sock->lock);
1900 
1901 	sock->common.magic = ISCAPI_SOCKET_MAGIC;
1902 	sock->common.impmagic = SOCKET_MAGIC;
1903 	*socketp = sock;
1904 
1905 	return (ISC_R_SUCCESS);
1906 }
1907 
1908 /*
1909  * This event requires that the various lists be empty, that the reference
1910  * count be 1, and that the magic number is valid.  The other socket bits,
1911  * like the lock, must be initialized as well.  The fd associated must be
1912  * marked as closed, by setting it to -1 on close, or this routine will
1913  * also close the socket.
1914  */
1915 static void
free_socket(isc__socket_t ** socketp)1916 free_socket(isc__socket_t **socketp) {
1917 	isc__socket_t *sock = *socketp;
1918 	*socketp = NULL;
1919 
1920 	INSIST(VALID_SOCKET(sock));
1921 	isc_refcount_destroy(&sock->references);
1922 	LOCK(&sock->lock);
1923 	INSIST(!sock->connecting);
1924 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1925 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1926 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
1927 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
1928 	INSIST(!ISC_LINK_LINKED(sock, link));
1929 	UNLOCK(&sock->lock);
1930 
1931 	sock->common.magic = 0;
1932 	sock->common.impmagic = 0;
1933 
1934 	isc_mutex_destroy(&sock->lock);
1935 
1936 	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1937 }
1938 
#if defined(SET_RCVBUF)
static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
/*
 * Desired UDP receive buffer size; set_rcvbuf() lowers this to the
 * largest value the kernel will actually grant.
 */
static int rcvbuf = ISC_RECV_BUFFER_SIZE;

/*
 * Probe (once, via rcvbuf_once) how large an SO_RCVBUF the kernel will
 * accept, using a throwaway UDP socket.  On exit the global 'rcvbuf'
 * holds the largest grantable value <= ISC_RECV_BUFFER_SIZE, found by
 * binary search between the system default ('min') and the desired
 * size.  If no probe socket can be created, 'rcvbuf' is left as-is.
 */
static void
set_rcvbuf(void) {
	int fd;
	int max = rcvbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 support; fall back to an IPv6 probe. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		return;
	}

	/* 'min' starts as the system default receive buffer size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
	    min < rcvbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
			       sizeof(rcvbuf)) == -1)
		{
			if (errno == ENOBUFS && rcvbuf > min) {
				/* Too big: search the lower half. */
				max = rcvbuf - 1;
				rcvbuf = (rcvbuf + min) / 2;
				goto again;
			} else {
				/* Unexpected failure: settle for default. */
				rcvbuf = min;
				goto cleanup;
			}
		} else {
			/* This size worked; raise the lower bound. */
			min = rcvbuf;
		}
		if (min != max) {
			/* Try the remaining upper half. */
			rcvbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_RCVBUF) */
1996 
#if defined(SET_SNDBUF)
static isc_once_t sndbuf_once = ISC_ONCE_INIT;
/*
 * Desired UDP send buffer size; set_sndbuf() lowers this to the
 * largest value the kernel will actually grant.
 */
static int sndbuf = ISC_SEND_BUFFER_SIZE;

/*
 * Probe (once, via sndbuf_once) how large an SO_SNDBUF the kernel will
 * accept; mirror image of set_rcvbuf() above.  On exit the global
 * 'sndbuf' holds the largest grantable value <= ISC_SEND_BUFFER_SIZE,
 * found by binary search between the system default ('min') and the
 * desired size.
 */
static void
set_sndbuf(void) {
	int fd;
	int max = sndbuf, min;
	socklen_t len;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
#if defined(ISC_PLATFORM_HAVEIPV6)
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 support; fall back to an IPv6 probe. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
#endif /* if defined(ISC_PLATFORM_HAVEIPV6) */
	if (fd == -1) {
		return;
	}

	/* 'min' starts as the system default send buffer size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
	    min < sndbuf)
	{
	again:
		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno == ENOBUFS && sndbuf > min) {
				/* Too big: search the lower half. */
				max = sndbuf - 1;
				sndbuf = (sndbuf + min) / 2;
				goto again;
			} else {
				/* Unexpected failure: settle for default. */
				sndbuf = min;
				goto cleanup;
			}
		} else {
			/* This size worked; raise the lower bound. */
			min = sndbuf;
		}
		if (min != max) {
			/* Try the remaining upper half. */
			sndbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_SNDBUF) */
2056 
2057 static void
use_min_mtu(isc__socket_t * sock)2058 use_min_mtu(isc__socket_t *sock) {
2059 #if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
2060 	UNUSED(sock);
2061 #endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
2062 #ifdef IPV6_USE_MIN_MTU
2063 	/* use minimum MTU */
2064 	if (sock->pf == AF_INET6) {
2065 		int on = 1;
2066 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2067 				 (void *)&on, sizeof(on));
2068 	}
2069 #endif /* ifdef IPV6_USE_MIN_MTU */
2070 #if defined(IPV6_MTU)
2071 	/*
2072 	 * Use minimum MTU on IPv6 sockets.
2073 	 */
2074 	if (sock->pf == AF_INET6) {
2075 		int mtu = 1280;
2076 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
2077 				 sizeof(mtu));
2078 	}
2079 #endif /* if defined(IPV6_MTU) */
2080 }
2081 
2082 static void
set_tcp_maxseg(isc__socket_t * sock,int size)2083 set_tcp_maxseg(isc__socket_t *sock, int size) {
2084 #ifdef TCP_MAXSEG
2085 	if (sock->type == isc_sockettype_tcp) {
2086 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
2087 				 (void *)&size, sizeof(size));
2088 	}
2089 #endif /* ifdef TCP_MAXSEG */
2090 }
2091 
/*
 * Create (or dup) the underlying file descriptor for 'sock' and apply
 * the standard per-socket options: non-blocking mode, SIGPIPE
 * suppression, minimum MTU for TCP/IPv6, timestamp/pktinfo control
 * messages, PMTU discovery disabling, and probed send/receive buffer
 * sizes for UDP.  When 'dup_socket' is non-NULL the descriptor is
 * dup()ed from it and all option setup is skipped (the options travel
 * with the underlying socket).  Statistics counters are updated on
 * both success and failure paths.
 */
static isc_result_t
opensocket(isc__socketmgr_t *manager, isc__socket_t *sock,
	   isc__socket_t *dup_socket) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
	int on = 1;
#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
	socklen_t optlen;
	int size = 0;
#endif

again:
	if (dup_socket == NULL) {
		switch (sock->type) {
		case isc_sockettype_udp:
			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		case isc_sockettype_unix:
			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
			break;
		case isc_sockettype_raw:
			/* Preset errno in case no branch below applies. */
			errno = EPFNOSUPPORT;
			/*
			 * PF_ROUTE is a alias for PF_NETLINK on linux.
			 */
#if defined(PF_ROUTE)
			if (sock->fd == -1 && sock->pf == PF_ROUTE) {
#ifdef NETLINK_ROUTE
				sock->fd = socket(sock->pf, SOCK_RAW,
						  NETLINK_ROUTE);
#else  /* ifdef NETLINK_ROUTE */
				sock->fd = socket(sock->pf, SOCK_RAW, 0);
#endif /* ifdef NETLINK_ROUTE */
				if (sock->fd != -1) {
#ifdef NETLINK_ROUTE
					struct sockaddr_nl sa;
					int n;

					/*
					 * Do an implicit bind.
					 */
					memset(&sa, 0, sizeof(sa));
					sa.nl_family = AF_NETLINK;
					sa.nl_groups = RTMGRP_IPV4_IFADDR |
						       RTMGRP_IPV6_IFADDR;
					n = bind(sock->fd,
						 (struct sockaddr *)&sa,
						 sizeof(sa));
					if (n < 0) {
						close(sock->fd);
						sock->fd = -1;
					}
#endif /* ifdef NETLINK_ROUTE */
					sock->bound = 1;
				}
			}
#endif /* if defined(PF_ROUTE) */
			break;
		}
	} else {
		/* Duplicate an existing socket instead of creating one. */
		sock->fd = dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
	/* Retry a bounded number of times if interrupted by a signal. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
		goto again;
	}

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved)
	{
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
		/* Preserve fcntl()'s errno across the close(). */
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl";
	}
#endif /* ifdef F_DUPFD */

	/* The manager's fd tables only cover [0, maxsocks). */
	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "socket: file descriptor exceeds limit (%d/%u)",
			      sock->fd, manager->maxsocks);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (ISC_R_NORESOURCES);
	}

	if (sock->fd < 0) {
		/* Map errno to an isc_result and count the failure. */
		switch (errno) {
		case EMFILE:
		case ENFILE:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strbuf);
		/* fallthrough */
		case ENOBUFS:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
					 err, strbuf);
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* A dup()ed descriptor inherits its options; skip the setup. */
	if (dup_socket != NULL) {
		goto setup_done;
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (result);
	}

#ifdef SO_NOSIGPIPE
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
		       sizeof(on)) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
				 sock->fd, strbuf);
		/* Press on... */
	}
#endif /* ifdef SO_NOSIGPIPE */

	/*
	 * Use minimum mtu if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
	if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
			       sizeof(on)) < 0 &&
		    errno != ENOPROTOOPT)
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) failed: "
					 "%s",
					 sock->fd, strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "failed: %s",
					 sock->fd, strbuf);
		}
#else  /* ifdef IPV6_RECVPKTINFO */
		/* RFC 2292 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv6/UDP sockets.
		 */
		if (sock->pf == AF_INET6) {
			int action = IPV6_PMTUDISC_DONT;
			(void)setsockopt(sock->fd, IPPROTO_IPV6,
					 IPV6_MTU_DISCOVER, &action,
					 sizeof(action));
		}
#endif /* if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT) */
#endif /* defined(USE_CMSG) */

#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 * Prefer IP_PMTUDISC_OMIT over IP_PMTUDISC_DONT
		 * if it available.
		 */
		if (sock->pf == AF_INET) {
			int action;
#if defined(IP_PMTUDISC_OMIT)
			action = IP_PMTUDISC_OMIT;
			if (setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
				       &action, sizeof(action)) < 0)
			{
#endif /* if defined(IP_PMTUDISC_OMIT) */
				action = IP_PMTUDISC_DONT;
				(void)setsockopt(sock->fd, IPPROTO_IP,
						 IP_MTU_DISCOVER, &action,
						 sizeof(action));
#if defined(IP_PMTUDISC_OMIT)
			}
#endif /* if defined(IP_PMTUDISC_OMIT) */
		}
#endif /* if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT) */
#if defined(IP_DONTFRAG)
		/*
		 * Turn off Path MTU discovery on IPv4/UDP sockets.
		 */
		if (sock->pf == AF_INET) {
			int off = 0;
			(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
					 &off, sizeof(off));
		}
#endif /* if defined(IP_DONTFRAG) */

#if defined(SET_RCVBUF)
		/* Enlarge the receive buffer to the probed maximum. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < rcvbuf)
		{
			RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_RCVBUF, "
						 "%d) failed: %s",
						 sock->fd, rcvbuf, strbuf);
			}
		}
#endif /* if defined(SET_RCVBUF) */

#if defined(SET_SNDBUF)
		/* Enlarge the send buffer to the probed maximum. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < sndbuf)
		{
			RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
				       (void *)&sndbuf, sizeof(sndbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_SNDBUF, "
						 "%d) failed: %s",
						 sock->fd, sndbuf, strbuf);
			}
		}
#endif /* if defined(SET_SNDBUF) */
	}
#ifdef IPV6_RECVTCLASS
	/* Ask for the IPv6 traffic class of incoming packets. */
	if ((sock->pf == AF_INET6) &&
	    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVTCLASS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IPV6_RECVTCLASS */
#ifdef IP_RECVTOS
	/* Ask for the IPv4 TOS byte of incoming packets. */
	if ((sock->pf == AF_INET) &&
	    (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IP_RECVTOS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IP_RECVTOS */
#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */

setup_done:
	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
	if (sock->active == 0) {
		inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}
2432 
2433 /*
2434  * Create a 'type' socket or duplicate an existing socket, managed
2435  * by 'manager'.  Events will be posted to 'task' and when dispatched
2436  * 'action' will be called with 'arg' as the arg value.  The new
2437  * socket is returned in 'socketp'.
2438  */
2439 static isc_result_t
socket_create(isc_socketmgr_t * manager0,int pf,isc_sockettype_t type,isc_socket_t ** socketp,isc_socket_t * dup_socket)2440 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2441 	      isc_socket_t **socketp, isc_socket_t *dup_socket) {
2442 	isc__socket_t *sock = NULL;
2443 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2444 	isc__socketthread_t *thread;
2445 	isc_result_t result;
2446 	int lockid;
2447 
2448 	REQUIRE(VALID_MANAGER(manager));
2449 	REQUIRE(socketp != NULL && *socketp == NULL);
2450 
2451 	result = allocate_socket(manager, type, &sock);
2452 	if (result != ISC_R_SUCCESS) {
2453 		return (result);
2454 	}
2455 
2456 	switch (sock->type) {
2457 	case isc_sockettype_udp:
2458 		sock->statsindex = (pf == AF_INET) ? udp4statsindex
2459 						   : udp6statsindex;
2460 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
2461 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
2462 		break;
2463 	case isc_sockettype_tcp:
2464 		sock->statsindex = (pf == AF_INET) ? tcp4statsindex
2465 						   : tcp6statsindex;
2466 		break;
2467 	case isc_sockettype_unix:
2468 		sock->statsindex = unixstatsindex;
2469 		break;
2470 	case isc_sockettype_raw:
2471 		sock->statsindex = rawstatsindex;
2472 		break;
2473 	default:
2474 		INSIST(0);
2475 		ISC_UNREACHABLE();
2476 	}
2477 
2478 	sock->pf = pf;
2479 
2480 	result = opensocket(manager, sock, (isc__socket_t *)dup_socket);
2481 	if (result != ISC_R_SUCCESS) {
2482 		free_socket(&sock);
2483 		return (result);
2484 	}
2485 
2486 	if (sock->fd == -1) {
2487 		abort();
2488 	}
2489 	sock->threadid = gen_threadid(sock);
2490 	isc_refcount_increment0(&sock->references);
2491 	thread = &manager->threads[sock->threadid];
2492 	*socketp = (isc_socket_t *)sock;
2493 
2494 	/*
2495 	 * Note we don't have to lock the socket like we normally would because
2496 	 * there are no external references to it yet.
2497 	 */
2498 
2499 	lockid = FDLOCK_ID(sock->fd);
2500 	LOCK(&thread->fdlock[lockid]);
2501 	thread->fds[sock->fd] = sock;
2502 	thread->fdstate[sock->fd] = MANAGED;
2503 #if defined(USE_EPOLL)
2504 	thread->epoll_events[sock->fd] = 0;
2505 #endif /* if defined(USE_EPOLL) */
2506 #ifdef USE_DEVPOLL
2507 	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
2508 	       thread->fdpollinfo[sock->fd].want_write == 0);
2509 #endif /* ifdef USE_DEVPOLL */
2510 	UNLOCK(&thread->fdlock[lockid]);
2511 
2512 	LOCK(&manager->lock);
2513 	ISC_LIST_APPEND(manager->socklist, sock, link);
2514 #ifdef USE_SELECT
2515 	if (thread->maxfd < sock->fd) {
2516 		thread->maxfd = sock->fd;
2517 	}
2518 #endif /* ifdef USE_SELECT */
2519 	UNLOCK(&manager->lock);
2520 
2521 	socket_log(sock, NULL, CREATION,
2522 		   dup_socket != NULL ? "dupped" : "created");
2523 
2524 	return (ISC_R_SUCCESS);
2525 }
2526 
2527 /*%
2528  * Create a new 'type' socket managed by 'manager'.  Events
2529  * will be posted to 'task' and when dispatched 'action' will be
2530  * called with 'arg' as the arg value.  The new socket is returned
2531  * in 'socketp'.
2532  */
2533 isc_result_t
isc_socket_create(isc_socketmgr_t * manager0,int pf,isc_sockettype_t type,isc_socket_t ** socketp)2534 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2535 		  isc_socket_t **socketp) {
2536 	return (socket_create(manager0, pf, type, socketp, NULL));
2537 }
2538 
2539 /*%
2540  * Duplicate an existing socket.  The new socket is returned
2541  * in 'socketp'.
2542  */
2543 isc_result_t
isc_socket_dup(isc_socket_t * sock0,isc_socket_t ** socketp)2544 isc_socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) {
2545 	isc__socket_t *sock = (isc__socket_t *)sock0;
2546 
2547 	REQUIRE(VALID_SOCKET(sock));
2548 	REQUIRE(socketp != NULL && *socketp == NULL);
2549 
2550 	return (socket_create((isc_socketmgr_t *)sock->manager, sock->pf,
2551 			      sock->type, socketp, sock0));
2552 }
2553 
/*
 * Reopen the descriptor of a previously closed socket (see
 * isc_socket_close()).  The socket must still have at least one
 * reference and must currently have no fd or thread assignment.
 * On success the new fd is registered with a worker thread.
 */
isc_result_t
isc_socket_open(isc_socket_t *sock0) {
	isc_result_t result;
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc__socketthread_t *thread;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(isc_refcount_current(&sock->references) >= 1);
	REQUIRE(sock->fd == -1);
	REQUIRE(sock->threadid == -1);

	result = opensocket(sock->manager, sock, NULL);

	UNLOCK(&sock->lock);

	/*
	 * NOTE(review): sock->fd/sock->threadid are written below after
	 * the socket lock has been released; presumably safe because the
	 * caller holds the only outstanding usage at this point — confirm.
	 */
	if (result != ISC_R_SUCCESS) {
		sock->fd = -1;
	} else {
		sock->threadid = gen_threadid(sock);
		thread = &sock->manager->threads[sock->threadid];
		int lockid = FDLOCK_ID(sock->fd);

		/* Register the new fd with its worker thread's tables. */
		LOCK(&thread->fdlock[lockid]);
		thread->fds[sock->fd] = sock;
		thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
		thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
		INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
		       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
		UNLOCK(&thread->fdlock[lockid]);

#ifdef USE_SELECT
		LOCK(&sock->manager->lock);
		if (thread->maxfd < sock->fd) {
			thread->maxfd = sock->fd;
		}
		UNLOCK(&sock->manager->lock);
#endif /* ifdef USE_SELECT */
	}

	return (result);
}
2602 
2603 /*
2604  * Attach to a socket.  Caller must explicitly detach when it is done.
2605  */
2606 void
isc_socket_attach(isc_socket_t * sock0,isc_socket_t ** socketp)2607 isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
2608 	isc__socket_t *sock = (isc__socket_t *)sock0;
2609 
2610 	REQUIRE(VALID_SOCKET(sock));
2611 	REQUIRE(socketp != NULL && *socketp == NULL);
2612 
2613 	int old_refs = isc_refcount_increment(&sock->references);
2614 	REQUIRE(old_refs > 0);
2615 
2616 	*socketp = (isc_socket_t *)sock;
2617 }
2618 
2619 /*
2620  * Dereference a socket.  If this is the last reference to it, clean things
2621  * up by destroying the socket.
2622  */
2623 void
isc_socket_detach(isc_socket_t ** socketp)2624 isc_socket_detach(isc_socket_t **socketp) {
2625 	isc__socket_t *sock;
2626 
2627 	REQUIRE(socketp != NULL);
2628 	sock = (isc__socket_t *)*socketp;
2629 	REQUIRE(VALID_SOCKET(sock));
2630 	if (isc_refcount_decrement(&sock->references) == 1) {
2631 		destroy(&sock);
2632 	}
2633 
2634 	*socketp = NULL;
2635 }
2636 
2637 isc_result_t
isc_socket_close(isc_socket_t * sock0)2638 isc_socket_close(isc_socket_t *sock0) {
2639 	isc__socket_t *sock = (isc__socket_t *)sock0;
2640 	int fd;
2641 	isc__socketmgr_t *manager;
2642 	isc__socketthread_t *thread;
2643 	fflush(stdout);
2644 	REQUIRE(VALID_SOCKET(sock));
2645 
2646 	LOCK(&sock->lock);
2647 
2648 	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2649 
2650 	INSIST(!sock->connecting);
2651 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2652 	INSIST(ISC_LIST_EMPTY(sock->send_list));
2653 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2654 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
2655 
2656 	manager = sock->manager;
2657 	thread = &manager->threads[sock->threadid];
2658 	fd = sock->fd;
2659 	sock->fd = -1;
2660 	sock->threadid = -1;
2661 
2662 	sock->dupped = 0;
2663 	memset(sock->name, 0, sizeof(sock->name));
2664 	sock->tag = NULL;
2665 	sock->listener = 0;
2666 	sock->connected = 0;
2667 	sock->connecting = 0;
2668 	sock->bound = 0;
2669 	isc_sockaddr_any(&sock->peer_address);
2670 
2671 	UNLOCK(&sock->lock);
2672 
2673 	socketclose(thread, sock, fd);
2674 
2675 	return (ISC_R_SUCCESS);
2676 }
2677 
2678 /*
2679  * Dequeue an item off the given socket's read queue, set the result code
2680  * in the done event to the one provided, and send it to the task it was
2681  * destined for.
2682  *
2683  * If the event to be sent is on a list, remove it before sending.  If
2684  * asked to, send and detach from the socket as well.
2685  *
2686  * Caller must have the socket locked if the event is attached to the socket.
2687  */
2688 static void
send_recvdone_event(isc__socket_t * sock,isc_socketevent_t ** dev)2689 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
2690 	isc_task_t *task;
2691 
2692 	task = (*dev)->ev_sender;
2693 
2694 	(*dev)->ev_sender = sock;
2695 
2696 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2697 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2698 	}
2699 
2700 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2701 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2702 					 sock->threadid);
2703 	} else {
2704 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2705 	}
2706 }
2707 
2708 /*
2709  * See comments for send_recvdone_event() above.
2710  *
2711  * Caller must have the socket locked if the event is attached to the socket.
2712  */
2713 static void
send_senddone_event(isc__socket_t * sock,isc_socketevent_t ** dev)2714 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
2715 	isc_task_t *task;
2716 
2717 	INSIST(dev != NULL && *dev != NULL);
2718 
2719 	task = (*dev)->ev_sender;
2720 	(*dev)->ev_sender = sock;
2721 
2722 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2723 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2724 	}
2725 
2726 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2727 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2728 					 sock->threadid);
2729 	} else {
2730 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2731 	}
2732 }
2733 
2734 /*
2735  * See comments for send_recvdone_event() above.
2736  *
2737  * Caller must have the socket locked if the event is attached to the socket.
2738  */
2739 static void
send_connectdone_event(isc__socket_t * sock,isc_socket_connev_t ** dev)2740 send_connectdone_event(isc__socket_t *sock, isc_socket_connev_t **dev) {
2741 	isc_task_t *task;
2742 
2743 	INSIST(dev != NULL && *dev != NULL);
2744 
2745 	task = (*dev)->ev_sender;
2746 	(*dev)->ev_sender = sock;
2747 
2748 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2749 		ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
2750 	}
2751 
2752 	isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
2753 }
2754 
2755 /*
2756  * Call accept() on a socket, to get the new file descriptor.  The listen
2757  * socket is used as a prototype to create a new isc_socket_t.  The new
2758  * socket has one outstanding reference.  The task receiving the event
2759  * will be detached from just after the event is delivered.
2760  *
2761  * On entry to this function, the event delivered is the internal
2762  * readable event, and the first item on the accept_list should be
2763  * the done event we want to send.  If the list is empty, this is a no-op,
2764  * so just unlock and return.
2765  */
2766 static void
internal_accept(isc__socket_t * sock)2767 internal_accept(isc__socket_t *sock) {
2768 	isc__socketmgr_t *manager;
2769 	isc__socketthread_t *thread, *nthread;
2770 	isc_socket_newconnev_t *dev;
2771 	isc_task_t *task;
2772 	socklen_t addrlen;
2773 	int fd;
2774 	isc_result_t result = ISC_R_SUCCESS;
2775 	char strbuf[ISC_STRERRORSIZE];
2776 	const char *err = "accept";
2777 
2778 	INSIST(VALID_SOCKET(sock));
2779 	REQUIRE(sock->fd >= 0);
2780 
2781 	socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");
2782 
2783 	manager = sock->manager;
2784 	INSIST(VALID_MANAGER(manager));
2785 	thread = &manager->threads[sock->threadid];
2786 
2787 	INSIST(sock->listener);
2788 
2789 	/*
2790 	 * Get the first item off the accept list.
2791 	 * If it is empty, unlock the socket and return.
2792 	 */
2793 	dev = ISC_LIST_HEAD(sock->accept_list);
2794 	if (dev == NULL) {
2795 		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
2796 		UNLOCK(&sock->lock);
2797 		return;
2798 	}
2799 
2800 	/*
2801 	 * Try to accept the new connection.  If the accept fails with
2802 	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
2803 	 * again.  Also ignore ECONNRESET, which has been reported to
2804 	 * be spuriously returned on Linux 2.2.19 although it is not
2805 	 * a documented error for accept().  ECONNABORTED has been
2806 	 * reported for Solaris 8.  The rest are thrown in not because
2807 	 * we have seen them but because they are ignored by other
2808 	 * daemons such as BIND 8 and Apache.
2809 	 */
2810 
2811 	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
2812 	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
2813 	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
2814 		    (void *)&addrlen);
2815 
2816 #ifdef F_DUPFD
2817 	/*
2818 	 * Leave a space for stdio to work in.
2819 	 */
2820 	if (fd >= 0 && fd < 20) {
2821 		int newfd, tmp;
2822 		newfd = fcntl(fd, F_DUPFD, 20);
2823 		tmp = errno;
2824 		(void)close(fd);
2825 		errno = tmp;
2826 		fd = newfd;
2827 		err = "accept/fcntl";
2828 	}
2829 #endif /* ifdef F_DUPFD */
2830 
2831 	if (fd < 0) {
2832 		if (SOFT_ERROR(errno)) {
2833 			goto soft_error;
2834 		}
2835 		switch (errno) {
2836 		case ENFILE:
2837 		case EMFILE:
2838 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2839 				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2840 				      "%s: too many open file descriptors",
2841 				      err);
2842 			goto soft_error;
2843 
2844 		case ENOBUFS:
2845 		case ENOMEM:
2846 		case ECONNRESET:
2847 		case ECONNABORTED:
2848 		case EHOSTUNREACH:
2849 		case EHOSTDOWN:
2850 		case ENETUNREACH:
2851 		case ENETDOWN:
2852 		case ECONNREFUSED:
2853 #ifdef EPROTO
2854 		case EPROTO:
2855 #endif /* ifdef EPROTO */
2856 #ifdef ENONET
2857 		case ENONET:
2858 #endif /* ifdef ENONET */
2859 			goto soft_error;
2860 		default:
2861 			break;
2862 		}
2863 		strerror_r(errno, strbuf, sizeof(strbuf));
2864 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2865 				 "internal_accept: %s() failed: %s", err,
2866 				 strbuf);
2867 		fd = -1;
2868 		result = ISC_R_UNEXPECTED;
2869 	} else {
2870 		if (addrlen == 0U) {
2871 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2872 					 "internal_accept(): "
2873 					 "accept() failed to return "
2874 					 "remote address");
2875 
2876 			(void)close(fd);
2877 			goto soft_error;
2878 		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
2879 			   sock->pf) {
2880 			UNEXPECTED_ERROR(
2881 				__FILE__, __LINE__,
2882 				"internal_accept(): "
2883 				"accept() returned peer address "
2884 				"family %u (expected %u)",
2885 				NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
2886 				sock->pf);
2887 			(void)close(fd);
2888 			goto soft_error;
2889 		} else if (fd >= (int)manager->maxsocks) {
2890 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2891 				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2892 				      "accept: file descriptor exceeds limit "
2893 				      "(%d/%u)",
2894 				      fd, manager->maxsocks);
2895 			(void)close(fd);
2896 			goto soft_error;
2897 		}
2898 	}
2899 
2900 	if (fd != -1) {
2901 		NEWCONNSOCK(dev)->peer_address.length = addrlen;
2902 		NEWCONNSOCK(dev)->pf = sock->pf;
2903 	}
2904 
2905 	/*
2906 	 * Pull off the done event.
2907 	 */
2908 	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2909 
2910 	/*
2911 	 * Poke watcher if there are more pending accepts.
2912 	 */
2913 	if (ISC_LIST_EMPTY(sock->accept_list)) {
2914 		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
2915 	}
2916 
2917 	if (fd != -1) {
2918 		result = make_nonblock(fd);
2919 		if (result != ISC_R_SUCCESS) {
2920 			(void)close(fd);
2921 			fd = -1;
2922 		}
2923 	}
2924 
2925 	/*
2926 	 * We need to unlock sock->lock now to be able to lock manager->lock
2927 	 * without risking a deadlock with xmlstats.
2928 	 */
2929 	UNLOCK(&sock->lock);
2930 
2931 	/*
2932 	 * -1 means the new socket didn't happen.
2933 	 */
2934 	if (fd != -1) {
2935 		int lockid = FDLOCK_ID(fd);
2936 
2937 		NEWCONNSOCK(dev)->fd = fd;
2938 		NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
2939 		NEWCONNSOCK(dev)->bound = 1;
2940 		NEWCONNSOCK(dev)->connected = 1;
2941 		nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];
2942 
2943 		/*
2944 		 * We already hold a lock on one fdlock in accepting thread,
2945 		 * we need to make sure that we don't double lock.
2946 		 */
2947 		bool same_bucket = (sock->threadid ==
2948 				    NEWCONNSOCK(dev)->threadid) &&
2949 				   (FDLOCK_ID(sock->fd) == lockid);
2950 
2951 		/*
2952 		 * Use minimum mtu if possible.
2953 		 */
2954 		use_min_mtu(NEWCONNSOCK(dev));
2955 		set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);
2956 
2957 		/*
2958 		 * Ensure DSCP settings are inherited across accept.
2959 		 */
2960 		setdscp(NEWCONNSOCK(dev), sock->dscp);
2961 
2962 		/*
2963 		 * Save away the remote address
2964 		 */
2965 		dev->address = NEWCONNSOCK(dev)->peer_address;
2966 
2967 		if (NEWCONNSOCK(dev)->active == 0) {
2968 			inc_stats(manager->stats,
2969 				  NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
2970 			NEWCONNSOCK(dev)->active = 1;
2971 		}
2972 
2973 		if (!same_bucket) {
2974 			LOCK(&nthread->fdlock[lockid]);
2975 		}
2976 		nthread->fds[fd] = NEWCONNSOCK(dev);
2977 		nthread->fdstate[fd] = MANAGED;
2978 #if defined(USE_EPOLL)
2979 		nthread->epoll_events[fd] = 0;
2980 #endif /* if defined(USE_EPOLL) */
2981 		if (!same_bucket) {
2982 			UNLOCK(&nthread->fdlock[lockid]);
2983 		}
2984 
2985 		LOCK(&manager->lock);
2986 
2987 #ifdef USE_SELECT
2988 		if (nthread->maxfd < fd) {
2989 			nthread->maxfd = fd;
2990 		}
2991 #endif /* ifdef USE_SELECT */
2992 
2993 		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
2994 			   "accepted connection, new socket %p",
2995 			   dev->newsocket);
2996 
2997 		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
2998 
2999 		UNLOCK(&manager->lock);
3000 
3001 		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3002 	} else {
3003 		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3004 		(void)isc_refcount_decrement(&NEWCONNSOCK(dev)->references);
3005 		free_socket((isc__socket_t **)&dev->newsocket);
3006 	}
3007 
3008 	/*
3009 	 * Fill in the done event details and send it off.
3010 	 */
3011 	dev->result = result;
3012 	task = dev->ev_sender;
3013 	dev->ev_sender = sock;
3014 
3015 	isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
3016 	return;
3017 
3018 soft_error:
3019 	watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
3020 	UNLOCK(&sock->lock);
3021 
3022 	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3023 	return;
3024 }
3025 
3026 static void
internal_recv(isc__socket_t * sock)3027 internal_recv(isc__socket_t *sock) {
3028 	isc_socketevent_t *dev;
3029 
3030 	INSIST(VALID_SOCKET(sock));
3031 	REQUIRE(sock->fd >= 0);
3032 
3033 	dev = ISC_LIST_HEAD(sock->recv_list);
3034 	if (dev == NULL) {
3035 		goto finish;
3036 	}
3037 
3038 	socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
3039 		   dev, dev->ev_sender);
3040 
3041 	/*
3042 	 * Try to do as much I/O as possible on this socket.  There are no
3043 	 * limits here, currently.
3044 	 */
3045 	while (dev != NULL) {
3046 		switch (doio_recv(sock, dev)) {
3047 		case DOIO_SOFT:
3048 			goto finish;
3049 
3050 		case DOIO_EOF:
3051 			/*
3052 			 * read of 0 means the remote end was closed.
3053 			 * Run through the event queue and dispatch all
3054 			 * the events with an EOF result code.
3055 			 */
3056 			do {
3057 				dev->result = ISC_R_EOF;
3058 				send_recvdone_event(sock, &dev);
3059 				dev = ISC_LIST_HEAD(sock->recv_list);
3060 			} while (dev != NULL);
3061 			goto finish;
3062 
3063 		case DOIO_SUCCESS:
3064 		case DOIO_HARD:
3065 			send_recvdone_event(sock, &dev);
3066 			break;
3067 		}
3068 
3069 		dev = ISC_LIST_HEAD(sock->recv_list);
3070 	}
3071 
3072 finish:
3073 	if (ISC_LIST_EMPTY(sock->recv_list)) {
3074 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3075 			   SELECT_POKE_READ);
3076 	}
3077 	UNLOCK(&sock->lock);
3078 }
3079 
3080 static void
internal_send(isc__socket_t * sock)3081 internal_send(isc__socket_t *sock) {
3082 	isc_socketevent_t *dev;
3083 
3084 	INSIST(VALID_SOCKET(sock));
3085 	REQUIRE(sock->fd >= 0);
3086 
3087 	dev = ISC_LIST_HEAD(sock->send_list);
3088 	if (dev == NULL) {
3089 		goto finish;
3090 	}
3091 	socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev,
3092 		   dev->ev_sender);
3093 
3094 	/*
3095 	 * Try to do as much I/O as possible on this socket.  There are no
3096 	 * limits here, currently.
3097 	 */
3098 	while (dev != NULL) {
3099 		switch (doio_send(sock, dev)) {
3100 		case DOIO_SOFT:
3101 			goto finish;
3102 
3103 		case DOIO_HARD:
3104 		case DOIO_SUCCESS:
3105 			send_senddone_event(sock, &dev);
3106 			break;
3107 		}
3108 
3109 		dev = ISC_LIST_HEAD(sock->send_list);
3110 	}
3111 
3112 finish:
3113 	if (ISC_LIST_EMPTY(sock->send_list)) {
3114 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3115 			   SELECT_POKE_WRITE);
3116 	}
3117 	UNLOCK(&sock->lock);
3118 }
3119 
3120 /*
3121  * Process read/writes on each fd here.  Avoid locking
3122  * and unlocking twice if both reads and writes are possible.
3123  */
static void
process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
	isc__socket_t *sock;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&thread->fdlock[lockid]);

		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		return;
	}

	/* No socket registered for this fd; nothing to do. */
	sock = thread->fds[fd];
	if (sock == NULL) {
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&sock->lock);

	if (sock->fd < 0) {
		/*
		 * Sock is being closed - the final external reference
		 * is gone but it was not yet removed from event loop
		 * and fdstate[]/fds[] as destroy() is waiting on
		 * thread->fdlock[lockid] or sock->lock that we're holding.
		 * Just release the locks and bail.
		 */
		UNLOCK(&sock->lock);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	REQUIRE(readable || writeable);
	/* Listening sockets accept; all others receive. */
	if (readable) {
		if (sock->listener) {
			internal_accept(sock);
		} else {
			internal_recv(sock);
		}
	}

	/* Connecting sockets complete the connect; all others send. */
	if (writeable) {
		if (sock->connecting) {
			internal_connect(sock);
		} else {
			internal_send(sock);
		}
	}

	/* sock->lock is unlocked in internal_* function */
	UNLOCK(&thread->fdlock[lockid]);

	/*
	 * Socket destruction might be pending, it will resume
	 * after releasing fdlock and sock->lock.
	 */
}
3187 
3188 /*
3189  * process_fds is different for different event loops
3190  * it takes the events from event loops and for each FD
3191  * launches process_fd
3192  */
3193 #ifdef USE_KQUEUE
3194 static bool
process_fds(isc__socketthread_t * thread,struct kevent * events,int nevents)3195 process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
3196 	int i;
3197 	bool readable, writable;
3198 	bool done = false;
3199 	bool have_ctlevent = false;
3200 	if (nevents == thread->nevents) {
3201 		/*
3202 		 * This is not an error, but something unexpected.  If this
3203 		 * happens, it may indicate the need for increasing
3204 		 * ISC_SOCKET_MAXEVENTS.
3205 		 */
3206 		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3207 			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3208 			   "maximum number of FD events (%d) received",
3209 			   nevents);
3210 	}
3211 
3212 	for (i = 0; i < nevents; i++) {
3213 		REQUIRE(events[i].ident < thread->manager->maxsocks);
3214 		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
3215 			have_ctlevent = true;
3216 			continue;
3217 		}
3218 		readable = (events[i].filter == EVFILT_READ);
3219 		writable = (events[i].filter == EVFILT_WRITE);
3220 		process_fd(thread, events[i].ident, readable, writable);
3221 	}
3222 
3223 	if (have_ctlevent) {
3224 		done = process_ctlfd(thread);
3225 	}
3226 
3227 	return (done);
3228 }
3229 #elif defined(USE_EPOLL)
3230 static bool
process_fds(isc__socketthread_t * thread,struct epoll_event * events,int nevents)3231 process_fds(isc__socketthread_t *thread, struct epoll_event *events,
3232 	    int nevents) {
3233 	int i;
3234 	bool done = false;
3235 	bool have_ctlevent = false;
3236 
3237 	if (nevents == thread->nevents) {
3238 		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3239 			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3240 			   "maximum number of FD events (%d) received",
3241 			   nevents);
3242 	}
3243 
3244 	for (i = 0; i < nevents; i++) {
3245 		REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
3246 		if (events[i].data.fd == thread->pipe_fds[0]) {
3247 			have_ctlevent = true;
3248 			continue;
3249 		}
3250 		if ((events[i].events & EPOLLERR) != 0 ||
3251 		    (events[i].events & EPOLLHUP) != 0) {
3252 			/*
3253 			 * epoll does not set IN/OUT bits on an erroneous
3254 			 * condition, so we need to try both anyway.  This is a
3255 			 * bit inefficient, but should be okay for such rare
3256 			 * events.  Note also that the read or write attempt
3257 			 * won't block because we use non-blocking sockets.
3258 			 */
3259 			int fd = events[i].data.fd;
3260 			events[i].events |= thread->epoll_events[fd];
3261 		}
3262 		process_fd(thread, events[i].data.fd,
3263 			   (events[i].events & EPOLLIN) != 0,
3264 			   (events[i].events & EPOLLOUT) != 0);
3265 	}
3266 
3267 	if (have_ctlevent) {
3268 		done = process_ctlfd(thread);
3269 	}
3270 
3271 	return (done);
3272 }
3273 #elif defined(USE_DEVPOLL)
3274 static bool
process_fds(isc__socketthread_t * thread,struct pollfd * events,int nevents)3275 process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
3276 	int i;
3277 	bool done = false;
3278 	bool have_ctlevent = false;
3279 
3280 	if (nevents == thread->nevents) {
3281 		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3282 			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3283 			   "maximum number of FD events (%d) received",
3284 			   nevents);
3285 	}
3286 
3287 	for (i = 0; i < nevents; i++) {
3288 		REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
3289 		if (events[i].fd == thread->pipe_fds[0]) {
3290 			have_ctlevent = true;
3291 			continue;
3292 		}
3293 		process_fd(thread, events[i].fd,
3294 			   (events[i].events & POLLIN) != 0,
3295 			   (events[i].events & POLLOUT) != 0);
3296 	}
3297 
3298 	if (have_ctlevent) {
3299 		done = process_ctlfd(thread);
3300 	}
3301 
3302 	return (done);
3303 }
3304 #elif defined(USE_SELECT)
3305 static void
process_fds(isc__socketthread_t * thread,int maxfd,fd_set * readfds,fd_set * writefds)3306 process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
3307 	    fd_set *writefds) {
3308 	int i;
3309 
3310 	REQUIRE(maxfd <= (int)thread->manager->maxsocks);
3311 
3312 	for (i = 0; i < maxfd; i++) {
3313 		if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
3314 			continue;
3315 		}
3316 		process_fd(thread, i, FD_ISSET(i, readfds),
3317 			   FD_ISSET(i, writefds));
3318 	}
3319 }
3320 #endif /* ifdef USE_KQUEUE */
3321 
3322 static bool
process_ctlfd(isc__socketthread_t * thread)3323 process_ctlfd(isc__socketthread_t *thread) {
3324 	int msg, fd;
3325 
3326 	for (;;) {
3327 		select_readmsg(thread, &fd, &msg);
3328 
3329 		thread_log(thread, IOEVENT,
3330 			   "watcher got message %d for socket %d", msg, fd);
3331 
3332 		/*
3333 		 * Nothing to read?
3334 		 */
3335 		if (msg == SELECT_POKE_NOTHING) {
3336 			break;
3337 		}
3338 
3339 		/*
3340 		 * Handle shutdown message.  We really should
3341 		 * jump out of this loop right away, but
3342 		 * it doesn't matter if we have to do a little
3343 		 * more work first.
3344 		 */
3345 		if (msg == SELECT_POKE_SHUTDOWN) {
3346 			return (true);
3347 		}
3348 
3349 		/*
3350 		 * This is a wakeup on a socket.  Look
3351 		 * at the event queue for both read and write,
3352 		 * and decide if we need to watch on it now
3353 		 * or not.
3354 		 */
3355 		wakeup_socket(thread, fd, msg);
3356 	}
3357 
3358 	return (false);
3359 }
3360 
3361 /*
3362  * This is the thread that will loop forever, always in a select or poll
3363  * call.
3364  *
3365  * When select returns something to do, do whatever's necessary and post
3366  * an event to the task that was requesting the action.
3367  */
/*
 * Watcher thread body: block in the platform's event-wait primitive
 * (kevent/epoll_wait/ioctl(DP_POLL)/select), dispatch ready fds through
 * process_fds(), and exit when a SELECT_POKE_SHUTDOWN control message
 * is received.  'uap' is the thread's isc__socketthread_t.
 */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc__socketmgr_t *manager = thread->manager;
	(void)manager;
	bool done;
	int cc;
	/* Pin the thread to a CPU only when there is more than one. */
	if (manager->nthreads > 1) {
		isc_thread_setaffinity(thread->threadid);
	}
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	while (!done) {
		/* Retry the wait call until it yields events (cc >= 0). */
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else  /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped.  Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			/* Hard (non-EINTR-style) errors are fatal. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			/*
			 * Workaround state machine for a reported Solaris
			 * /dev/poll kernel bug (spurious timeouts).
			 */
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}
3527 
/*
 * Record 'reserved' in the manager.  (NOTE(review): how the reserved
 * count is consumed is not visible in this part of the file — presumably
 * fds held back from socket creation; confirm against the rest of the
 * module.)
 */
void
isc_socketmgr_setreserved(isc_socketmgr_t *manager0, uint32_t reserved) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}
3536 
/*
 * Record the 'maxudp' limit in the manager.  (NOTE(review): the field's
 * consumers are not visible here — presumably a cap on UDP message size;
 * confirm against the rest of the module.)
 */
void
isc_socketmgr_maxudp(isc_socketmgr_t *manager0, unsigned int maxudp) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));

	manager->maxudp = maxudp;
}
3545 
3546 /*
3547  * Setup socket thread, thread->manager and thread->threadid must be filled.
3548  */
3549 
3550 static isc_result_t
setup_thread(isc__socketthread_t * thread)3551 setup_thread(isc__socketthread_t *thread) {
3552 	isc_result_t result = ISC_R_SUCCESS;
3553 	int i;
3554 	char strbuf[ISC_STRERRORSIZE];
3555 
3556 	REQUIRE(thread != NULL);
3557 	REQUIRE(VALID_MANAGER(thread->manager));
3558 	REQUIRE(thread->threadid >= 0 &&
3559 		thread->threadid < thread->manager->nthreads);
3560 
3561 	thread->fds = isc_mem_get(thread->manager->mctx,
3562 				  thread->manager->maxsocks *
3563 					  sizeof(isc__socket_t *));
3564 
3565 	memset(thread->fds, 0,
3566 	       thread->manager->maxsocks * sizeof(isc_socket_t *));
3567 
3568 	thread->fdstate = isc_mem_get(thread->manager->mctx,
3569 				      thread->manager->maxsocks * sizeof(int));
3570 
3571 	memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
3572 
3573 	thread->fdlock = isc_mem_get(thread->manager->mctx,
3574 				     FDLOCK_COUNT * sizeof(isc_mutex_t));
3575 
3576 	for (i = 0; i < FDLOCK_COUNT; i++) {
3577 		isc_mutex_init(&thread->fdlock[i]);
3578 	}
3579 
3580 	if (pipe(thread->pipe_fds) != 0) {
3581 		strerror_r(errno, strbuf, sizeof(strbuf));
3582 		UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
3583 				 strbuf);
3584 		return (ISC_R_UNEXPECTED);
3585 	}
3586 	RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
3587 
3588 #ifdef USE_KQUEUE
3589 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3590 	thread->events = isc_mem_get(thread->manager->mctx,
3591 				     sizeof(struct kevent) * thread->nevents);
3592 
3593 	thread->kqueue_fd = kqueue();
3594 	if (thread->kqueue_fd == -1) {
3595 		result = isc__errno2result(errno);
3596 		strerror_r(errno, strbuf, sizeof(strbuf));
3597 		UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
3598 				 strbuf);
3599 		isc_mem_put(thread->manager->mctx, thread->events,
3600 			    sizeof(struct kevent) * thread->nevents);
3601 		return (result);
3602 	}
3603 
3604 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3605 	if (result != ISC_R_SUCCESS) {
3606 		close(thread->kqueue_fd);
3607 		isc_mem_put(thread->manager->mctx, thread->events,
3608 			    sizeof(struct kevent) * thread->nevents);
3609 	}
3610 	return (result);
3611 
3612 #elif defined(USE_EPOLL)
3613 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3614 	thread->epoll_events =
3615 		isc_mem_get(thread->manager->mctx,
3616 			    (thread->manager->maxsocks * sizeof(uint32_t)));
3617 
3618 	memset(thread->epoll_events, 0,
3619 	       thread->manager->maxsocks * sizeof(uint32_t));
3620 
3621 	thread->events =
3622 		isc_mem_get(thread->manager->mctx,
3623 			    sizeof(struct epoll_event) * thread->nevents);
3624 
3625 	thread->epoll_fd = epoll_create(thread->nevents);
3626 	if (thread->epoll_fd == -1) {
3627 		result = isc__errno2result(errno);
3628 		strerror_r(errno, strbuf, sizeof(strbuf));
3629 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
3630 				 strbuf);
3631 		return (result);
3632 	}
3633 
3634 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3635 	return (result);
3636 
3637 #elif defined(USE_DEVPOLL)
3638 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3639 	result = isc_resource_getcurlimit(isc_resource_openfiles,
3640 					  &thread->open_max);
3641 	if (result != ISC_R_SUCCESS) {
3642 		thread->open_max = 64;
3643 	}
3644 	thread->calls = 0;
3645 	thread->events = isc_mem_get(thread->manager->mctx,
3646 				     sizeof(struct pollfd) * thread->nevents);
3647 
3648 	/*
3649 	 * Note: fdpollinfo should be able to support all possible FDs, so
3650 	 * it must have maxsocks entries (not nevents).
3651 	 */
3652 	thread->fdpollinfo =
3653 		isc_mem_get(thread->manager->mctx,
3654 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3655 	memset(thread->fdpollinfo, 0,
3656 	       sizeof(pollinfo_t) * thread->manager->maxsocks);
3657 	thread->devpoll_fd = open("/dev/poll", O_RDWR);
3658 	if (thread->devpoll_fd == -1) {
3659 		result = isc__errno2result(errno);
3660 		strerror_r(errno, strbuf, sizeof(strbuf));
3661 		UNEXPECTED_ERROR(__FILE__, __LINE__,
3662 				 "open(/dev/poll) failed: %s", strbuf);
3663 		isc_mem_put(thread->manager->mctx, thread->events,
3664 			    sizeof(struct pollfd) * thread->nevents);
3665 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3666 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3667 		return (result);
3668 	}
3669 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3670 	if (result != ISC_R_SUCCESS) {
3671 		close(thread->devpoll_fd);
3672 		isc_mem_put(thread->manager->mctx, thread->events,
3673 			    sizeof(struct pollfd) * thread->nevents);
3674 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3675 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3676 		return (result);
3677 	}
3678 
3679 	return (ISC_R_SUCCESS);
3680 #elif defined(USE_SELECT)
3681 	UNUSED(result);
3682 
3683 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3684 	/*
3685 	 * Note: this code should also cover the case of MAXSOCKETS <=
3686 	 * FD_SETSIZE, but we separate the cases to avoid possible portability
3687 	 * issues regarding howmany() and the actual representation of fd_set.
3688 	 */
3689 	thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3690 			     sizeof(fd_mask);
3691 #else  /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3692 	thread->fd_bufsize = sizeof(fd_set);
3693 #endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3694 
3695 	thread->read_fds = isc_mem_get(thread->manager->mctx,
3696 				       thread->fd_bufsize);
3697 	thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
3698 					    thread->fd_bufsize);
3699 	thread->write_fds = isc_mem_get(thread->manager->mctx,
3700 					thread->fd_bufsize);
3701 	thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
3702 					     thread->fd_bufsize);
3703 	memset(thread->read_fds, 0, thread->fd_bufsize);
3704 	memset(thread->write_fds, 0, thread->fd_bufsize);
3705 
3706 	(void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3707 	thread->maxfd = thread->pipe_fds[0];
3708 
3709 	return (ISC_R_SUCCESS);
3710 #endif /* USE_KQUEUE */
3711 }
3712 
3713 static void
cleanup_thread(isc_mem_t * mctx,isc__socketthread_t * thread)3714 cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
3715 	isc_result_t result;
3716 	int i;
3717 
3718 	result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3719 	if (result != ISC_R_SUCCESS) {
3720 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
3721 	}
3722 #ifdef USE_KQUEUE
3723 	close(thread->kqueue_fd);
3724 	isc_mem_put(mctx, thread->events,
3725 		    sizeof(struct kevent) * thread->nevents);
3726 #elif defined(USE_EPOLL)
3727 	close(thread->epoll_fd);
3728 
3729 	isc_mem_put(mctx, thread->events,
3730 		    sizeof(struct epoll_event) * thread->nevents);
3731 #elif defined(USE_DEVPOLL)
3732 	close(thread->devpoll_fd);
3733 	isc_mem_put(mctx, thread->events,
3734 		    sizeof(struct pollfd) * thread->nevents);
3735 	isc_mem_put(mctx, thread->fdpollinfo,
3736 		    sizeof(pollinfo_t) * thread->manager->maxsocks);
3737 #elif defined(USE_SELECT)
3738 	if (thread->read_fds != NULL) {
3739 		isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
3740 	}
3741 	if (thread->read_fds_copy != NULL) {
3742 		isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
3743 	}
3744 	if (thread->write_fds != NULL) {
3745 		isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
3746 	}
3747 	if (thread->write_fds_copy != NULL) {
3748 		isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
3749 	}
3750 #endif /* USE_KQUEUE */
3751 	for (i = 0; i < (int)thread->manager->maxsocks; i++) {
3752 		if (thread->fdstate[i] == CLOSE_PENDING) {
3753 			/* no need to lock */
3754 			(void)close(i);
3755 		}
3756 	}
3757 
3758 #if defined(USE_EPOLL)
3759 	isc_mem_put(thread->manager->mctx, thread->epoll_events,
3760 		    thread->manager->maxsocks * sizeof(uint32_t));
3761 #endif /* if defined(USE_EPOLL) */
3762 	isc_mem_put(thread->manager->mctx, thread->fds,
3763 		    thread->manager->maxsocks * sizeof(isc__socket_t *));
3764 	isc_mem_put(thread->manager->mctx, thread->fdstate,
3765 		    thread->manager->maxsocks * sizeof(int));
3766 
3767 	if (thread->fdlock != NULL) {
3768 		for (i = 0; i < FDLOCK_COUNT; i++) {
3769 			isc_mutex_destroy(&thread->fdlock[i]);
3770 		}
3771 		isc_mem_put(thread->manager->mctx, thread->fdlock,
3772 			    FDLOCK_COUNT * sizeof(isc_mutex_t));
3773 	}
3774 }
3775 
/*
 * Convenience wrapper: create a socket manager with the default socket
 * limit (maxsocks == 0 selects ISC_SOCKET_MAXSOCKETS in create2) and a
 * single watcher thread.
 */
isc_result_t
isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
	return (isc_socketmgr_create2(mctx, managerp, 0, 1));
}
3780 
3781 isc_result_t
isc_socketmgr_create2(isc_mem_t * mctx,isc_socketmgr_t ** managerp,unsigned int maxsocks,int nthreads)3782 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3783 		      unsigned int maxsocks, int nthreads) {
3784 	int i;
3785 	isc__socketmgr_t *manager;
3786 
3787 	REQUIRE(managerp != NULL && *managerp == NULL);
3788 
3789 	if (maxsocks == 0) {
3790 		maxsocks = ISC_SOCKET_MAXSOCKETS;
3791 	}
3792 
3793 	manager = isc_mem_get(mctx, sizeof(*manager));
3794 
3795 	/* zero-clear so that necessary cleanup on failure will be easy */
3796 	memset(manager, 0, sizeof(*manager));
3797 	manager->maxsocks = maxsocks;
3798 	manager->reserved = 0;
3799 	manager->maxudp = 0;
3800 	manager->nthreads = nthreads;
3801 	manager->stats = NULL;
3802 
3803 	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
3804 	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
3805 	manager->mctx = NULL;
3806 	ISC_LIST_INIT(manager->socklist);
3807 	isc_mutex_init(&manager->lock);
3808 	isc_condition_init(&manager->shutdown_ok);
3809 
3810 	/*
3811 	 * Start up the select/poll thread.
3812 	 */
3813 	manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
3814 						     manager->nthreads);
3815 	isc_mem_attach(mctx, &manager->mctx);
3816 
3817 	for (i = 0; i < manager->nthreads; i++) {
3818 		manager->threads[i].manager = manager;
3819 		manager->threads[i].threadid = i;
3820 		setup_thread(&manager->threads[i]);
3821 		isc_thread_create(netthread, &manager->threads[i],
3822 				  &manager->threads[i].thread);
3823 		char tname[1024];
3824 		sprintf(tname, "isc-socket-%d", i);
3825 		isc_thread_setname(manager->threads[i].thread, tname);
3826 	}
3827 
3828 	*managerp = (isc_socketmgr_t *)manager;
3829 
3830 	return (ISC_R_SUCCESS);
3831 }
3832 
/*
 * Report the manager's maximum socket count in *nsockp.
 * Always returns ISC_R_SUCCESS.
 */
isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	*nsockp = manager->maxsocks;

	return (ISC_R_SUCCESS);
}
3843 
/*
 * Attach a statistics counter set to the manager.  May only be called
 * once, before any sockets exist, and 'stats' must have exactly
 * isc_sockstatscounter_max counters.
 */
void
isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats) {
	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	isc_stats_attach(stats, &manager->stats);
}
3855 
/*
 * Destroy a socket manager: wait until every socket has been destroyed,
 * shut down and join the watcher threads, then release all manager
 * resources.  *managerp is set to NULL on return.
 */
void
isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc__socketmgr_t *manager;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = (isc__socketmgr_t *)*managerp;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "sockets exist");
		/* shutdown_ok is signalled as sockets go away. */
		WAIT(&manager->shutdown_ok, &manager->lock);
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
	}

	/*
	 * Wait for thread to exit.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		isc_thread_join(manager->threads[i].thread, NULL);
		cleanup_thread(manager->mctx, &manager->threads[i]);
	}
	/*
	 * Clean up.
	 */
	isc_mem_put(manager->mctx, manager->threads,
		    sizeof(isc__socketthread_t) * manager->nthreads);
	(void)isc_condition_destroy(&manager->shutdown_ok);

	if (manager->stats != NULL) {
		isc_stats_detach(&manager->stats);
	}
	isc_mutex_destroy(&manager->lock);
	/* Invalidate the magic numbers before freeing. */
	manager->common.magic = 0;
	manager->common.impmagic = 0;
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));

	*managerp = NULL;
}
3913 
/*
 * Common receive path: attempt I/O immediately where allowed, otherwise
 * queue 'dev' on sock->recv_list and poke the watcher.  For UDP the I/O
 * is always tried without taking sock->lock; for other socket types the
 * lock is taken and I/O is attempted only if the queue is empty (to
 * preserve ordering).  With ISC_SOCKFLAG_IMMEDIATE set, a queued request
 * yields ISC_R_INPROGRESS and a completed one is NOT sent as an event
 * (the caller consumes the result directly).
 */
static isc_result_t
socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		/* Only bypass the queue when nothing is already waiting. */
		if (ISC_LIST_EMPTY(sock->recv_list)) {
			io_state = doio_recv(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* IMMEDIATE callers read dev directly; no event is sent. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
3990 
3991 isc_result_t
isc_socket_recv(isc_socket_t * sock0,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,void * arg)3992 isc_socket_recv(isc_socket_t *sock0, isc_region_t *region, unsigned int minimum,
3993 		isc_task_t *task, isc_taskaction_t action, void *arg) {
3994 	isc__socket_t *sock = (isc__socket_t *)sock0;
3995 	isc_socketevent_t *dev;
3996 	isc__socketmgr_t *manager;
3997 
3998 	REQUIRE(VALID_SOCKET(sock));
3999 	REQUIRE(action != NULL);
4000 
4001 	manager = sock->manager;
4002 	REQUIRE(VALID_MANAGER(manager));
4003 
4004 	INSIST(sock->bound);
4005 
4006 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
4007 				   action, arg);
4008 	if (dev == NULL) {
4009 		return (ISC_R_NOMEMORY);
4010 	}
4011 
4012 	return (isc_socket_recv2(sock0, region, minimum, task, dev, 0));
4013 }
4014 
4015 isc_result_t
isc_socket_recv2(isc_socket_t * sock0,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_socketevent_t * event,unsigned int flags)4016 isc_socket_recv2(isc_socket_t *sock0, isc_region_t *region,
4017 		 unsigned int minimum, isc_task_t *task,
4018 		 isc_socketevent_t *event, unsigned int flags) {
4019 	isc__socket_t *sock = (isc__socket_t *)sock0;
4020 
4021 	event->ev_sender = sock;
4022 	event->result = ISC_R_UNSET;
4023 	event->region = *region;
4024 	event->n = 0;
4025 	event->offset = 0;
4026 	event->attributes = 0;
4027 
4028 	/*
4029 	 * UDP sockets are always partial read.
4030 	 */
4031 	if (sock->type == isc_sockettype_udp) {
4032 		event->minimum = 1;
4033 	} else {
4034 		if (minimum == 0) {
4035 			event->minimum = region->length;
4036 		} else {
4037 			event->minimum = minimum;
4038 		}
4039 	}
4040 
4041 	return (socket_recv(sock, event, task, flags));
4042 }
4043 
/*
 * Start or queue a send on 'sock' described by event 'dev', with
 * completion delivered to 'task'.  'address' (and, for IPv6,
 * 'pktinfo') identify the destination for unconnected sockets.
 *
 * For UDP sockets the I/O is attempted immediately without taking the
 * socket lock.  For other socket types the I/O is attempted only when
 * no earlier sends are pending, so queued requests complete in order.
 *
 * Returns ISC_R_SUCCESS normally; returns ISC_R_INPROGRESS when the
 * caller passed ISC_SOCKFLAG_IMMEDIATE but the I/O had to be queued.
 */
static isc_result_t
socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		/*
		 * Only honor the caller's interface index for site- or
		 * link-local destinations, where it is required.
		 */
		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		/*
		 * Attempt the write directly only if nothing is already
		 * queued; otherwise queue behind it to preserve ordering.
		 */
		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/*
		 * NORETRY was set: treat the soft failure like a
		 * completed send and report it to the caller right away.
		 */
		/* FALLTHROUGH */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4145 
4146 isc_result_t
isc_socket_send(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg)4147 isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4148 		isc_taskaction_t action, void *arg) {
4149 	/*
4150 	 * REQUIRE() checking is performed in isc_socket_sendto().
4151 	 */
4152 	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
4153 }
4154 
4155 isc_result_t
isc_socket_sendto(isc_socket_t * sock0,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo)4156 isc_socket_sendto(isc_socket_t *sock0, isc_region_t *region, isc_task_t *task,
4157 		  isc_taskaction_t action, void *arg,
4158 		  const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
4159 	isc__socket_t *sock = (isc__socket_t *)sock0;
4160 	isc_socketevent_t *dev;
4161 	isc__socketmgr_t *manager;
4162 
4163 	REQUIRE(VALID_SOCKET(sock));
4164 	REQUIRE(region != NULL);
4165 	REQUIRE(task != NULL);
4166 	REQUIRE(action != NULL);
4167 
4168 	manager = sock->manager;
4169 	REQUIRE(VALID_MANAGER(manager));
4170 
4171 	INSIST(sock->bound);
4172 
4173 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
4174 				   action, arg);
4175 	if (dev == NULL) {
4176 		return (ISC_R_NOMEMORY);
4177 	}
4178 
4179 	dev->region = *region;
4180 
4181 	return (socket_send(sock, dev, task, address, pktinfo, 0));
4182 }
4183 
4184 isc_result_t
isc_socket_sendto2(isc_socket_t * sock0,isc_region_t * region,isc_task_t * task,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo,isc_socketevent_t * event,unsigned int flags)4185 isc_socket_sendto2(isc_socket_t *sock0, isc_region_t *region, isc_task_t *task,
4186 		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4187 		   isc_socketevent_t *event, unsigned int flags) {
4188 	isc__socket_t *sock = (isc__socket_t *)sock0;
4189 
4190 	REQUIRE(VALID_SOCKET(sock));
4191 	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
4192 		0);
4193 	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
4194 		REQUIRE(sock->type == isc_sockettype_udp);
4195 	}
4196 	event->ev_sender = sock;
4197 	event->result = ISC_R_UNSET;
4198 	event->region = *region;
4199 	event->n = 0;
4200 	event->offset = 0;
4201 	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
4202 
4203 	return (socket_send(sock, event, task, address, pktinfo, flags));
4204 }
4205 
/*
 * Clean up the filesystem entry for a UNIX-domain socket path.
 *
 * If 'active' is true, the path belongs to a socket we own: verify it
 * exists and is a socket (or FIFO) and unlink it, logging errors.
 *
 * If 'active' is false, the path may be a stale leftover from a
 * previous run: probe it by connecting.  The path is unlinked only
 * when the connect fails with ECONNREFUSED/ECONNRESET, i.e. nothing
 * is listening there anymore.
 *
 * No-op for non-AF_UNIX addresses or when AF_UNIX support is
 * compiled out.
 */
void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		return;
	}

/*
 * Provide S_ISSOCK/S_ISFIFO on platforms that only expose the raw
 * S_IFMT mode bits.
 */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	if (active) {
		/* The socket is ours: check it, then remove the path. */
		if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/*
	 * Not our socket: create a probe socket so we can test whether
	 * anything is still listening on the path.
	 */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT: /* We exited cleanly last time */
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
		goto cleanup;
	}

	if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: %s: not a socket",
			      sockaddr->type.sunix.sun_path);
		goto cleanup;
	}

	/*
	 * Probe the path.  A refused/reset connection means the old
	 * listener is gone, so the stale path can be safely removed.
	 */
	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
cleanup:
	close(s);
#else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
	UNUSED(sockaddr);
	UNUSED(active);
#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
}
4338 
4339 isc_result_t
isc_socket_permunix(const isc_sockaddr_t * sockaddr,uint32_t perm,uint32_t owner,uint32_t group)4340 isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
4341 		    uint32_t owner, uint32_t group) {
4342 #ifdef ISC_PLATFORM_HAVESYSUNH
4343 	isc_result_t result = ISC_R_SUCCESS;
4344 	char strbuf[ISC_STRERRORSIZE];
4345 	char path[sizeof(sockaddr->type.sunix.sun_path)];
4346 #ifdef NEED_SECURE_DIRECTORY
4347 	char *slash;
4348 #endif /* ifdef NEED_SECURE_DIRECTORY */
4349 
4350 	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4351 	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4352 	strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
4353 
4354 #ifdef NEED_SECURE_DIRECTORY
4355 	slash = strrchr(path, '/');
4356 	if (slash != NULL) {
4357 		if (slash != path) {
4358 			*slash = '\0';
4359 		} else {
4360 			strlcpy(path, "/", sizeof(path));
4361 		}
4362 	} else {
4363 		strlcpy(path, ".", sizeof(path));
4364 	}
4365 #endif /* ifdef NEED_SECURE_DIRECTORY */
4366 
4367 	if (chmod(path, perm) < 0) {
4368 		strerror_r(errno, strbuf, sizeof(strbuf));
4369 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4370 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4371 			      "isc_socket_permunix: chmod(%s, %d): %s", path,
4372 			      perm, strbuf);
4373 		result = ISC_R_FAILURE;
4374 	}
4375 	if (chown(path, owner, group) < 0) {
4376 		strerror_r(errno, strbuf, sizeof(strbuf));
4377 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4378 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4379 			      "isc_socket_permunix: chown(%s, %d, %d): %s",
4380 			      path, owner, group, strbuf);
4381 		result = ISC_R_FAILURE;
4382 	}
4383 	return (result);
4384 #else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
4385 	UNUSED(sockaddr);
4386 	UNUSED(perm);
4387 	UNUSED(owner);
4388 	UNUSED(group);
4389 	return (ISC_R_NOTIMPLEMENTED);
4390 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
4391 }
4392 
/*
 * Bind 'sock' to 'sockaddr'.  The address family must match the
 * socket's, and the socket must not already be bound or duplicated.
 *
 * For non-AF_UNIX sockets bound to a specific (non-zero) port with
 * ISC_SOCKET_REUSEADDRESS requested, SO_REUSEADDR (and, where
 * available, SO_REUSEPORT/SO_REUSEPORT_LB) is set first; failures
 * there are logged but not fatal.  bind() errno values are mapped to
 * ISC result codes.
 */
isc_result_t
isc_socket_bind(isc_socket_t *sock0, const isc_sockaddr_t *sockaddr,
		isc_socket_options_t options) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	INSIST(!sock->bound);
	INSIST(!sock->dupped);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	if (sock->pf == AF_UNIX) {
		goto bind_socket;
	}
#endif /* ifdef AF_UNIX */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
	{
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
			       (void *)&on, sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#elif defined(__linux__) && defined(SO_REUSEPORT)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#endif		/* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
		/* Press on... setsockopt failures above are not fatal. */
	}
#ifdef AF_UNIX
bind_socket:
#endif /* ifdef AF_UNIX */
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		/* Translate common bind() errors into ISC_R_* codes. */
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4475 
/*
 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
 * disabled by default.  See RT22589 for details.
 */
4481 #undef ENABLE_ACCEPTFILTER
4482 
/*
 * Install a BSD accept filter named 'filter' on a listening socket.
 *
 * Compiled to a no-op returning ISC_R_NOTIMPLEMENTED unless both
 * SO_ACCEPTFILTER and ENABLE_ACCEPTFILTER are defined (the latter is
 * force-undefined above; see RT22589).  Returns ISC_R_FAILURE if
 * setsockopt() rejects the filter.
 */
isc_result_t
isc_socket_filter(isc_socket_t *sock0, const char *filter) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	UNUSED(sock);
	UNUSED(filter);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	bzero(&afa, sizeof(afa));
	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
		       sizeof(afa)) == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION,
			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	return (ISC_R_NOTIMPLEMENTED);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
}
4511 
/*
 * Try enabling TCP Fast Open for a given socket if the OS supports it.
 * Failures are logged but never fatal, since TFO is a best-effort
 * optimization.
 */
static void
set_tcp_fastopen(isc__socket_t *sock, unsigned int backlog) {
#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
	char strbuf[ISC_STRERRORSIZE];

/*
 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while also
 * shipping a default kernel without TFO support, so we special-case it by
 * performing an additional runtime check for TFO support using sysctl to
 * prevent setsockopt() errors from being logged.
 */
#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
	unsigned int enabled;
	size_t enabledlen = sizeof(enabled);
	static bool tfo_notice_logged = false;

	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
		/*
		 * This kernel does not support TCP Fast Open.  There is
		 * nothing more we can do.
		 */
		return;
	} else if (enabled == 0) {
		/*
		 * This kernel does support TCP Fast Open, but it is disabled
		 * by sysctl.  Notify the user, but do not nag.
		 */
		if (!tfo_notice_logged) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
				      "TCP_FASTOPEN support is disabled by "
				      "sysctl (" SYSCTL_TFO " = 0)");
			tfo_notice_logged = true;
		}
		return;
	}
#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */

	/*
	 * The TCP_FASTOPEN option value is the TFO queue length: half
	 * of the listen backlog here (minimum 1).  On Apple platforms
	 * the value is apparently treated as a boolean, hence the fixed
	 * 1 — NOTE(review): confirm against current macOS docs.
	 */
#ifdef __APPLE__
	backlog = 1;
#else  /* ifdef __APPLE__ */
	backlog = backlog / 2;
	if (backlog == 0) {
		backlog = 1;
	}
#endif /* ifdef __APPLE__ */
	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
		       sizeof(backlog)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
				 sock->fd, strbuf);
		/* TCP_FASTOPEN is experimental so ignore failures */
	}
#else  /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
	UNUSED(sock);
	UNUSED(backlog);
#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
}
4576 
4577 /*
4578  * Set up to listen on a given socket.  We do this by creating an internal
4579  * event that will be dispatched when the socket has read activity.  The
4580  * watcher will send the internal event to the task when there is a new
4581  * connection.
4582  *
4583  * Unlike in read, we don't preallocate a done event here.  Every time there
4584  * is a new connection we'll have to allocate a new one anyway, so we might
4585  * as well keep things simple rather than having to track them.
4586  */
4587 isc_result_t
isc_socket_listen(isc_socket_t * sock0,unsigned int backlog)4588 isc_socket_listen(isc_socket_t *sock0, unsigned int backlog) {
4589 	isc__socket_t *sock = (isc__socket_t *)sock0;
4590 	char strbuf[ISC_STRERRORSIZE];
4591 
4592 	REQUIRE(VALID_SOCKET(sock));
4593 
4594 	LOCK(&sock->lock);
4595 
4596 	REQUIRE(!sock->listener);
4597 	REQUIRE(sock->bound);
4598 	REQUIRE(sock->type == isc_sockettype_tcp ||
4599 		sock->type == isc_sockettype_unix);
4600 
4601 	if (backlog == 0) {
4602 		backlog = SOMAXCONN;
4603 	}
4604 
4605 	if (listen(sock->fd, (int)backlog) < 0) {
4606 		UNLOCK(&sock->lock);
4607 		strerror_r(errno, strbuf, sizeof(strbuf));
4608 
4609 		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4610 
4611 		return (ISC_R_UNEXPECTED);
4612 	}
4613 
4614 	set_tcp_fastopen(sock, backlog);
4615 
4616 	sock->listener = 1;
4617 
4618 	UNLOCK(&sock->lock);
4619 	return (ISC_R_SUCCESS);
4620 }
4621 
/*
 * This should try to do aggressive accept() XXXMLG
 */
/*
 * Queue an accept on a listening socket: allocate a NEWCONN event and
 * a fresh socket for the incoming connection, enqueue them on the
 * accept list, and poke the watcher if the list was previously empty.
 * The event is delivered to 'task' when a connection arrives.
 */
isc_result_t
isc_socket_accept(isc_socket_t *sock0, isc_task_t *task,
		  isc_taskaction_t action, void *arg) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socket_newconnev_t *dev;
	isc__socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc__socket_t *nsock;
	isc_result_t result;
	bool do_poke = false;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)isc_event_allocate(
		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		/* Undo the event allocation before bailing out. */
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		/* Task is shutting down; release everything we took. */
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	isc_refcount_increment0(&nsock->references);
	/* New connections inherit the listener's statistics bucket. */
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = (isc_socket_t *)nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	do_poke = ISC_LIST_EMPTY(sock->accept_list);
	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
	if (do_poke) {
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_ACCEPT);
	}
	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4692 
/*
 * Start a connect on 'sock' to 'addr', delivering an
 * ISC_SOCKEVENT_CONNECT event to 'task' when done.
 *
 * The connect is attempted immediately; if it cannot complete right
 * away (EINPROGRESS / soft error) the event is queued and the watcher
 * is poked so internal_connect() finishes the job.  If the socket is
 * already connected (or a connect to the same peer is pending) the
 * request is satisfied from that state instead.  Returns
 * ISC_R_SUCCESS whenever the event will be delivered — even when the
 * connect itself failed (the failure is reported in dev->result) —
 * and an error result only when no event could be set up.
 */
isc_result_t
isc_socket_connect(isc_socket_t *sock0, const isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, void *arg) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc__socketmgr_t *manager;
	int cc;
	char strbuf[ISC_STRERRORSIZE];
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(addr != NULL);

	if (isc_sockaddr_ismulticast(addr)) {
		return (ISC_R_MULTICAST);
	}

	LOCK(&sock->lock);

	dev = (isc_socket_connev_t *)isc_event_allocate(
		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/*
	 * A connect to the same peer is already in flight: just queue
	 * this request behind it.
	 */
	if (sock->connecting) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		goto queue;
	}

	/*
	 * Already connected to this peer: complete immediately.
	 */
	if (sock->connected) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		return (ISC_R_SUCCESS);
	}

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * The socket is nonblocking and the connection cannot be
		 * completed immediately.  It is possible to select(2) or
		 * poll(2) for completion by selecting the socket for writing.
		 * After select(2) indicates writability, use getsockopt(2) to
		 * read the SO_ERROR option at level SOL_SOCKET to determine
		 * whether connect() completed successfully (SO_ERROR is zero)
		 * or unsuccessfully (SO_ERROR is one of the usual error codes
		 * listed here, explaining the reason for the failure).
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			/*
			 * UDP connect is treated as instantly successful;
			 * NOTE(review): presumably EINPROGRESS here is a
			 * quirk of some platforms — confirm.
			 */
			cc = 0;
			goto success;
		}
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			goto queue;
		}

		/* Map well-known connect() errors to ISC results. */
		switch (errno) {
#define ERROR_MATCH(a, b)        \
	case a:                  \
		dev->result = b; \
		goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		/* Unrecognized errno: report as unexpected; event freed. */
		sock->connected = 0;

		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
				 addrbuf, errno, strbuf);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		/* Known failure: deliver it via the event's result code. */
		sock->connected = 0;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);

		return (ISC_R_SUCCESS);
	}

queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
	if (do_poke && !sock->connecting) {
		sock->connecting = 1;
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_CONNECT);
	}

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4852 
4853 /*
4854  * Called when a socket with a pending connect() finishes.
4855  */
4856 static void
internal_connect(isc__socket_t * sock)4857 internal_connect(isc__socket_t *sock) {
4858 	isc_socket_connev_t *dev;
4859 	int cc;
4860 	isc_result_t result;
4861 	socklen_t optlen;
4862 	char strbuf[ISC_STRERRORSIZE];
4863 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
4864 
4865 	INSIST(VALID_SOCKET(sock));
4866 	REQUIRE(sock->fd >= 0);
4867 
4868 	/*
4869 	 * Get the first item off the connect list.
4870 	 * If it is empty, unlock the socket and return.
4871 	 */
4872 	dev = ISC_LIST_HEAD(sock->connect_list);
4873 	if (dev == NULL) {
4874 		INSIST(!sock->connecting);
4875 		goto finish;
4876 	}
4877 
4878 	INSIST(sock->connecting);
4879 	sock->connecting = 0;
4880 
4881 	/*
4882 	 * Get any possible error status here.
4883 	 */
4884 	optlen = sizeof(cc);
4885 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
4886 		       (void *)&optlen) != 0)
4887 	{
4888 		cc = errno;
4889 	} else {
4890 		errno = cc;
4891 	}
4892 
4893 	if (errno != 0) {
4894 		/*
4895 		 * If the error is EAGAIN, just re-select on this
4896 		 * fd and pretend nothing strange happened.
4897 		 */
4898 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4899 			sock->connecting = 1;
4900 			return;
4901 		}
4902 
4903 		inc_stats(sock->manager->stats,
4904 			  sock->statsindex[STATID_CONNECTFAIL]);
4905 
4906 		/*
4907 		 * Translate other errors into ISC_R_* flavors.
4908 		 */
4909 		switch (errno) {
4910 #define ERROR_MATCH(a, b)   \
4911 	case a:             \
4912 		result = b; \
4913 		break;
4914 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
4915 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4916 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4917 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4918 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4919 #ifdef EHOSTDOWN
4920 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4921 #endif /* ifdef EHOSTDOWN */
4922 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4923 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4924 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4925 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4926 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4927 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4928 #undef ERROR_MATCH
4929 		default:
4930 			result = ISC_R_UNEXPECTED;
4931 			isc_sockaddr_format(&sock->peer_address, peerbuf,
4932 					    sizeof(peerbuf));
4933 			strerror_r(errno, strbuf, sizeof(strbuf));
4934 			UNEXPECTED_ERROR(__FILE__, __LINE__,
4935 					 "internal_connect: connect(%s) %s",
4936 					 peerbuf, strbuf);
4937 		}
4938 	} else {
4939 		inc_stats(sock->manager->stats,
4940 			  sock->statsindex[STATID_CONNECT]);
4941 		result = ISC_R_SUCCESS;
4942 		sock->connected = 1;
4943 		sock->bound = 1;
4944 	}
4945 
4946 	do {
4947 		dev->result = result;
4948 		send_connectdone_event(sock, &dev);
4949 		dev = ISC_LIST_HEAD(sock->connect_list);
4950 	} while (dev != NULL);
4951 
4952 finish:
4953 	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
4954 		   SELECT_POKE_CONNECT);
4955 	UNLOCK(&sock->lock);
4956 }
4957 
4958 isc_result_t
isc_socket_getpeername(isc_socket_t * sock0,isc_sockaddr_t * addressp)4959 isc_socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
4960 	isc__socket_t *sock = (isc__socket_t *)sock0;
4961 	isc_result_t result;
4962 
4963 	REQUIRE(VALID_SOCKET(sock));
4964 	REQUIRE(addressp != NULL);
4965 
4966 	LOCK(&sock->lock);
4967 
4968 	if (sock->connected) {
4969 		*addressp = sock->peer_address;
4970 		result = ISC_R_SUCCESS;
4971 	} else {
4972 		result = ISC_R_NOTCONNECTED;
4973 	}
4974 
4975 	UNLOCK(&sock->lock);
4976 
4977 	return (result);
4978 }
4979 
4980 isc_result_t
isc_socket_getsockname(isc_socket_t * sock0,isc_sockaddr_t * addressp)4981 isc_socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
4982 	isc__socket_t *sock = (isc__socket_t *)sock0;
4983 	socklen_t len;
4984 	isc_result_t result;
4985 	char strbuf[ISC_STRERRORSIZE];
4986 
4987 	REQUIRE(VALID_SOCKET(sock));
4988 	REQUIRE(addressp != NULL);
4989 
4990 	LOCK(&sock->lock);
4991 
4992 	if (!sock->bound) {
4993 		result = ISC_R_NOTBOUND;
4994 		goto out;
4995 	}
4996 
4997 	result = ISC_R_SUCCESS;
4998 
4999 	len = sizeof(addressp->type);
5000 	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5001 		strerror_r(errno, strbuf, sizeof(strbuf));
5002 		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
5003 		result = ISC_R_UNEXPECTED;
5004 		goto out;
5005 	}
5006 	addressp->length = (unsigned int)len;
5007 
5008 out:
5009 	UNLOCK(&sock->lock);
5010 
5011 	return (result);
5012 }
5013 
5014 /*
5015  * Run through the list of events on this socket, and cancel the ones
5016  * queued for task "task" of type "how".  "how" is a bitmask.
5017  */
void
isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0) {
		return;
	}

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	/*
	 * Cancel pending receives: post each matching event (task == NULL
	 * matches everything) with ISC_R_CANCELED.  'next' is saved before
	 * delivery because send_recvdone_event() unlinks the event.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
	    !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	/* Cancel pending sends, same pattern as receives above. */
	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
	    !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	/*
	 * Cancel pending accepts.  Each queued accept owns a pre-created
	 * child socket: drop its reference and free it before posting the
	 * canceled event back to the waiting task.
	 */
	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
	    !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				(void)isc_refcount_decrement(
					&NEWCONNSOCK(dev)->references);
				free_socket((isc__socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendtoanddetach(&current_task,
							 ISC_EVENT_PTR(&dev),
							 sock->threadid);
			}

			dev = next;
		}
	}

	/*
	 * Cancel pending connects.  A non-empty connect list implies the
	 * socket is mid-connect, so clear the 'connecting' flag before
	 * delivering the canceled events.
	 */
	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
	    !ISC_LIST_EMPTY(sock->connect_list))
	{
		isc_socket_connev_t *dev;
		isc_socket_connev_t *next;
		isc_task_t *current_task;

		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = ISC_LIST_HEAD(sock->connect_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_connectdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	UNLOCK(&sock->lock);
}
5140 
5141 isc_sockettype_t
isc_socket_gettype(isc_socket_t * sock0)5142 isc_socket_gettype(isc_socket_t *sock0) {
5143 	isc__socket_t *sock = (isc__socket_t *)sock0;
5144 
5145 	REQUIRE(VALID_SOCKET(sock));
5146 
5147 	return (sock->type);
5148 }
5149 
/*
 * Enable or disable IPV6_V6ONLY on an AF_INET6 socket.  A no-op when
 * the platform lacks IPV6_V6ONLY or the socket is not IPv6; setsockopt
 * failures are logged but not fatal.
 */
void
isc_socket_ipv6only(isc_socket_t *sock0, bool yes) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(IPV6_V6ONLY)
	int onoff = yes ? 1 : 0;
#else  /* if defined(IPV6_V6ONLY) */
	UNUSED(yes);
	UNUSED(sock);
#endif /* if defined(IPV6_V6ONLY) */

	REQUIRE(VALID_SOCKET(sock));
	/* Duplicated sockets share the fd; changing the option is unsafe. */
	INSIST(!sock->dupped);

#ifdef IPV6_V6ONLY
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
	}
#endif /* ifdef IPV6_V6ONLY */
}
5178 
5179 static void
setdscp(isc__socket_t * sock,isc_dscp_t dscp)5180 setdscp(isc__socket_t *sock, isc_dscp_t dscp) {
5181 #if defined(IP_TOS) || defined(IPV6_TCLASS)
5182 	int value = dscp << 2;
5183 #endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
5184 
5185 	sock->dscp = dscp;
5186 
5187 #ifdef IP_TOS
5188 	if (sock->pf == AF_INET) {
5189 		if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
5190 			       sizeof(value)) < 0) {
5191 			char strbuf[ISC_STRERRORSIZE];
5192 			strerror_r(errno, strbuf, sizeof(strbuf));
5193 			UNEXPECTED_ERROR(__FILE__, __LINE__,
5194 					 "setsockopt(%d, IP_TOS, %.02x) "
5195 					 "failed: %s",
5196 					 sock->fd, value >> 2, strbuf);
5197 		}
5198 	}
5199 #endif /* ifdef IP_TOS */
5200 #ifdef IPV6_TCLASS
5201 	if (sock->pf == AF_INET6) {
5202 		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
5203 			       (void *)&value, sizeof(value)) < 0)
5204 		{
5205 			char strbuf[ISC_STRERRORSIZE];
5206 			strerror_r(errno, strbuf, sizeof(strbuf));
5207 			UNEXPECTED_ERROR(__FILE__, __LINE__,
5208 					 "setsockopt(%d, IPV6_TCLASS, %.02x) "
5209 					 "failed: %s",
5210 					 sock->fd, dscp >> 2, strbuf);
5211 		}
5212 	}
5213 #endif /* ifdef IPV6_TCLASS */
5214 }
5215 
/*
 * Public entry point for setting the socket's DSCP value (0..0x3f).
 * A negative value means "unset" and is ignored.
 */
void
isc_socket_dscp(isc_socket_t *sock0, isc_dscp_t dscp) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40);

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else  /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

#ifdef notyet
	REQUIRE(!sock->dupped);
#endif /* ifdef notyet */

	setdscp(sock, dscp);
}
5242 
/*
 * Public wrapper around allocate_socketevent(): allocate a socket event
 * of type 'eventtype' with the given sender, action and argument.
 * Ownership of the event passes to the caller.
 */
isc_socketevent_t *
isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
		       isc_taskaction_t action, void *arg) {
	return (allocate_socketevent(mctx, sender, eventtype, action, arg));
}
5248 
5249 void
isc_socket_setname(isc_socket_t * socket0,const char * name,void * tag)5250 isc_socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
5251 	isc__socket_t *sock = (isc__socket_t *)socket0;
5252 
5253 	/*
5254 	 * Name 'sock'.
5255 	 */
5256 
5257 	REQUIRE(VALID_SOCKET(sock));
5258 
5259 	LOCK(&sock->lock);
5260 	strlcpy(sock->name, name, sizeof(sock->name));
5261 	sock->tag = tag;
5262 	UNLOCK(&sock->lock);
5263 }
5264 
/*
 * Return the debugging name set via isc_socket_setname().
 * NOTE(review): read without taking sock->lock — presumably callers
 * rely on names being set before concurrent access; confirm.
 */
const char *
isc_socket_getname(isc_socket_t *socket0) {
	isc__socket_t *sock = (isc__socket_t *)socket0;

	return (sock->name);
}
5271 
/*
 * Return the opaque tag set via isc_socket_setname().
 */
void *
isc_socket_gettag(isc_socket_t *socket0) {
	isc__socket_t *sock = (isc__socket_t *)socket0;

	return (sock->tag);
}
5278 
5279 int
isc_socket_getfd(isc_socket_t * socket0)5280 isc_socket_getfd(isc_socket_t *socket0) {
5281 	isc__socket_t *sock = (isc__socket_t *)socket0;
5282 
5283 	return ((short)sock->fd);
5284 }
5285 
/* One-time initializer guard and cached probe result for SO_REUSEPORT. */
static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
static bool hasreuseport = false;

static void
init_hasreuseport() {
/*
 * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
 * We only want to use it on Linux, if it's available. On BSD we want to dup()
 * sockets instead of re-binding them.
 */
#if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
	int sock, yes = 1;
	/* Probe with a throwaway UDP socket; try IPv4 first, then IPv6. */
	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock < 0) {
		sock = socket(AF_INET6, SOCK_DGRAM, 0);
		if (sock < 0) {
			return;
		}
	}
	/*
	 * 'hasreuseport' stays false unless both SO_REUSEADDR and the
	 * platform's load-balancing reuse option can be set.
	 */
	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
		       sizeof(yes)) < 0) {
		close(sock);
		return;
#if defined(__FreeBSD_kernel__)
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
			      sizeof(yes)) < 0)
#else  /* if defined(__FreeBSD_kernel__) */
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
			      sizeof(yes)) < 0)
#endif /* if defined(__FreeBSD_kernel__) */
	{
		close(sock);
		return;
	}
	hasreuseport = true;
	close(sock);
#endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
}
5326 
/*
 * Report whether SO_REUSEPORT(_LB) load balancing is usable on this
 * platform; the probe in init_hasreuseport() runs exactly once.
 */
bool
isc_socket_hasreuseport() {
	RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
		      ISC_R_SUCCESS);
	return (hasreuseport);
}
5333 
5334 #if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
5335 static const char *
_socktype(isc_sockettype_t type)5336 _socktype(isc_sockettype_t type) {
5337 	switch (type) {
5338 	case isc_sockettype_udp:
5339 		return ("udp");
5340 	case isc_sockettype_tcp:
5341 		return ("tcp");
5342 	case isc_sockettype_unix:
5343 		return ("unix");
5344 	default:
5345 		return ("not-initialized");
5346 	}
5347 }
5348 #endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
5349 
5350 #ifdef HAVE_LIBXML2
5351 #define TRY0(a)                     \
5352 	do {                        \
5353 		xmlrc = (a);        \
5354 		if (xmlrc < 0)      \
5355 			goto error; \
5356 	} while (0)
/*
 * Render the manager's socket list as XML under a <sockets> element.
 * Each TRY0() jumps to 'error' on a negative libxml2 return code;
 * on that path the current socket (if any) is still locked and is
 * unlocked before returning.  Returns the last xmlTextWriter result
 * (negative on error).
 */
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, void *writer0) {
	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
	isc__socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	int xmlrc;
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		/* The socket's address doubles as its unique id. */
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR peerbuf));
		}

		/* Local address is best-effort; skip on getsockname error. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR peerbuf));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	/* Non-NULL here means we bailed out mid-loop with the lock held. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
5454 #endif /* HAVE_LIBXML2 */
5455 
5456 #ifdef HAVE_JSON_C
5457 #define CHECKMEM(m)                              \
5458 	do {                                     \
5459 		if (m == NULL) {                 \
5460 			result = ISC_R_NOMEMORY; \
5461 			goto error;              \
5462 		}                                \
5463 	} while (0)
5464 
/*
 * Render the manager's socket list into 'stats0' (a json_object) under
 * the "sockets" key.  CHECKMEM() jumps to 'error' with ISC_R_NOMEMORY
 * when a json-c allocation fails; the partially-built array is released
 * there.  On success ownership of the array passes to 'stats'.
 */
isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t *mgr0, void *stats0) {
	isc_result_t result = ISC_R_SUCCESS;
	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
	isc__socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	json_object *obj, *array = json_object_new_array();
	json_object *stats = (json_object *)stats0;

	CHECKMEM(array);

	LOCK(&mgr->lock);

	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		json_object *states, *entry = json_object_new_object();
		char buf[255];

		/*
		 * NOTE(review): if this CHECKMEM fires, 'sock' is non-NULL
		 * but not yet locked, and the error path below unlocks it
		 * anyway — confirm whether that path is reachable/safe.
		 */
		CHECKMEM(entry);
		/* 'entry' is now owned by 'array'; no separate put needed. */
		json_object_array_add(array, entry);

		LOCK(&sock->lock);

		snprintf(buf, sizeof(buf), "%p", sock);
		obj = json_object_new_string(buf);
		CHECKMEM(obj);
		json_object_object_add(entry, "id", obj);

		if (sock->name[0] != 0) {
			obj = json_object_new_string(sock->name);
			CHECKMEM(obj);
			json_object_object_add(entry, "name", obj);
		}

		obj = json_object_new_int(
			(int)isc_refcount_current(&sock->references));
		CHECKMEM(obj);
		json_object_object_add(entry, "references", obj);

		obj = json_object_new_string(_socktype(sock->type));
		CHECKMEM(obj);
		json_object_object_add(entry, "type", obj);

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "peer-address", obj);
		}

		/* Local address is best-effort; skip on getsockname error. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "local-address", obj);
		}

		states = json_object_new_array();
		CHECKMEM(states);
		json_object_object_add(entry, "states", states);

		if (sock->listener) {
			obj = json_object_new_string("listener");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connected) {
			obj = json_object_new_string("connected");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connecting) {
			obj = json_object_new_string("connecting");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->bound) {
			obj = json_object_new_string("bound");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}

	/* Hand the array to 'stats'; clearing it skips the error put(). */
	json_object_object_add(stats, "sockets", array);
	array = NULL;
	result = ISC_R_SUCCESS;

error:
	if (array != NULL) {
		json_object_put(array);
	}

	/* Non-NULL here means we bailed out mid-loop with the lock held. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (result);
}
5575 #endif /* HAVE_JSON_C */
5576 
5577 isc_result_t
isc_socketmgr_createinctx(isc_mem_t * mctx,isc_socketmgr_t ** managerp)5578 isc_socketmgr_createinctx(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
5579 	isc_result_t result;
5580 
5581 	result = isc_socketmgr_create(mctx, managerp);
5582 
5583 	return (result);
5584 }
5585