1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * SPDX-License-Identifier: MPL-2.0
5  *
6  * This Source Code Form is subject to the terms of the Mozilla Public
7  * License, v. 2.0. If a copy of the MPL was not distributed with this
8  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
9  *
10  * See the COPYRIGHT file distributed with this work for additional
11  * information regarding copyright ownership.
12  */
13 
14 #pragma once
15 
16 #include <unistd.h>
17 #include <uv.h>
18 
19 #include <openssl/err.h>
20 #include <openssl/ssl.h>
21 
22 #include <isc/astack.h>
23 #include <isc/atomic.h>
24 #include <isc/barrier.h>
25 #include <isc/buffer.h>
26 #include <isc/condition.h>
27 #include <isc/magic.h>
28 #include <isc/mem.h>
29 #include <isc/netmgr.h>
30 #include <isc/queue.h>
31 #include <isc/quota.h>
32 #include <isc/random.h>
33 #include <isc/refcount.h>
34 #include <isc/region.h>
35 #include <isc/result.h>
36 #include <isc/rwlock.h>
37 #include <isc/sockaddr.h>
38 #include <isc/stats.h>
39 #include <isc/thread.h>
40 #include <isc/util.h>
41 
42 #include "uv-compat.h"
43 
/*
 * Negative sentinel values.  The expansions are parenthesized so that the
 * unary minus can never combine with tokens surrounding the point of use.
 * NOTE(review): presumably compared against networker thread ids and the
 * netmgr 'interlocked' field -- confirm against the users.
 */
#define ISC_NETMGR_TID_UNKNOWN (-1)

/* Must be different from ISC_NETMGR_TID_UNKNOWN */
#define ISC_NETMGR_NON_INTERLOCKED (-2)
48 
/*
 * Receive buffers
 *
 * ISC_NETMGR_UDP_RECVBUF_SIZE is the UDP receive buffer size in bytes; which
 * definition is used depends on HAVE_DECL_UV_UDP_MMSG_CHUNK.
 */
#if HAVE_DECL_UV_UDP_MMSG_CHUNK
/*
 * The value 20 here is UV__MMSG_MAXWIDTH taken from the current libuv source,
 * libuv will not receive more than 20 datagrams in a single recvmmsg call.
 */
#define ISC_NETMGR_UDP_RECVBUF_SIZE (20 * UINT16_MAX)
#else
/*
 * A single DNS message size
 */
#define ISC_NETMGR_UDP_RECVBUF_SIZE UINT16_MAX
#endif
64 
/*
 * The TCP receive buffer can fit one maximum sized DNS message plus its size,
 * the receive buffer here affects TCP, DoT and DoH.
 */
#define ISC_NETMGR_TCP_RECVBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)

/* Pick the larger buffer */
#define ISC_NETMGR_RECVBUF_SIZE                                     \
	(ISC_NETMGR_UDP_RECVBUF_SIZE >= ISC_NETMGR_TCP_RECVBUF_SIZE \
		 ? ISC_NETMGR_UDP_RECVBUF_SIZE                      \
		 : ISC_NETMGR_TCP_RECVBUF_SIZE)

/*
 * Send buffer: one maximum sized DNS message plus its two-byte length.
 */
#define ISC_NETMGR_SENDBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)

/*%
 * Regular TCP buffer size.
 */
#define NM_REG_BUF 4096

/*%
 * Larger buffer for when the regular one isn't enough; this will
 * hold two full DNS packets with lengths.  netmgr receives 64k at
 * most in TCPDNS connections, so there's no risk of overrun
 * when using a buffer this size.
 *
 * The expansion is parenthesized so that the macro behaves as a single
 * value inside larger expressions (e.g. 'x / NM_BIG_BUF').
 */
#define NM_BIG_BUF (ISC_NETMGR_TCP_RECVBUF_SIZE * 2)
94 
/*
 * HAVE_SO_REUSEPORT_LB is set when the platform defines SO_REUSEPORT_LB, or
 * when it defines SO_REUSEPORT and the target is Linux.
 * NOTE(review): presumably because Linux SO_REUSEPORT already load-balances
 * incoming traffic between sockets -- confirm.
 */
#if defined(SO_REUSEPORT_LB) || (defined(SO_REUSEPORT) && defined(__linux__))
#define HAVE_SO_REUSEPORT_LB 1
#endif
98 
/*
 * Define NETMGR_TRACE to activate tracing of handles and sockets.
 * This will impair performance but enables us to quickly determine,
 * if netmgr resources haven't been cleaned up on shutdown, which ones
 * are still in use.
 *
 * With NETMGR_TRACE defined, the isc__nm*() wrapper macros below append the
 * caller's __FILE__, __LINE__ and __func__ to the arguments of the matching
 * isc___nm*() function, and FLARG / FLARG_PASS add those three values to
 * parameter and argument lists.  Without it, the FLARG* macros expand to
 * nothing and the wrappers forward only the original arguments.
 */
#ifdef NETMGR_TRACE
/* Number of entries in the backtrace[] array of struct isc_nmhandle. */
#define TRACE_SIZE 8

void
isc__nm_dump_active(isc_nm_t *nm);

/*
 * gettid(): per-platform thread identifier converted to uint32_t.  The whole
 * expansion is parenthesized so the cast cannot be captured by an operator
 * at the point of use.
 */
#if defined(__linux__)
#include <syscall.h>
#define gettid() ((uint32_t)syscall(SYS_gettid))
#elif defined(_WIN32)
#define gettid() ((uint32_t)GetCurrentThreadId())
#else
#define gettid() ((uint32_t)pthread_self())
#endif

/*
 * NETMGR_TRACE_LOG() expects 'file', 'line' and 'func' to be in scope (see
 * FLARG and FLARG_IEVENT).  Both variants expand to exactly one statement
 * without the terminating semicolon, which the caller supplies.
 */
#ifdef NETMGR_TRACE_VERBOSE
#define NETMGR_TRACE_LOG(format, ...)                                \
	fprintf(stderr, "%" PRIu32 ":%d:%s:%u:%s:" format, gettid(), \
		isc_nm_tid(), file, line, func, __VA_ARGS__)
#else
/*
 * Only marks the trace variables as used.  Wrapped in do { } while (0) so
 * the macro stays a single statement under an unbraced if/else.
 */
#define NETMGR_TRACE_LOG(format, ...) \
	do {                          \
		(void)file;           \
		(void)line;           \
		(void)func;           \
	} while (0)
#endif

/* Trailing call arguments carrying the current trace location. */
#define FLARG_PASS , file, line, func
/* Trailing parameter declarations receiving the trace location. */
#define FLARG                                              \
	, const char *file __attribute__((unused)),        \
		unsigned int line __attribute__((unused)), \
		const char *func __attribute__((unused))
/* Declare file/line/func locals initialized from the given ievent. */
#define FLARG_IEVENT(ievent)              \
	const char *file = ievent->file;  \
	unsigned int line = ievent->line; \
	const char *func = ievent->func;
/* Store the file/line/func currently in scope into the given ievent. */
#define FLARG_IEVENT_PASS(ievent) \
	ievent->file = file;      \
	ievent->line = line;      \
	ievent->func = func;
#define isc__nm_uvreq_get(req, sock) \
	isc___nm_uvreq_get(req, sock, __FILE__, __LINE__, __func__)
#define isc__nm_uvreq_put(req, sock) \
	isc___nm_uvreq_put(req, sock, __FILE__, __LINE__, __func__)
#define isc__nmsocket_init(sock, mgr, type, iface)                      \
	isc___nmsocket_init(sock, mgr, type, iface, __FILE__, __LINE__, \
			    __func__)
#define isc__nmsocket_put(sockp) \
	isc___nmsocket_put(sockp, __FILE__, __LINE__, __func__)
#define isc__nmsocket_attach(sock, target) \
	isc___nmsocket_attach(sock, target, __FILE__, __LINE__, __func__)
#define isc__nmsocket_detach(socketp) \
	isc___nmsocket_detach(socketp, __FILE__, __LINE__, __func__)
#define isc__nmsocket_close(socketp) \
	isc___nmsocket_close(socketp, __FILE__, __LINE__, __func__)
#define isc__nmhandle_get(sock, peer, local) \
	isc___nmhandle_get(sock, peer, local, __FILE__, __LINE__, __func__)
#define isc__nmsocket_prep_destroy(sock) \
	isc___nmsocket_prep_destroy(sock, __FILE__, __LINE__, __func__)
#else
/* Tracing disabled: logging and the FLARG* helpers expand to nothing. */
#define NETMGR_TRACE_LOG(format, ...)

#define FLARG_PASS
#define FLARG
#define FLARG_IEVENT(ievent)
#define FLARG_IEVENT_PASS(ievent)
#define isc__nm_uvreq_get(req, sock) isc___nm_uvreq_get(req, sock)
#define isc__nm_uvreq_put(req, sock) isc___nm_uvreq_put(req, sock)
#define isc__nmsocket_init(sock, mgr, type, iface) \
	isc___nmsocket_init(sock, mgr, type, iface)
#define isc__nmsocket_put(sockp)	   isc___nmsocket_put(sockp)
#define isc__nmsocket_attach(sock, target) isc___nmsocket_attach(sock, target)
#define isc__nmsocket_detach(socketp)	   isc___nmsocket_detach(socketp)
#define isc__nmsocket_close(socketp)	   isc___nmsocket_close(socketp)
#define isc__nmhandle_get(sock, peer, local) \
	isc___nmhandle_get(sock, peer, local)
#define isc__nmsocket_prep_destroy(sock) isc___nmsocket_prep_destroy(sock)
#endif
182 
/*
 * Event queue types, listed in the order of processing priority.  The
 * enumerators count upward from zero, so NETIEVENT_MAX equals the number of
 * queue types and is used to size the per-worker queue arrays.
 */
typedef enum {
	NETIEVENT_PRIORITY = 0,
	NETIEVENT_PRIVILEGED,
	NETIEVENT_TASK,
	NETIEVENT_NORMAL,
	NETIEVENT_MAX,
} netievent_type_t;
193 
/*
 * Single network event loop worker.
 */
typedef struct isc__networker {
	isc_nm_t *mgr;
	int id;		  /* thread id */
	uv_loop_t loop;	  /* libuv loop structure */
	uv_async_t async; /* async channel to send
			   * data to this networker */
	isc_mutex_t lock;
	bool paused;
	bool finished;
	isc_thread_t thread;
	/* One event queue, and one counter, per netievent_type_t value. */
	isc_queue_t *ievents[NETIEVENT_MAX];
	atomic_uint_fast32_t nievents[NETIEVENT_MAX];
	isc_condition_t cond_prio;

	isc_refcount_t references;
	atomic_int_fast64_t pktcount;
	/*
	 * NOTE(review): presumably sized by ISC_NETMGR_RECVBUF_SIZE and
	 * ISC_NETMGR_SENDBUF_SIZE respectively, with 'recvbuf_inuse' guarding
	 * against handing out 'recvbuf' twice -- confirm in the allocator.
	 */
	char *recvbuf;
	char *sendbuf;
	bool recvbuf_inuse;
} isc__networker_t;
217 
/*
 * A general handle for a connection bound to a networker.  For UDP
 * connections we have peer address here, so both TCP and UDP can be
 * handled with a simple send-like function
 */
#define NMHANDLE_MAGIC ISC_MAGIC('N', 'M', 'H', 'D')
/*
 * A handle is considered valid when its magic matches and its reference
 * counter is greater than zero.
 */
#define VALID_NMHANDLE(t)                      \
	(ISC_MAGIC_VALID(t, NMHANDLE_MAGIC) && \
	 atomic_load(&(t)->references) > 0)

/* Callback type that receives a handle and returns nothing. */
typedef void (*isc__nm_closecb)(isc_nmhandle_t *);

struct isc_nmhandle {
	int magic;
	isc_refcount_t references;

	/*
	 * The socket is not 'attached' in the traditional
	 * reference-counting sense. Instead, we keep all handles in an
	 * array in the socket object.  This way, we don't have circular
	 * dependencies and we can close all handles when we're destroying
	 * the socket.
	 */
	isc_nmsocket_t *sock;

	isc_sockaddr_t peer;
	isc_sockaddr_t local;
	isc_nm_opaquecb_t doreset; /* reset extra callback, external */
	isc_nm_opaquecb_t dofree;  /* free extra callback, external */
#ifdef NETMGR_TRACE
	/* Tracing-only members; backtrace[] holds at most TRACE_SIZE frames. */
	void *backtrace[TRACE_SIZE];
	int backtrace_size;
	LINK(isc_nmhandle_t) active_link;
#endif
	void *opaque;
	/*
	 * Flexible array member: extra storage that follows the handle.
	 * NOTE(review): presumably 'extrahandlesize' bytes as recorded in the
	 * owning socket -- confirm in the allocator.
	 */
	char extra[];
};
255 
/*
 * Identifiers of all network manager events.  Enumerators listed before
 * 'netievent_prio' take consecutive values starting at zero; those listed
 * after it continue from 0xff + 1.
 */
typedef enum isc__netievent_type {
	/* UDP */
	netievent_udpconnect,
	netievent_udpclose,
	netievent_udpsend,
	netievent_udpread,
	netievent_udpcancel,

	/* TCP */
	netievent_tcpconnect,
	netievent_tcpclose,
	netievent_tcpsend,
	netievent_tcpstartread,
	netievent_tcppauseread,
	netievent_tcpaccept,
	netievent_tcpcancel,

	/* TCPDNS */
	netievent_tcpdnsaccept,
	netievent_tcpdnsconnect,
	netievent_tcpdnsclose,
	netievent_tcpdnssend,
	netievent_tcpdnsread,
	netievent_tcpdnscancel,

	netievent_shutdown,
	netievent_stop,
	netievent_pause,

	netievent_connectcb,
	netievent_readcb,
	netievent_sendcb,

	netievent_task,
	netievent_privilegedtask,

	/*
	 * event type values higher than this will be treated
	 * as high-priority events, which can be processed
	 * while the netmgr is pausing or paused.
	 */
	netievent_prio = 0xff,

	netievent_udplisten,
	netievent_udpstop,
	netievent_tcplisten,
	netievent_tcpstop,
	netievent_tcpdnslisten,
	netievent_tcpdnsstop,

	netievent_resume,
	netievent_detach,
	netievent_close,
} isc__netievent_type;
307 
/*
 * One callback slot that can hold any of the receive, send, connect or
 * accept callback types; only one member is meaningful at a time.
 */
typedef union {
	isc_nm_recv_cb_t recv;
	isc_nm_cb_t send;
	isc_nm_cb_t connect;
	isc_nm_accept_cb_t accept;
} isc__nm_cb_t;
314 
/*
 * Wrapper around uv_req_t with 'our' fields in it.  req->data should
 * always point to its parent.  Note that we always allocate more than
 * sizeof(struct) because we make room for different req types;
 */
#define UVREQ_MAGIC    ISC_MAGIC('N', 'M', 'U', 'R')
/* Validity is decided by the magic number alone. */
#define VALID_UVREQ(t) ISC_MAGIC_VALID(t, UVREQ_MAGIC)

typedef struct isc__nm_uvreq isc__nm_uvreq_t;
struct isc__nm_uvreq {
	int magic;
	isc_nmsocket_t *sock;
	isc_nmhandle_t *handle;
	char tcplen[2];	      /* The TCP DNS message length */
	uv_buf_t uvbuf;	      /* translated isc_region_t, to be
			       * sent or received */
	isc_sockaddr_t local; /* local address */
	isc_sockaddr_t peer;  /* peer address */
	isc__nm_cb_t cb;      /* callback */
	void *cbarg;	      /* callback argument */
	uv_pipe_t ipc;	      /* used for sending socket
			       * uv_handles to other threads */
	/* Storage shared by every libuv handle/request type listed here. */
	union {
		uv_handle_t handle;
		uv_req_t req;
		uv_getaddrinfo_t getaddrinfo;
		uv_getnameinfo_t getnameinfo;
		uv_shutdown_t shutdown;
		uv_write_t write;
		uv_connect_t connect;
		uv_udp_send_t udp_send;
		uv_fs_t fs;
		uv_work_t work;
	} uv_req;
	/* List linkage for keeping requests on an ISC list. */
	ISC_LINK(isc__nm_uvreq_t) link;
};
351 
/*
 * Reference-counted timer: a libuv timer together with a handle, a callback
 * and the callback's argument.
 * NOTE(review): presumably 'cb' is invoked with 'cbarg' when 'timer' fires
 * -- confirm in the implementation.
 */
struct isc_nm_timer {
	isc_refcount_t references;
	uv_timer_t timer;
	isc_nmhandle_t *handle;
	isc_nm_timer_cb cb;
	void *cbarg;
};
359 
void *
isc__nm_get_netievent(isc_nm_t *mgr, isc__netievent_type type);
/*%<
 * Allocate an ievent and set the type.
 */
void
isc__nm_put_netievent(isc_nm_t *mgr, void *ievent);
/*%<
 * Hand 'ievent' back to 'mgr'.  The isc__nm_put_netievent_<type>() helpers
 * generated by the NETIEVENT_*_DEF macros call this as their last step.
 * NOTE(review): presumably releases the memory obtained from
 * isc__nm_get_netievent() -- confirm against the definition.
 */
367 
368 /*
369  * The macros here are used to simulate the "inheritance" in C, there's the base
370  * netievent structure that contains just its own type and socket, and there are
371  * extended netievent types that also have handles or requests or other data.
372  *
373  * The macros here ensure that:
374  *
375  *   1. every netievent type has matching definition, declaration and
376  *      implementation
377  *
378  *   2. we handle all the netievent types of same subclass the same, e.g. if the
379  *      extended netievent contains handle, we always attach to the handle in
380  *      the ctor and detach from the handle in dtor.
381  *
382  * There are three macros here for each netievent subclass:
383  *
384  *   1. NETIEVENT_*_TYPE(type) creates the typedef for each type; used below in
385  *   this header
386  *
387  *   2. NETIEVENT_*_DECL(type) generates the declaration of the get and put
388  *      functions (isc__nm_get_netievent_* and isc__nm_put_netievent_*); used
389  *      below in this header
390  *
391  *   3. NETIEVENT_*_DEF(type) generates the definition of the functions; used
392  *   either in netmgr.c or matching protocol file (e.g. udp.c, tcp.c, etc.)
393  */
394 
/*
 * Members shared by the socket-based netievent structures: the event type,
 * the socket the event refers to, and the file/line/func members that
 * FLARG_IEVENT() and FLARG_IEVENT_PASS() read and write when NETMGR_TRACE
 * is defined.  The expansion has no trailing semicolon.
 */
#define NETIEVENT__SOCKET         \
	isc__netievent_type type; \
	isc_nmsocket_t *sock;     \
	const char *file;         \
	unsigned int line;        \
	const char *func

/* Base socket event: only the common members. */
typedef struct isc__netievent__socket {
	NETIEVENT__SOCKET;
} isc__netievent__socket_t;

/* Declare isc__netievent_<type>_t as an alias of the base socket event. */
#define NETIEVENT_SOCKET_TYPE(type) \
	typedef isc__netievent__socket_t isc__netievent_##type##_t;

/* Prototypes of the get/put pair for a socket event of the given type. */
#define NETIEVENT_SOCKET_DECL(type)                              \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type( \
		isc_nm_t *nm, isc_nmsocket_t *sock);             \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,          \
					  isc__netievent_##type##_t *ievent);

/*
 * Definitions of the get/put pair: get obtains an ievent of the matching
 * netievent_<type> and attaches it to 'sock'; put detaches from the socket
 * and then returns the ievent via isc__nm_put_netievent().
 */
#define NETIEVENT_SOCKET_DEF(type)                                             \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(               \
		isc_nm_t *nm, isc_nmsocket_t *sock) {                          \
		isc__netievent_##type##_t *ievent =                            \
			isc__nm_get_netievent(nm, netievent_##type);           \
		isc__nmsocket_attach(sock, &ievent->sock);                     \
                                                                               \
		return (ievent);                                               \
	}                                                                      \
                                                                               \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                        \
					  isc__netievent_##type##_t *ievent) { \
		isc__nmsocket_detach(&ievent->sock);                           \
		isc__nm_put_netievent(nm, ievent);                             \
	}
430 
/* Socket event that additionally carries a uv request pointer. */
typedef struct isc__netievent__socket_req {
	NETIEVENT__SOCKET;
	isc__nm_uvreq_t *req;
} isc__netievent__socket_req_t;

/* Declare isc__netievent_<type>_t as an alias of the socket+req event. */
#define NETIEVENT_SOCKET_REQ_TYPE(type) \
	typedef isc__netievent__socket_req_t isc__netievent_##type##_t;

/* Prototypes of the get/put pair for a socket+req event. */
#define NETIEVENT_SOCKET_REQ_DECL(type)                                    \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(           \
		isc_nm_t *nm, isc_nmsocket_t *sock, isc__nm_uvreq_t *req); \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                    \
					  isc__netievent_##type##_t *ievent);

/*
 * Definitions of the get/put pair: get attaches to 'sock' and stores the
 * 'req' pointer as given (no reference is taken on it here); put detaches
 * from the socket and returns the ievent, leaving 'req' untouched.
 */
#define NETIEVENT_SOCKET_REQ_DEF(type)                                         \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(               \
		isc_nm_t *nm, isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {    \
		isc__netievent_##type##_t *ievent =                            \
			isc__nm_get_netievent(nm, netievent_##type);           \
		isc__nmsocket_attach(sock, &ievent->sock);                     \
		ievent->req = req;                                             \
                                                                               \
		return (ievent);                                               \
	}                                                                      \
                                                                               \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                        \
					  isc__netievent_##type##_t *ievent) { \
		isc__nmsocket_detach(&ievent->sock);                           \
		isc__nm_put_netievent(nm, ievent);                             \
	}
461 
/*
 * Socket event carrying a uv request and a result code.
 *
 * Unlike the other socket events this structure spells out 'type' and 'sock'
 * itself instead of using NETIEVENT__SOCKET, so it has no file/line/func
 * members and cannot be used with FLARG_IEVENT() / FLARG_IEVENT_PASS() when
 * NETMGR_TRACE is defined.
 */
typedef struct isc__netievent__socket_req_result {
	isc__netievent_type type;
	isc_nmsocket_t *sock;
	isc__nm_uvreq_t *req;
	isc_result_t result;
} isc__netievent__socket_req_result_t;

/* Declare isc__netievent_<type>_t as an alias of the socket+req+result event. */
#define NETIEVENT_SOCKET_REQ_RESULT_TYPE(type) \
	typedef isc__netievent__socket_req_result_t isc__netievent_##type##_t;

/* Prototypes of the get/put pair for a socket+req+result event. */
#define NETIEVENT_SOCKET_REQ_RESULT_DECL(type)                            \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(          \
		isc_nm_t *nm, isc_nmsocket_t *sock, isc__nm_uvreq_t *req, \
		isc_result_t result);                                     \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                   \
					  isc__netievent_##type##_t *ievent);

/*
 * Definitions of the get/put pair: get attaches to 'sock' and stores 'req'
 * and 'result' as given; put detaches from the socket and returns the
 * ievent.
 */
#define NETIEVENT_SOCKET_REQ_RESULT_DEF(type)                                  \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(               \
		isc_nm_t *nm, isc_nmsocket_t *sock, isc__nm_uvreq_t *req,      \
		isc_result_t result) {                                         \
		isc__netievent_##type##_t *ievent =                            \
			isc__nm_get_netievent(nm, netievent_##type);           \
		isc__nmsocket_attach(sock, &ievent->sock);                     \
		ievent->req = req;                                             \
		ievent->result = result;                                       \
                                                                               \
		return (ievent);                                               \
	}                                                                      \
                                                                               \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                        \
					  isc__netievent_##type##_t *ievent) { \
		isc__nmsocket_detach(&ievent->sock);                           \
		isc__nm_put_netievent(nm, ievent);                             \
	}
497 
/* Socket event that additionally holds a reference to a handle. */
typedef struct isc__netievent__socket_handle {
	NETIEVENT__SOCKET;
	isc_nmhandle_t *handle;
} isc__netievent__socket_handle_t;

/* Declare isc__netievent_<type>_t as an alias of the socket+handle event. */
#define NETIEVENT_SOCKET_HANDLE_TYPE(type) \
	typedef isc__netievent__socket_handle_t isc__netievent_##type##_t;

/* Prototypes of the get/put pair for a socket+handle event. */
#define NETIEVENT_SOCKET_HANDLE_DECL(type)                                   \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(             \
		isc_nm_t *nm, isc_nmsocket_t *sock, isc_nmhandle_t *handle); \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                      \
					  isc__netievent_##type##_t *ievent);

/*
 * Definitions of the get/put pair: get attaches to both 'sock' and
 * 'handle'; put detaches from the socket first, then from the handle, and
 * finally returns the ievent.
 */
#define NETIEVENT_SOCKET_HANDLE_DEF(type)                                      \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(               \
		isc_nm_t *nm, isc_nmsocket_t *sock, isc_nmhandle_t *handle) {  \
		isc__netievent_##type##_t *ievent =                            \
			isc__nm_get_netievent(nm, netievent_##type);           \
		isc__nmsocket_attach(sock, &ievent->sock);                     \
		isc_nmhandle_attach(handle, &ievent->handle);                  \
                                                                               \
		return (ievent);                                               \
	}                                                                      \
                                                                               \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                        \
					  isc__netievent_##type##_t *ievent) { \
		isc__nmsocket_detach(&ievent->sock);                           \
		isc_nmhandle_detach(&ievent->handle);                          \
		isc__nm_put_netievent(nm, ievent);                             \
	}
529 
/* Socket event that additionally carries a quota pointer. */
typedef struct isc__netievent__socket_quota {
	NETIEVENT__SOCKET;
	isc_quota_t *quota;
} isc__netievent__socket_quota_t;

/* Declare isc__netievent_<type>_t as an alias of the socket+quota event. */
#define NETIEVENT_SOCKET_QUOTA_TYPE(type) \
	typedef isc__netievent__socket_quota_t isc__netievent_##type##_t;

/* Prototypes of the get/put pair for a socket+quota event. */
#define NETIEVENT_SOCKET_QUOTA_DECL(type)                                \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(         \
		isc_nm_t *nm, isc_nmsocket_t *sock, isc_quota_t *quota); \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                  \
					  isc__netievent_##type##_t *ievent);

/*
 * Definitions of the get/put pair: get attaches to 'sock' and stores the
 * 'quota' pointer as given (no attach is done on it here); put detaches
 * from the socket and returns the ievent, leaving 'quota' untouched.
 */
#define NETIEVENT_SOCKET_QUOTA_DEF(type)                                       \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(               \
		isc_nm_t *nm, isc_nmsocket_t *sock, isc_quota_t *quota) {      \
		isc__netievent_##type##_t *ievent =                            \
			isc__nm_get_netievent(nm, netievent_##type);           \
		isc__nmsocket_attach(sock, &ievent->sock);                     \
		ievent->quota = quota;                                         \
                                                                               \
		return (ievent);                                               \
	}                                                                      \
                                                                               \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                        \
					  isc__netievent_##type##_t *ievent) { \
		isc__nmsocket_detach(&ievent->sock);                           \
		isc__nm_put_netievent(nm, ievent);                             \
	}
560 
/*
 * Event that carries a task instead of a socket; it has only the event type
 * and the task pointer (no sock, file, line or func members).
 */
typedef struct isc__netievent__task {
	isc__netievent_type type;
	isc_task_t *task;
} isc__netievent__task_t;

/* Declare isc__netievent_<type>_t as an alias of the task event. */
#define NETIEVENT_TASK_TYPE(type) \
	typedef isc__netievent__task_t isc__netievent_##type##_t;

/* Prototypes of the get/put pair for a task event. */
#define NETIEVENT_TASK_DECL(type)                                \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type( \
		isc_nm_t *nm, isc_task_t *task);                 \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,          \
					  isc__netievent_##type##_t *ievent);

/*
 * Definitions of the get/put pair: get stores the 'task' pointer as given
 * (no attach is done here); put clears the pointer and returns the ievent.
 */
#define NETIEVENT_TASK_DEF(type)                                               \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(               \
		isc_nm_t *nm, isc_task_t *task) {                              \
		isc__netievent_##type##_t *ievent =                            \
			isc__nm_get_netievent(nm, netievent_##type);           \
		ievent->task = task;                                           \
                                                                               \
		return (ievent);                                               \
	}                                                                      \
                                                                               \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                        \
					  isc__netievent_##type##_t *ievent) { \
		ievent->task = NULL;                                           \
		isc__nm_put_netievent(nm, ievent);                             \
	}
590 
/*
 * UDP send event: the common socket members plus the peer address and the
 * uv request.  It is declared directly rather than through one of the
 * NETIEVENT_*_TYPE macros.
 */
typedef struct isc__netievent_udpsend {
	NETIEVENT__SOCKET;
	isc_sockaddr_t peer;
	isc__nm_uvreq_t *req;
} isc__netievent_udpsend_t;
596 
/* Minimal event: nothing but the event type. */
typedef struct isc__netievent {
	isc__netievent_type type;
} isc__netievent_t;

/* Declare isc__netievent_<type>_t as an alias of the minimal event. */
#define NETIEVENT_TYPE(type) typedef isc__netievent_t isc__netievent_##type##_t;

/* Prototypes of the get/put pair for a minimal event. */
#define NETIEVENT_DECL(type)                                                   \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(isc_nm_t *nm); \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                        \
					  isc__netievent_##type##_t *ievent);

/*
 * Definitions of the get/put pair: thin typed wrappers around
 * isc__nm_get_netievent() and isc__nm_put_netievent() with no extra state.
 */
#define NETIEVENT_DEF(type)                                                    \
	isc__netievent_##type##_t *isc__nm_get_netievent_##type(               \
		isc_nm_t *nm) {                                                \
		isc__netievent_##type##_t *ievent =                            \
			isc__nm_get_netievent(nm, netievent_##type);           \
                                                                               \
		return (ievent);                                               \
	}                                                                      \
                                                                               \
	void isc__nm_put_netievent_##type(isc_nm_t *nm,                        \
					  isc__netievent_##type##_t *ievent) { \
		isc__nm_put_netievent(nm, ievent);                             \
	}
621 
/*
 * Union of several netievent structures.
 * NOTE(review): presumably used to size the allocation of any ievent, in
 * which case it must be at least as large as every netievent structure;
 * the req_result, handle and task variants are not listed here -- confirm
 * they are no larger than the listed members.
 */
typedef union {
	isc__netievent_t ni;
	isc__netievent__socket_t nis;
	isc__netievent__socket_req_t nisr;
	isc__netievent_udpsend_t nius;
	isc__netievent__socket_quota_t nisq;
} isc__netievent_storage_t;
629 
/*
 * Work item for a uv_work threadpool: the owning netmgr, the libuv work
 * request, the work callback, the callback to run after the work, and the
 * caller's data pointer.
 */
typedef struct isc__nm_work {
	isc_nm_t *netmgr;
	uv_work_t req;
	isc_nm_workcb_t cb;
	isc_nm_after_workcb_t after_cb;
	void *data;
} isc__nm_work_t;
640 
/*
 * Network manager
 */
#define NM_MAGIC    ISC_MAGIC('N', 'E', 'T', 'M')
/* Validity is decided by the magic number alone. */
#define VALID_NM(t) ISC_MAGIC_VALID(t, NM_MAGIC)

struct isc_nm {
	int magic;
	isc_refcount_t references;
	isc_mem_t *mctx;
	/* NOTE(review): presumably 'workers' has 'nworkers' entries -- confirm. */
	int nworkers;
	isc_mutex_t lock;
	isc_condition_t wkstatecond;
	isc_condition_t wkpausecond;
	isc__networker_t *workers;

	isc_stats_t *stats;

	/*
	 * Note that 'workers_running' is a plain integer while the counters
	 * below it are atomic.
	 */
	uint_fast32_t workers_running;
	atomic_uint_fast32_t workers_paused;
	atomic_uint_fast32_t maxudp;

	atomic_bool paused;

	/*
	 * Active connections are being closed and new connections are
	 * no longer allowed.
	 */
	atomic_bool closing;

	/*
	 * A worker is actively waiting for other workers, for example to
	 * stop listening; that means no other thread can do the same thing
	 * or pause, or we'll deadlock. We have to either re-enqueue our
	 * event or wait for the other one to finish if we want to pause.
	 */
	atomic_int interlocked;

	/*
	 * Timeout values for TCP connections, corresponding to
	 * tcp-initial-timeout, tcp-idle-timeout, tcp-keepalive-timeout,
	 * and tcp-advertised-timeout. Note that these are stored in
	 * milliseconds so they can be used directly with the libuv timer,
	 * but they are configured in tenths of seconds.
	 */
	atomic_uint_fast32_t init;
	atomic_uint_fast32_t idle;
	atomic_uint_fast32_t keepalive;
	atomic_uint_fast32_t advertised;

	isc_barrier_t pausing;
	isc_barrier_t resuming;

#ifdef NETMGR_TRACE
	/* Tracing-only list of sockets. */
	ISC_LIST(isc_nmsocket_t) active_sockets;
#endif
};
698 
/*
 * Kinds of netmgr socket; stored in the 'type' member of struct
 * isc_nmsocket.
 */
typedef enum isc_nmsocket_type {
	isc_nm_udpsocket,
	isc_nm_udplistener, /* Aggregate of nm_udpsocks */
	isc_nm_tcpsocket,
	isc_nm_tcplistener,
	isc_nm_tcpdnslistener,
	isc_nm_tcpdnssocket,
} isc_nmsocket_type;
707 
/*%
 * A universal structure for either a single socket or a group of
 * dup'd/SO_REUSE_PORT-using sockets listening on the same interface.
 *
 * VALID_NMSOCK() checks the magic number only.
 */
#define NMSOCK_MAGIC	ISC_MAGIC('N', 'M', 'S', 'K')
#define VALID_NMSOCK(t) ISC_MAGIC_VALID(t, NMSOCK_MAGIC)
714 
/*%
 * Index into socket stat counter arrays.
 *
 * The values are explicit because they are array positions; the counter
 * arrays they index (see 'statsindex' in struct isc_nmsocket) are defined
 * outside this header.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
731 
typedef void (*isc_nm_closehandlecb_t)(void *arg);
/*%<
 * Type of the 'closehandle_cb' member of struct isc_nmsocket.  Per the
 * comment on that member, the callback is called with handle->sock as the
 * argument whenever a handle's references drop to zero, after its reset
 * callback has been called.
 *
 * NOTE(review): the text previously attached to this typedef described the
 * isc_nmhandle 'reset' and 'free' callbacks, which are declared with
 * isc_nm_opaquecb_t in struct isc_nmhandle; it looks like it was carried
 * over by mistake -- confirm.
 */
737 
738 struct isc_nmsocket {
739 	/*% Unlocked, RO */
740 	int magic;
741 	int tid;
742 	isc_nmsocket_type type;
743 	isc_nm_t *mgr;
744 
745 	/*% Parent socket for multithreaded listeners */
746 	isc_nmsocket_t *parent;
747 	/*% Listener socket this connection was accepted on */
748 	isc_nmsocket_t *listener;
749 	/*% Self socket */
750 	isc_nmsocket_t *self;
751 
752 	isc_barrier_t startlistening;
753 	isc_barrier_t stoplistening;
754 
755 	/*%
756 	 * quota is the TCP client, attached when a TCP connection
757 	 * is established. pquota is a non-attached pointer to the
758 	 * TCP client quota, stored in listening sockets but only
759 	 * attached in connected sockets.
760 	 */
761 	isc_quota_t *quota;
762 	isc_quota_t *pquota;
763 	isc_quota_cb_t quotacb;
764 
765 	/*%
766 	 * Socket statistics
767 	 */
768 	const isc_statscounter_t *statsindex;
769 
770 	/*%
771 	 * TCP read/connect timeout timers.
772 	 */
773 	uv_timer_t read_timer;
774 	uint64_t read_timeout;
775 	uint64_t connect_timeout;
776 
777 	/*%
778 	 * TCP write timeout timer.
779 	 */
780 	uv_timer_t write_timer;
781 	uint64_t write_timeout;
782 	int64_t writes;
783 
784 	/*% outer socket is for 'wrapped' sockets - e.g. tcpdns in tcp */
785 	isc_nmsocket_t *outer;
786 
787 	/*% server socket for connections */
788 	isc_nmsocket_t *server;
789 
790 	/*% Child sockets for multi-socket setups */
791 	isc_nmsocket_t *children;
792 	uint_fast32_t nchildren;
793 	isc_sockaddr_t iface;
794 	isc_nmhandle_t *statichandle;
795 	isc_nmhandle_t *outerhandle;
796 
797 	/*% Extra data allocated at the end of each isc_nmhandle_t */
798 	size_t extrahandlesize;
799 
800 	/*% TCP backlog */
801 	int backlog;
802 
803 	/*% libuv data */
804 	uv_os_sock_t fd;
805 	union uv_any_handle uv_handle;
806 
807 	/*% Peer address */
808 	isc_sockaddr_t peer;
809 
810 	/* Atomic */
811 	/*% Number of running (e.g. listening) child sockets */
812 	atomic_uint_fast32_t rchildren;
813 
814 	/*%
815 	 * Socket is active if it's listening, working, etc. If it's
816 	 * closing, then it doesn't make a sense, for example, to
817 	 * push handles or reqs for reuse.
818 	 */
819 	atomic_bool active;
820 	atomic_bool destroying;
821 
822 	/*%
823 	 * Socket is closed if it's not active and all the possible
824 	 * callbacks were fired, there are no active handles, etc.
825 	 * If active==false but closed==false, that means the socket
826 	 * is closing.
827 	 */
828 	atomic_bool closing;
829 	atomic_bool closed;
830 	atomic_bool listening;
831 	atomic_bool connecting;
832 	atomic_bool connected;
833 	bool accepting;
834 	bool reading;
835 	atomic_bool timedout;
836 	isc_refcount_t references;
837 
838 	/*%
839 	 * Established an outgoing connection, as client not server.
840 	 */
841 	atomic_bool client;
842 
843 	/*%
844 	 * TCPDNS socket has been set not to pipeline.
845 	 */
846 	atomic_bool sequential;
847 
848 	/*%
849 	 * The socket is processing read callback, this is guard to not read
850 	 * data before the readcb is back.
851 	 */
852 	bool processing;
853 
854 	/*%
855 	 * A TCP socket has had isc_nm_pauseread() called.
856 	 */
857 	atomic_bool readpaused;
858 
859 	/*%
860 	 * A TCP or TCPDNS socket has been set to use the keepalive
861 	 * timeout instead of the default idle timeout.
862 	 */
863 	atomic_bool keepalive;
864 
865 	/*%
866 	 * 'spare' handles for that can be reused to avoid allocations,
867 	 * for UDP.
868 	 */
869 	isc_astack_t *inactivehandles;
870 	isc_astack_t *inactivereqs;
871 
872 	/*%
873 	 * Used to wait for TCP listening events to complete, and
874 	 * for the number of running children to reach zero during
875 	 * shutdown.
876 	 *
877 	 * We use two condition variables to prevent the race where the netmgr
878 	 * threads would be able to finish and destroy the socket before it's
879 	 * unlocked by the isc_nm_listen<proto>() function.  So, the flow is as
880 	 * follows:
881 	 *
	 *   1. parent thread creates all children sockets and passes them to
	 *      netthreads, looks at the signaling variable and WAIT(cond) until
	 *      the children are done initializing
	 *
	 *   2. the events get picked by netthreads, calls the libuv API (and
	 *      either succeeds or fails) and WAIT(scond) until all other
	 *      children sockets in netthreads are initialized and the listening
	 *      socket lock is unlocked
	 *
	 *   3. the control is given back to the parent thread which now either
	 *      returns success or shuts down the listener if an error has
	 *      occurred in the children netthread
894 	 *
895 	 * NOTE: The other approach would be doing an extra attach to the parent
896 	 * listening socket, and then detach it in the parent thread, but that
897 	 * breaks the promise that once the libuv socket is initialized on the
898 	 * nmsocket, the nmsocket needs to be handled only by matching
	 * netthread, so in fact that would add a complexity in a way that
	 * isc__nmsocket_detach would have to be converted to use an
	 * asynchronous netievent.
902 	 */
903 	isc_mutex_t lock;
904 	isc_condition_t cond;
905 	isc_condition_t scond;
906 
907 	/*%
908 	 * Used to pass a result back from listen or connect events.
909 	 */
910 	isc_result_t result;
911 
912 	/*%
913 	 * Current number of active handles.
914 	 */
915 	atomic_int_fast32_t ah;
916 
917 	/*% Buffer for TCPDNS processing */
918 	size_t buf_size;
919 	size_t buf_len;
920 	unsigned char *buf;
921 
922 	/*%
923 	 * This function will be called with handle->sock
924 	 * as the argument whenever a handle's references drop
925 	 * to zero, after its reset callback has been called.
926 	 */
927 	isc_nm_closehandlecb_t closehandle_cb;
928 
929 	isc_nmhandle_t *recv_handle;
930 	isc_nm_recv_cb_t recv_cb;
931 	void *recv_cbarg;
932 	bool recv_read;
933 
934 	isc_nm_cb_t connect_cb;
935 	void *connect_cbarg;
936 
937 	isc_nm_accept_cb_t accept_cb;
938 	void *accept_cbarg;
939 
940 	atomic_int_fast32_t active_child_connections;
941 
942 #ifdef NETMGR_TRACE
943 	void *backtrace[TRACE_SIZE];
944 	int backtrace_size;
945 	LINK(isc_nmsocket_t) active_link;
946 	ISC_LIST(isc_nmhandle_t) active_handles;
947 #endif
948 };
949 
950 bool
951 isc__nm_in_netthread(void);
/*%<
 * Returns 'true' if we're in the network thread.
 */
955 
956 void
957 isc__nm_maybe_enqueue_ievent(isc__networker_t *worker, isc__netievent_t *event);
958 /*%<
959  * If the caller is already in the matching nmthread, process the netievent
960  * directly, if not enqueue using isc__nm_enqueue_ievent().
961  */
962 
963 void
964 isc__nm_enqueue_ievent(isc__networker_t *worker, isc__netievent_t *event);
965 /*%<
 * Enqueue an ievent onto a specific worker queue. (This is the only safe
 * way to use an isc__networker_t from another thread.)
968  */
969 
970 void
971 isc__nm_free_uvbuf(isc_nmsocket_t *sock, const uv_buf_t *buf);
972 /*%<
973  * Free a buffer allocated for a receive operation.
974  *
 * Note that as currently implemented, this doesn't actually
 * free anything; it only marks the isc__networker's UDP receive
 * buffer as "not in use".
978  */
979 
980 isc_nmhandle_t *
981 isc___nmhandle_get(isc_nmsocket_t *sock, isc_sockaddr_t *peer,
982 		   isc_sockaddr_t *local FLARG);
983 /*%<
984  * Get a handle for the socket 'sock', allocating a new one
985  * if there isn't one available in 'sock->inactivehandles'.
986  *
987  * If 'peer' is not NULL, set the handle's peer address to 'peer',
988  * otherwise set it to 'sock->peer'.
989  *
990  * If 'local' is not NULL, set the handle's local address to 'local',
991  * otherwise set it to 'sock->iface->addr'.
992  *
993  * 'sock' will be attached to 'handle->sock'. The caller may need
994  * to detach the socket afterward.
995  */
996 
997 isc__nm_uvreq_t *
998 isc___nm_uvreq_get(isc_nm_t *mgr, isc_nmsocket_t *sock FLARG);
999 /*%<
1000  * Get a UV request structure for the socket 'sock', allocating a
1001  * new one if there isn't one available in 'sock->inactivereqs'.
1002  */
1003 
1004 void
1005 isc___nm_uvreq_put(isc__nm_uvreq_t **req, isc_nmsocket_t *sock FLARG);
1006 /*%<
1007  * Completes the use of a UV request structure, setting '*req' to NULL.
1008  *
1009  * The UV request is pushed onto the 'sock->inactivereqs' stack or,
1010  * if that doesn't work, freed.
1011  */
1012 
1013 void
1014 isc___nmsocket_init(isc_nmsocket_t *sock, isc_nm_t *mgr, isc_nmsocket_type type,
1015 		    isc_sockaddr_t *iface FLARG);
1016 /*%<
1017  * Initialize socket 'sock', attach it to 'mgr', and set it to type 'type'
1018  * and its interface to 'iface'.
1019  */
1020 
1021 void
1022 isc___nmsocket_attach(isc_nmsocket_t *sock, isc_nmsocket_t **target FLARG);
1023 /*%<
1024  * Attach to a socket, increasing refcount
1025  */
1026 
1027 void
1028 isc___nmsocket_detach(isc_nmsocket_t **socketp FLARG);
1029 /*%<
1030  * Detach from socket, decreasing refcount and possibly destroying the
1031  * socket if it's no longer referenced.
1032  */
1033 
1034 void
1035 isc___nmsocket_prep_destroy(isc_nmsocket_t *sock FLARG);
1036 /*%<
 * Mark 'sock' as inactive, close it if necessary, and destroy it
1038  * if there are no remaining references or active handles.
1039  */
1040 
1041 void
1042 isc__nmsocket_shutdown(isc_nmsocket_t *sock);
1043 /*%<
1044  * Initiate the socket shutdown which actively calls the active
1045  * callbacks.
1046  */
1047 
1048 bool
1049 isc__nmsocket_active(isc_nmsocket_t *sock);
1050 /*%<
1051  * Determine whether 'sock' is active by checking 'sock->active'
1052  * or, for child sockets, 'sock->parent->active'.
1053  */
1054 
1055 bool
1056 isc__nmsocket_deactivate(isc_nmsocket_t *sock);
1057 /*%<
1058  * @brief Deactivate active socket
1059  *
 * Atomically deactivate the socket by setting @p sock->active or, for child
 * sockets, @p sock->parent->active to @c false
1062  *
1063  * @param[in] sock - valid nmsocket
1064  * @return @c false if the socket was already inactive, @c true otherwise
1065  */
1066 
1067 void
1068 isc__nmsocket_clearcb(isc_nmsocket_t *sock);
1069 /*%<
1070  * Clear the recv and accept callbacks in 'sock'.
1071  */
1072 
1073 void
1074 isc__nmsocket_timer_stop(isc_nmsocket_t *sock);
1075 void
1076 isc__nmsocket_timer_start(isc_nmsocket_t *sock);
1077 void
1078 isc__nmsocket_timer_restart(isc_nmsocket_t *sock);
1079 bool
1080 isc__nmsocket_timer_running(isc_nmsocket_t *sock);
1081 /*%<
1082  * Start/stop/restart/check the timeout on the socket
1083  */
1084 
1085 void
1086 isc__nm_connectcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq,
1087 		  isc_result_t eresult, bool async);
1088 
1089 void
1090 isc__nm_async_connectcb(isc__networker_t *worker, isc__netievent_t *ev0);
/*%<
 * Issue a connect callback on the socket, used to call the callback
 * with 'eresult' as the result of the connect attempt.
 */
1094 
1095 void
1096 isc__nm_readcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq,
1097 	       isc_result_t eresult);
1098 void
1099 isc__nm_async_readcb(isc__networker_t *worker, isc__netievent_t *ev0);
1100 
1101 /*%<
1102  * Issue a read callback on the socket, used to call the callback
1103  * on failed conditions when the event can't be scheduled on the uv loop.
1104  *
1105  */
1106 
1107 void
1108 isc__nm_sendcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq,
1109 	       isc_result_t eresult, bool async);
1110 void
1111 isc__nm_async_sendcb(isc__networker_t *worker, isc__netievent_t *ev0);
1112 /*%<
1113  * Issue a write callback on the socket, used to call the callback
1114  * on failed conditions when the event can't be scheduled on the uv loop.
1115  */
1116 
1117 void
1118 isc__nm_async_shutdown(isc__networker_t *worker, isc__netievent_t *ev0);
1119 /*%<
1120  * Walk through all uv handles, get the underlying sockets and issue
1121  * close on them.
1122  */
1123 
1124 void
1125 isc__nm_udp_send(isc_nmhandle_t *handle, const isc_region_t *region,
1126 		 isc_nm_cb_t cb, void *cbarg);
1127 /*%<
1128  * Back-end implementation of isc_nm_send() for UDP handles.
1129  */
1130 
1131 void
1132 isc__nm_udp_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg);
/*%<
 * Back-end implementation of isc_nm_read() for UDP handles.
 */
1136 
1137 void
1138 isc__nm_udp_close(isc_nmsocket_t *sock);
1139 /*%<
1140  * Close a UDP socket.
1141  */
1142 
1143 void
1144 isc__nm_udp_cancelread(isc_nmhandle_t *handle);
1145 /*%<
1146  * Stop reading on a connected UDP handle.
1147  */
1148 
1149 void
1150 isc__nm_udp_shutdown(isc_nmsocket_t *sock);
1151 /*%<
1152  * Called during the shutdown process to close and clean up connected
1153  * sockets.
1154  */
1155 
1156 void
1157 isc__nm_udp_stoplistening(isc_nmsocket_t *sock);
1158 /*%<
1159  * Stop listening on 'sock'.
1160  */
1161 
1162 void
1163 isc__nm_udp_settimeout(isc_nmhandle_t *handle, uint32_t timeout);
1164 /*%<
1165  * Set or clear the recv timeout for the UDP socket associated with 'handle'.
1166  */
1167 
1168 void
1169 isc__nm_async_udplisten(isc__networker_t *worker, isc__netievent_t *ev0);
1170 void
1171 isc__nm_async_udpconnect(isc__networker_t *worker, isc__netievent_t *ev0);
1172 void
1173 isc__nm_async_udpstop(isc__networker_t *worker, isc__netievent_t *ev0);
1174 void
1175 isc__nm_async_udpsend(isc__networker_t *worker, isc__netievent_t *ev0);
1176 void
1177 isc__nm_async_udpread(isc__networker_t *worker, isc__netievent_t *ev0);
1178 void
1179 isc__nm_async_udpcancel(isc__networker_t *worker, isc__netievent_t *ev0);
1180 void
1181 isc__nm_async_udpclose(isc__networker_t *worker, isc__netievent_t *ev0);
/*%<
 * Callback handlers for asynchronous UDP events (listen, connect,
 * stoplisten, send, read, cancel, close).
 */
1185 
1186 void
1187 isc__nm_tcp_send(isc_nmhandle_t *handle, const isc_region_t *region,
1188 		 isc_nm_cb_t cb, void *cbarg);
1189 /*%<
1190  * Back-end implementation of isc_nm_send() for TCP handles.
1191  */
1192 
1193 void
1194 isc__nm_tcp_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg);
/*%<
 * Back-end implementation of isc_nm_read() for TCP handles.
 */
1198 
1199 void
1200 isc__nm_tcp_close(isc_nmsocket_t *sock);
1201 /*%<
1202  * Close a TCP socket.
1203  */
1204 void
1205 isc__nm_tcp_pauseread(isc_nmhandle_t *handle);
1206 /*%<
1207  * Pause reading on this handle, while still remembering the callback.
1208  */
1209 
1210 void
1211 isc__nm_tcp_resumeread(isc_nmhandle_t *handle);
1212 /*%<
1213  * Resume reading from socket.
1214  *
1215  */
1216 
1217 void
1218 isc__nm_tcp_shutdown(isc_nmsocket_t *sock);
1219 /*%<
1220  * Called during the shutdown process to close and clean up connected
1221  * sockets.
1222  */
1223 
1224 void
1225 isc__nm_tcp_cancelread(isc_nmhandle_t *handle);
1226 /*%<
1227  * Stop reading on a connected TCP handle.
1228  */
1229 
1230 void
1231 isc__nm_tcp_stoplistening(isc_nmsocket_t *sock);
1232 /*%<
1233  * Stop listening on 'sock'.
1234  */
1235 
1236 int_fast32_t
1237 isc__nm_tcp_listener_nactive(isc_nmsocket_t *sock);
1238 /*%<
1239  * Returns the number of active connections for the TCP listener socket.
1240  */
1241 
1242 void
1243 isc__nm_tcp_settimeout(isc_nmhandle_t *handle, uint32_t timeout);
1244 /*%<
1245  * Set the read timeout for the TCP socket associated with 'handle'.
1246  */
1247 
1248 void
1249 isc__nm_async_tcpconnect(isc__networker_t *worker, isc__netievent_t *ev0);
1250 void
1251 isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0);
1252 void
1253 isc__nm_async_tcpaccept(isc__networker_t *worker, isc__netievent_t *ev0);
1254 void
1255 isc__nm_async_tcpstop(isc__networker_t *worker, isc__netievent_t *ev0);
1256 void
1257 isc__nm_async_tcpsend(isc__networker_t *worker, isc__netievent_t *ev0);
1258 void
1259 isc__nm_async_startread(isc__networker_t *worker, isc__netievent_t *ev0);
1260 void
1261 isc__nm_async_pauseread(isc__networker_t *worker, isc__netievent_t *ev0);
1262 void
1263 isc__nm_async_tcpstartread(isc__networker_t *worker, isc__netievent_t *ev0);
1264 void
1265 isc__nm_async_tcppauseread(isc__networker_t *worker, isc__netievent_t *ev0);
1266 void
1267 isc__nm_async_tcpcancel(isc__networker_t *worker, isc__netievent_t *ev0);
1268 void
1269 isc__nm_async_tcpclose(isc__networker_t *worker, isc__netievent_t *ev0);
/*%<
 * Callback handlers for asynchronous TCP events (connect, listen,
 * accept, stoplisten, send, read, pause, cancel, close).
 */
1274 
1275 void
1276 isc__nm_async_tcpdnsaccept(isc__networker_t *worker, isc__netievent_t *ev0);
1277 void
1278 isc__nm_async_tcpdnsconnect(isc__networker_t *worker, isc__netievent_t *ev0);
1279 void
1280 isc__nm_async_tcpdnslisten(isc__networker_t *worker, isc__netievent_t *ev0);
1281 
1282 void
1283 isc__nm_tcpdns_send(isc_nmhandle_t *handle, isc_region_t *region,
1284 		    isc_nm_cb_t cb, void *cbarg);
1285 /*%<
1286  * Back-end implementation of isc_nm_send() for TCPDNS handles.
1287  */
1288 
1289 void
1290 isc__nm_tcpdns_shutdown(isc_nmsocket_t *sock);
1291 
1292 void
1293 isc__nm_tcpdns_close(isc_nmsocket_t *sock);
1294 /*%<
1295  * Close a TCPDNS socket.
1296  */
1297 
1298 void
1299 isc__nm_tcpdns_stoplistening(isc_nmsocket_t *sock);
1300 /*%<
1301  * Stop listening on 'sock'.
1302  */
1303 
1304 void
1305 isc__nm_tcpdns_settimeout(isc_nmhandle_t *handle, uint32_t timeout);
1306 /*%<
1307  * Set the read timeout and reset the timer for the TCPDNS socket
1308  * associated with 'handle', and the TCP socket it wraps around.
1309  */
1310 
1311 void
1312 isc__nm_async_tcpdnsaccept(isc__networker_t *worker, isc__netievent_t *ev0);
1313 void
1314 isc__nm_async_tcpdnsconnect(isc__networker_t *worker, isc__netievent_t *ev0);
1315 void
1316 isc__nm_async_tcpdnslisten(isc__networker_t *worker, isc__netievent_t *ev0);
1317 void
1318 isc__nm_async_tcpdnscancel(isc__networker_t *worker, isc__netievent_t *ev0);
1319 void
1320 isc__nm_async_tcpdnsclose(isc__networker_t *worker, isc__netievent_t *ev0);
1321 void
1322 isc__nm_async_tcpdnssend(isc__networker_t *worker, isc__netievent_t *ev0);
1323 void
1324 isc__nm_async_tcpdnsstop(isc__networker_t *worker, isc__netievent_t *ev0);
1325 void
1326 isc__nm_async_tcpdnsread(isc__networker_t *worker, isc__netievent_t *ev0);
1327 /*%<
1328  * Callback handlers for asynchronous TCPDNS events.
1329  */
1330 
1331 void
1332 isc__nm_tcpdns_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg);
/*%<
 * Back-end implementation of isc_nm_read() for TCPDNS handles.
 */
1336 
1337 void
1338 isc__nm_tcpdns_cancelread(isc_nmhandle_t *handle);
1339 /*%<
1340  * Stop reading on a connected TCPDNS handle.
1341  */
1342 
1343 #define isc__nm_uverr2result(x) \
1344 	isc___nm_uverr2result(x, true, __FILE__, __LINE__, __func__)
1345 isc_result_t
1346 isc___nm_uverr2result(int uverr, bool dolog, const char *file,
1347 		      unsigned int line, const char *func);
1348 /*%<
1349  * Convert a libuv error value into an isc_result_t.  The
1350  * list of supported error values is not complete; new users
1351  * of this function should add any expected errors that are
1352  * not already there.
1353  */
1354 
1355 bool
1356 isc__nm_acquire_interlocked(isc_nm_t *mgr);
1357 /*%<
1358  * Try to acquire interlocked state; return true if successful.
1359  */
1360 
1361 void
1362 isc__nm_drop_interlocked(isc_nm_t *mgr);
1363 /*%<
1364  * Drop interlocked state; signal waiters.
1365  */
1366 
1367 void
1368 isc__nm_acquire_interlocked_force(isc_nm_t *mgr);
1369 /*%<
1370  * Actively wait for interlocked state.
1371  */
1372 
1373 void
1374 isc__nm_incstats(isc_nm_t *mgr, isc_statscounter_t counterid);
1375 /*%<
1376  * Increment socket-related statistics counters.
1377  */
1378 
1379 void
1380 isc__nm_decstats(isc_nm_t *mgr, isc_statscounter_t counterid);
1381 /*%<
1382  * Decrement socket-related statistics counters.
1383  */
1384 
1385 isc_result_t
1386 isc__nm_socket(int domain, int type, int protocol, uv_os_sock_t *sockp);
1387 /*%<
1388  * Platform independent socket() version
1389  */
1390 
1391 void
1392 isc__nm_closesocket(uv_os_sock_t sock);
1393 /*%<
1394  * Platform independent closesocket() version
1395  */
1396 
1397 isc_result_t
1398 isc__nm_socket_freebind(uv_os_sock_t fd, sa_family_t sa_family);
1399 /*%<
1400  * Set the IP_FREEBIND (or equivalent) socket option on the uv_handle
1401  */
1402 
1403 isc_result_t
1404 isc__nm_socket_reuse(uv_os_sock_t fd);
1405 /*%<
1406  * Set the SO_REUSEADDR or SO_REUSEPORT (or equivalent) socket option on the fd
1407  */
1408 
1409 isc_result_t
1410 isc__nm_socket_reuse_lb(uv_os_sock_t fd);
1411 /*%<
1412  * Set the SO_REUSEPORT_LB (or equivalent) socket option on the fd
1413  */
1414 
1415 isc_result_t
1416 isc__nm_socket_incoming_cpu(uv_os_sock_t fd);
1417 /*%<
1418  * Set the SO_INCOMING_CPU socket option on the fd if available
1419  */
1420 
1421 isc_result_t
1422 isc__nm_socket_disable_pmtud(uv_os_sock_t fd, sa_family_t sa_family);
1423 /*%<
1424  * Disable the Path MTU Discovery, either by disabling IP(V6)_DONTFRAG socket
1425  * option, or setting the IP(V6)_MTU_DISCOVER socket option to IP_PMTUDISC_OMIT
1426  */
1427 
1428 isc_result_t
1429 isc__nm_socket_connectiontimeout(uv_os_sock_t fd, int timeout_ms);
1430 /*%<
1431  * Set the connection timeout in milliseconds, on non-Linux platforms,
1432  * the minimum value must be at least 1000 (1 second).
1433  */
1434 
1435 isc_result_t
1436 isc__nm_socket_tcp_nodelay(uv_os_sock_t fd);
1437 /*%<
1438  * Disables Nagle's algorithm on a TCP socket (sets TCP_NODELAY).
1439  */
1440 
1441 /*
1442  * typedef all the netievent types
1443  */
1444 
1445 NETIEVENT_SOCKET_TYPE(close);
1446 NETIEVENT_SOCKET_TYPE(tcpclose);
1447 NETIEVENT_SOCKET_TYPE(tcplisten);
1448 NETIEVENT_SOCKET_TYPE(tcppauseread);
1449 NETIEVENT_SOCKET_TYPE(tcpstop);
1450 NETIEVENT_SOCKET_TYPE(udpclose);
1451 NETIEVENT_SOCKET_TYPE(udplisten);
1452 NETIEVENT_SOCKET_TYPE(udpread);
1453 /* NETIEVENT_SOCKET_TYPE(udpsend); */ /* unique type, defined independently */
1454 NETIEVENT_SOCKET_TYPE(udpstop);
1455 
1456 NETIEVENT_SOCKET_TYPE(tcpdnsclose);
1457 NETIEVENT_SOCKET_TYPE(tcpdnsread);
1458 NETIEVENT_SOCKET_TYPE(tcpdnsstop);
1459 NETIEVENT_SOCKET_TYPE(tcpdnslisten);
1460 NETIEVENT_SOCKET_REQ_TYPE(tcpdnsconnect);
1461 NETIEVENT_SOCKET_REQ_TYPE(tcpdnssend);
1462 NETIEVENT_SOCKET_HANDLE_TYPE(tcpdnscancel);
1463 NETIEVENT_SOCKET_QUOTA_TYPE(tcpdnsaccept);
1464 
1465 NETIEVENT_SOCKET_REQ_TYPE(tcpconnect);
1466 NETIEVENT_SOCKET_REQ_TYPE(tcpsend);
1467 NETIEVENT_SOCKET_TYPE(tcpstartread);
1468 NETIEVENT_SOCKET_REQ_TYPE(udpconnect);
1469 
1470 NETIEVENT_SOCKET_REQ_RESULT_TYPE(connectcb);
1471 NETIEVENT_SOCKET_REQ_RESULT_TYPE(readcb);
1472 NETIEVENT_SOCKET_REQ_RESULT_TYPE(sendcb);
1473 
1474 NETIEVENT_SOCKET_HANDLE_TYPE(detach);
1475 NETIEVENT_SOCKET_HANDLE_TYPE(tcpcancel);
1476 NETIEVENT_SOCKET_HANDLE_TYPE(udpcancel);
1477 
1478 NETIEVENT_SOCKET_QUOTA_TYPE(tcpaccept);
1479 
1480 NETIEVENT_TYPE(pause);
1481 NETIEVENT_TYPE(resume);
1482 NETIEVENT_TYPE(shutdown);
1483 NETIEVENT_TYPE(stop);
1484 
1485 NETIEVENT_TASK_TYPE(task);
1486 NETIEVENT_TASK_TYPE(privilegedtask);
1487 
/* Now declare the helper functions */
1489 
1490 NETIEVENT_SOCKET_DECL(close);
1491 NETIEVENT_SOCKET_DECL(tcpclose);
1492 NETIEVENT_SOCKET_DECL(tcplisten);
1493 NETIEVENT_SOCKET_DECL(tcppauseread);
1494 NETIEVENT_SOCKET_DECL(tcpstartread);
1495 NETIEVENT_SOCKET_DECL(tcpstop);
1496 NETIEVENT_SOCKET_DECL(udpclose);
1497 NETIEVENT_SOCKET_DECL(udplisten);
1498 NETIEVENT_SOCKET_DECL(udpread);
1499 NETIEVENT_SOCKET_DECL(udpsend);
1500 NETIEVENT_SOCKET_DECL(udpstop);
1501 
1502 NETIEVENT_SOCKET_DECL(tcpdnsclose);
1503 NETIEVENT_SOCKET_DECL(tcpdnsread);
1504 NETIEVENT_SOCKET_DECL(tcpdnsstop);
1505 NETIEVENT_SOCKET_DECL(tcpdnslisten);
1506 NETIEVENT_SOCKET_REQ_DECL(tcpdnsconnect);
1507 NETIEVENT_SOCKET_REQ_DECL(tcpdnssend);
1508 NETIEVENT_SOCKET_HANDLE_DECL(tcpdnscancel);
1509 NETIEVENT_SOCKET_QUOTA_DECL(tcpdnsaccept);
1510 
1511 NETIEVENT_SOCKET_REQ_DECL(tcpconnect);
1512 NETIEVENT_SOCKET_REQ_DECL(tcpsend);
1513 NETIEVENT_SOCKET_REQ_DECL(udpconnect);
1514 
1515 NETIEVENT_SOCKET_REQ_RESULT_DECL(connectcb);
1516 NETIEVENT_SOCKET_REQ_RESULT_DECL(readcb);
1517 NETIEVENT_SOCKET_REQ_RESULT_DECL(sendcb);
1518 
1519 NETIEVENT_SOCKET_HANDLE_DECL(udpcancel);
1520 NETIEVENT_SOCKET_HANDLE_DECL(tcpcancel);
1521 NETIEVENT_SOCKET_DECL(detach);
1522 
1523 NETIEVENT_SOCKET_QUOTA_DECL(tcpaccept);
1524 
1525 NETIEVENT_DECL(pause);
1526 NETIEVENT_DECL(resume);
1527 NETIEVENT_DECL(shutdown);
1528 NETIEVENT_DECL(stop);
1529 
1530 NETIEVENT_TASK_DECL(task);
1531 NETIEVENT_TASK_DECL(privilegedtask);
1532 
1533 void
1534 isc__nm_udp_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result);
1535 void
1536 isc__nm_tcp_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result);
1537 void
1538 isc__nm_tcpdns_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result);
1539 
1540 isc_result_t
1541 isc__nm_tcpdns_processbuffer(isc_nmsocket_t *sock);
1542 
1543 isc__nm_uvreq_t *
1544 isc__nm_get_read_req(isc_nmsocket_t *sock, isc_sockaddr_t *sockaddr);
1545 
void
isc__nm_alloc_cb(uv_handle_t *handle, size_t size, uv_buf_t *buf);
/*%<
 * Buffer allocation callback; the parameter list matches libuv's
 * uv_alloc_cb type.
 */

void
isc__nm_udp_read_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf,
		    const struct sockaddr *addr, unsigned flags);
void
isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf);
void
isc__nm_tcpdns_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf);
/*%<
 * Read callbacks for UDP, TCP and TCPDNS.  The UDP variant has the
 * parameter list of libuv's uv_udp_recv_cb type; the TCP and TCPDNS
 * variants have the parameter list of libuv's uv_read_cb type.
 */
1556 
1557 void
1558 isc__nm_start_reading(isc_nmsocket_t *sock);
1559 void
1560 isc__nm_stop_reading(isc_nmsocket_t *sock);
1561 void
1562 isc__nm_process_sock_buffer(isc_nmsocket_t *sock);
1563 void
1564 isc__nm_resume_processing(void *arg);
1565 bool
1566 isc__nmsocket_closing(isc_nmsocket_t *sock);
1567 bool
1568 isc__nm_closing(isc_nmsocket_t *sock);
1569 
1570 void
1571 isc__nm_alloc_dnsbuf(isc_nmsocket_t *sock, size_t len);
1572 
1573 void
1574 isc__nm_failed_send_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req,
1575 		       isc_result_t eresult);
1576 void
1577 isc__nm_failed_accept_cb(isc_nmsocket_t *sock, isc_result_t eresult);
1578 void
1579 isc__nm_failed_connect_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req,
1580 			  isc_result_t eresult, bool async);
1581 void
1582 isc__nm_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result, bool async);
1583 
1584 void
1585 isc__nm_accept_connection_log(isc_result_t result, bool can_log_quota);
1586 
1587 /*
1588  * Timeout callbacks
1589  */
1590 void
1591 isc__nmsocket_connecttimeout_cb(uv_timer_t *timer);
1592 void
1593 isc__nmsocket_readtimeout_cb(uv_timer_t *timer);
1594 void
1595 isc__nmsocket_writetimeout_cb(uv_timer_t *timer);
1596 
/*%
 * Maximum number of simultaneous handles in flight supported for a single
 * connected TCPDNS socket. This value was chosen arbitrarily, and may be
 * changed in the future.
 */
1603 #define STREAM_CLIENTS_PER_CONN 23
1604 
/*%
 * Report a fatal error through isc_error_fatal() when a libuv call failed.
 *
 * 'func' names the libuv function that produced 'ret'; it is only
 * stringified for the error message and is never called or evaluated.
 * 'ret' is the value that function returned: any value other than 0 is
 * treated as a failure and is passed to uv_strerror() to build the message.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement (and is therefore safe inside an unbraced if/else),
 * and 'ret' is evaluated exactly once.  Invoke it with a trailing ';'.
 */
#define UV_RUNTIME_CHECK(func, ret)                                            \
	do {                                                                   \
		int uv_runtime_check_ret_ = (ret);                             \
		if (uv_runtime_check_ret_ != 0) {                              \
			isc_error_fatal(__FILE__, __LINE__, "%s failed: %s\n", \
					#func,                                 \
					uv_strerror(uv_runtime_check_ret_));   \
		}                                                              \
	} while (0)
1610