/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
  #include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#include "ixfr.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif
#include "verify.h"
#include "util/proxy_protocol.h"

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_DNSTAP
/*
 * log_addr() - print the contents of a sockaddr_in/sockaddr_in6 structure,
 * just as it is done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
 */
static void
log_addr(const char* descr,
#ifdef INET6
	struct sockaddr_storage* addr
#else
	struct sockaddr_in* addr
#endif
	)
{
	char str_buf[64];
	if(verbosity < 6)
		return;
	if(
#ifdef INET6
		addr->ss_family == AF_INET
#else
		addr->sin_family == AF_INET
#endif
		) {
		struct sockaddr_in* s = (struct sockaddr_in*)addr;
		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
#ifdef INET6
	} else {
		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
#endif
	}
}
#endif /* USE_DNSTAP */

#ifdef USE_TCP_FASTOPEN
  #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
  #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
#endif

/* header state for the PROXYv2 header (for TCP) */
enum pp2_header_state {
	/* no header encountered yet */
	pp2_header_none = 0,
	/* read the static part of the header */
	pp2_header_init,
	/* read the full header */
	pp2_header_done
};
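
/* A pp2_enabled connection is expected to walk these states in order:
 * none -> init once the fixed-size start of the PROXYv2 header (the
 * 12-octet signature plus the version/command, family and length
 * fields) has been read, and init -> done once the variable-length
 * remainder (addresses and any TLVs) has been consumed; only then is
 * the payload handled as DNS. */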

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd        *nsd;
	struct nsd_socket *socket;
	struct event       event;
	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;
};

struct tcp_accept_handler_data {
	struct nsd        *nsd;
	struct nsd_socket *socket;
	int                event_added;
	struct event       event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int                tls_accept;
#endif
	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int  msg_len;
};
#endif

static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
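/* These preallocated arrays let one wakeup service up to
 * NUM_RECV_PER_SELECT datagrams at a time, with msgs[i].msg_hdr
 * pointing at iovecs[i] and the packet buffer of queries[i]; the
 * struct mmsghdr fallback above keeps the same layout on systems
 * that lack the real recvmmsg()/sendmmsg() type. */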

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler.  When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/* If the query is restarted and needs a reset */
	int query_needs_reset;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef USE_DNSTAP
	/* the accept socket, used to find the proper service (local) address this connection is bound to. */
	struct nsd_socket *socket;
#endif /* USE_DNSTAP */

	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;

	/* header state for the PROXYv2 header (for TCP) */
	enum pp2_header_state pp2_header_state;

#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets.  These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type.  This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit command without blocking, then close the pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];
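/* compressed_dname_offsets maps a domain's number to the offset in the
 * answer packet where that owner name was written, so that later
 * occurrences of the name can be emitted as RFC 1035 compression
 * pointers instead of the full name. */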

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config() {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	close(tcp_fastopen_fp);
}
#endif

/*
 * Remove the specified pid from the list of child pids.  Returns -1 if
 * the pid is not in the list, child_num otherwise.  The field is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge");
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACH */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
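	/* e.g. with st_period 60 and time(NULL) % 60 == 23, this is
	 * alarm(37), which fires exactly on the minute boundary */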
	if(nsd->st_period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st_period - (time(NULL) % nsd->st_period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
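	/* the lseek+write of a single zero byte extends each file to sz
	 * bytes, so the PROT_WRITE mappings below are fully backed by the
	 * file and accesses do not fault */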
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !HAVE MREMAP */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; this avoids both sets writing to the
 * same statistics arrays. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

#ifdef BIND8_STATS
void
server_stat_alloc(struct nsd* nsd)
{
	char tmpfile[256];
	size_t sz = sizeof(struct nsdst) * nsd->child_count * 2;
	uint8_t z = 0;

	/* file name */
	nsd->statfname = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.stat",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->statfname = region_strdup(nsd->region, tmpfile);

	/* file descriptor */
	nsd->statfd = open(nsd->statfname, O_CREAT|O_RDWR, 0600);
	if(nsd->statfd == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->statfname,
			strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->statfd, (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->statfname,
			strerror(errno));
		goto fail_exit;
	}
	if(write(nsd->statfd, &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->statfname, strerror(errno));
		goto fail_exit;
	}
	nsd->stat_map = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->statfd, 0);
	if(nsd->stat_map == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
fail_exit:
		close(nsd->statfd);
		unlink(nsd->statfname);
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->stat_map, 0, sz);
	nsd->stats_per_child[0] = nsd->stat_map;
	nsd->stats_per_child[1] = &nsd->stat_map[nsd->child_count];
	nsd->stat_current = 0;
	nsd->st = &nsd->stats_per_child[nsd->stat_current][0];
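	/* the mapping holds two banks of child_count counter blocks;
	 * stat_current selects the bank the children currently write to,
	 * so the other bank can be read and cleared without racing them */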
#endif /* HAVE_MMAP */
}
#endif /* BIND8_STATS */

#ifdef BIND8_STATS
void
server_stat_free(struct nsd* nsd)
{
	unlink(nsd->statfname);
}
#endif /* BIND8_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

static int
set_cloexec(struct nsd_socket *sock)
{
	assert(sock != NULL);

	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
		const char *socktype =
			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
	int on = 1;
#ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT on Linux. This is what the users want with the config
	 * option in nsd.conf; if we actually need local address and port reuse
	 * they'll also need to have SO_REUSEPORT set for them, assume it was
	 * _LB they want.
	 */
	int opt = SO_REUSEPORT_LB;
	static const char optname[] = "SO_REUSEPORT_LB";
#else /* !SO_REUSEPORT_LB */
	int opt = SO_REUSEPORT;
	static const char optname[] = "SO_REUSEPORT";
#endif /* SO_REUSEPORT_LB */

	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
		return 1;
	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
			optname, strerror(errno));
	}
	return -1;
#else
	(void)sock;
#endif /* SO_REUSEPORT */

	return 0;
}

static int
set_reuseaddr(struct nsd_socket *sock)
{
#ifdef SO_REUSEADDR
	int on = 1;
	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_REUSEADDR */
	return 0;
}

static int
set_rcvbuf(struct nsd_socket *sock, int rcv)
{
#ifdef SO_RCVBUF
#ifdef SO_RCVBUFFORCE
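	/* Linux-only: SO_RCVBUFFORCE may exceed the net.core.rmem_max
	 * limit but requires CAP_NET_ADMIN; without that privilege the
	 * call fails with EPERM, which is treated as non-fatal here */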
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_RCVBUFFORCE */
	if (0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_RCVBUFFORCE */
#endif /* SO_RCVBUF */

	return 0;
}

static int
set_sndbuf(struct nsd_socket *sock, int snd)
{
#ifdef SO_SNDBUF
#ifdef SO_SNDBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_SNDBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_SNDBUFFORCE */
#endif /* SO_SNDBUF */

	return 0;
}

static int
set_nonblock(struct nsd_socket *sock)
{
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
		log_msg(LOG_ERR, "fcntl(..., O_NONBLOCK) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

#ifdef INET6
static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef IPV6_V6ONLY
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_V6ONLY */

	return 0;
}
#endif /* INET6 */

#ifdef INET6
static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network. Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
	 * to the MIN MTU to get the same.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_USE_MIN_MTU || IPV6_MTU */

	return 0;
}
#endif /* INET6 */

static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed if
	 * and only if the packet size exceeds the outgoing interface MTU or
	 * the packet encounters smaller MTU link in network. This mitigates
	 * DNS fragmentation attacks by preventing forged PMTU information.
	 * FreeBSD already has same semantics without setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platforms manage this setting
	differently for different address families (IPv4 vs IPv6).
	The scandalous preprocessor blob below abstracts this variability
	in a way that leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
// as of 2020-01, Linux does not support this on IPv6 programmatically
#elif defined(SO_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
#elif defined(IP_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
#	endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
	return -1;
#endif

	return 0;
}

static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. Limit is
	 * a defense against IP spoofing attacks as suggested in RFC7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* macOS X implementation only supports qlen of 1 via this call. The
	 * actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
				 "; this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
		                 "SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
	int rcv = 1*1024*1024, snd = 1*1024*1024;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(nsd->options->receive_buffer_size > 0)
		rcv = nsd->options->receive_buffer_size;
	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(nsd->options->send_buffer_size > 0)
		snd = nsd->options->send_buffer_size;
	if(set_sndbuf(sock, snd) == -1)
		return -1;
#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable.
	 */
	set_nonblock(sock);

	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}

static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
			                     "not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
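		/* e.g. two listening addresses with reuseport: 4 gives
		 * ifs == 8; entries 2..7 become clones of entries 0..1,
		 * each UDP clone opening its own descriptor below */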
1438 		region_remove_cleanup(nsd->region, free, nsd->udp);
1439 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1440 
1441 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1442 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1443 		region_add_cleanup(nsd->region, free, nsd->udp);
1444 		region_add_cleanup(nsd->region, free, nsd->tcp);
1445 		if(ifs > nsd->ifs) {
1446 			memset(&nsd->udp[nsd->ifs], 0,
1447 				(ifs-nsd->ifs)*sizeof(*nsd->udp));
1448 			memset(&nsd->tcp[nsd->ifs], 0,
1449 				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
1450 		}
1451 
1452 		for(i = nsd->ifs; i < ifs; i++) {
1453 			nsd->udp[i] = nsd->udp[i%nsd->ifs];
1454 			nsd->udp[i].s = -1;
1455 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1456 				return -1;
1457 			}
1458 			/* Turn off REUSEPORT for TCP by copying the socket
1459 			 * file descriptor.
1460 			 * This means we should not close TCP used by
1461 			 * other servers in reuseport enabled mode, in
1462 			 * server_child().
1463 			 */
1464 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1465 		}
1466 
1467 		nsd->ifs = ifs;
1468 	} else {
1469 		nsd->reuseport = 0;
1470 	}
1471 
1472 	/* open server interface ports for verifiers */
1473 	for(i = 0; i < nsd->verify_ifs; i++) {
1474 		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
1475 		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
1476 		{
1477 			return -1;
1478 		}
1479 	}
1480 
1481 	return 0;
1482 }
1483 
1484 /*
1485  * Prepare the server for take off.
1486  *
1487  */
1488 int
1489 server_prepare(struct nsd *nsd)
1490 {
1491 #ifdef RATELIMIT
1492 	/* set secret modifier for hashing (rate limits) */
1493 #ifdef HAVE_GETRANDOM
1494 	uint32_t v;
1495 	if(getrandom(&v, sizeof(v), 0) == -1) {
1496 		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
1497 		exit(1);
1498 	}
1499 	hash_set_raninit(v);
1500 #elif defined(HAVE_ARC4RANDOM)
1501 	hash_set_raninit(arc4random());
1502 #else
1503 	uint32_t v = getpid() ^ time(NULL);
1504 	srandom((unsigned long)v);
1505 #  ifdef HAVE_SSL
1506 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1507 		hash_set_raninit(v);
1508 	else
1509 #  endif
1510 		hash_set_raninit(random());
1511 #endif
1512 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1513 		nsd->options->rrl_ratelimit,
1514 		nsd->options->rrl_whitelist_ratelimit,
1515 		nsd->options->rrl_slip,
1516 		nsd->options->rrl_ipv4_prefix_length,
1517 		nsd->options->rrl_ipv6_prefix_length);
1518 #endif /* RATELIMIT */
1519 
1520 	/* Open the database... */
1521 	if ((nsd->db = namedb_open(nsd->options)) == NULL) {
1522 		log_msg(LOG_ERR, "unable to open the database: %s", strerror(errno));
1523 		unlink(nsd->task[0]->fname);
1524 		unlink(nsd->task[1]->fname);
1525 #ifdef USE_ZONE_STATS
1526 		unlink(nsd->zonestatfname[0]);
1527 		unlink(nsd->zonestatfname[1]);
1528 #endif
1529 #ifdef BIND8_STATS
1530 		server_stat_free(nsd);
1531 #endif
1532 		xfrd_del_tempdir(nsd);
1533 		return -1;
1534 	}
1535 	/* check if zone files can be read */
1536 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1537 	 * for all zones */
1538 	namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1539 	zonestatid_tree_set(nsd);
1540 
1541 	compression_table_capacity = 0;
1542 	initialize_dname_compression_tables(nsd);
1543 
1544 #ifdef	BIND8_STATS
1545 	/* Initialize times... */
1546 	time(&nsd->st->boot);
1547 	set_bind8_alarm(nsd);
1548 #endif /* BIND8_STATS */
1549 
1550 	return 0;
1551 }
1552 
1553 /*
1554  * Fork the required number of servers.
1555  */
1556 static int
1557 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1558 	int* xfrd_sock_p)
1559 {
1560 	size_t i;
1561 
1562 	/* Start all child servers initially.  */
1563 	for (i = 0; i < nsd->child_count; ++i) {
1564 		nsd->children[i].pid = 0;
1565 	}
1566 
1567 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1568 }
1569 
1570 static void
1571 server_close_socket(struct nsd_socket *sock)
1572 {
1573 	if(sock->s != -1) {
1574 		close(sock->s);
1575 		sock->s = -1;
1576 	}
1577 }
1578 
1579 void
1580 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1581 {
1582 	size_t i;
1583 
1584 	/* Close all the sockets... */
1585 	for (i = 0; i < n; ++i) {
1586 		server_close_socket(&sockets[i]);
1587 	}
1588 }
1589 
1590 /*
1591  * Close the sockets, shutdown the server and exit.
1592  * Does not return.
1593  */
1594 void
1595 server_shutdown(struct nsd *nsd)
1596 {
1597 	size_t i;
1598 
1599 	server_close_all_sockets(nsd->udp, nsd->ifs);
1600 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1601 	/* CHILD: close command channel to parent */
1602 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1603 	{
1604 		close(nsd->this_child->parent_fd);
1605 		nsd->this_child->parent_fd = -1;
1606 	}
1607 	/* SERVER: close command channels to children */
1608 	if(!nsd->this_child)
1609 	{
1610 		for(i=0; i < nsd->child_count; ++i)
1611 			if(nsd->children[i].child_fd != -1)
1612 			{
1613 				close(nsd->children[i].child_fd);
1614 				nsd->children[i].child_fd = -1;
1615 			}
1616 	}
1617 
1618 	tsig_finalize();
1619 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1620 #ifdef HAVE_SSL
1621 	if (nsd->tls_ctx)
1622 		SSL_CTX_free(nsd->tls_ctx);
1623 #endif
1624 
1625 #ifdef MEMCLEAN /* OS collects memory pages */
1626 #ifdef RATELIMIT
1627 	rrl_mmap_deinit_keep_mmap();
1628 #endif
1629 #ifdef USE_DNSTAP
1630 	dt_collector_destroy(nsd->dt_collector, nsd);
1631 #endif
1632 	udb_base_free_keep_mmap(nsd->task[0]);
1633 	udb_base_free_keep_mmap(nsd->task[1]);
1634 	namedb_free_ixfr(nsd->db);
1635 	namedb_close(nsd->db);
1636 	nsd_options_destroy(nsd->options);
1637 	region_destroy(nsd->region);
1638 #endif
1639 	log_finalize();
1640 	exit(0);
1641 }
1642 
1643 void
1644 server_prepare_xfrd(struct nsd* nsd)
1645 {
1646 	char tmpfile[256];
1647 	/* create task mmaps */
1648 	nsd->mytask = 0;
1649 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1650 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1651 	nsd->task[0] = task_file_create(tmpfile);
1652 	if(!nsd->task[0]) {
1653 #ifdef USE_ZONE_STATS
1654 		unlink(nsd->zonestatfname[0]);
1655 		unlink(nsd->zonestatfname[1]);
1656 #endif
1657 #ifdef BIND8_STATS
1658 		server_stat_free(nsd);
1659 #endif
1660 		xfrd_del_tempdir(nsd);
1661 		exit(1);
1662 	}
1663 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1664 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1665 	nsd->task[1] = task_file_create(tmpfile);
1666 	if(!nsd->task[1]) {
1667 		unlink(nsd->task[0]->fname);
1668 #ifdef USE_ZONE_STATS
1669 		unlink(nsd->zonestatfname[0]);
1670 		unlink(nsd->zonestatfname[1]);
1671 #endif
1672 #ifdef BIND8_STATS
1673 		server_stat_free(nsd);
1674 #endif
1675 		xfrd_del_tempdir(nsd);
1676 		exit(1);
1677 	}
1678 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1679 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1680 	/* create xfrd listener structure */
1681 	nsd->xfrd_listener = region_alloc(nsd->region,
1682 		sizeof(netio_handler_type));
1683 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1684 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1685 	nsd->xfrd_listener->fd = -1;
1686 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1687 		nsd;
1688 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1689 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1690 }
1691 
1692 
1693 void
1694 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1695 {
1696 	pid_t pid;
1697 	int sockets[2] = {0,0};
1698 	struct ipc_handler_conn_data *data;
1699 
1700 	if(nsd->xfrd_listener->fd != -1)
1701 		close(nsd->xfrd_listener->fd);
1702 	if(del_db) {
1703 		/* recreate taskdb that xfrd was using, it may be corrupt */
1704 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1705 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1706 		nsd->task[1-nsd->mytask]->fname = NULL;
1707 		/* free alloc already, so udb does not shrink itself */
1708 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1709 		nsd->task[1-nsd->mytask]->alloc = NULL;
1710 		udb_base_free(nsd->task[1-nsd->mytask]);
1711 		/* create new file, overwrite the old one */
1712 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1713 		free(tmpfile);
1714 	}
1715 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1716 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1717 		return;
1718 	}
1719 	pid = fork();
1720 	switch (pid) {
1721 	case -1:
1722 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1723 		break;
1724 	default:
1725 		/* PARENT: close first socket, use second one */
1726 		close(sockets[0]);
1727 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1728 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1729 		}
1730 		if(del_db) xfrd_free_namedb(nsd);
1731 		/* use other task than I am using, since if xfrd died and is
1732 		 * restarted, the reload is using nsd->mytask */
1733 		nsd->mytask = 1 - nsd->mytask;
1734 
1735 #ifdef HAVE_SETPROCTITLE
1736 		setproctitle("xfrd");
1737 #endif
1738 #ifdef HAVE_CPUSET_T
1739 		if(nsd->use_cpu_affinity) {
1740 			set_cpu_affinity(nsd->xfrd_cpuset);
1741 		}
1742 #endif
1743 
1744 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1745 		/* ENOTREACH */
1746 		break;
1747 	case 0:
1748 		/* CHILD: close second socket, use first one */
1749 		close(sockets[1]);
1750 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1751 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1752 		}
1753 		nsd->xfrd_listener->fd = sockets[0];
1754 		break;
1755 	}
1756 	/* server-parent only */
1757 	nsd->xfrd_listener->timeout = NULL;
1758 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1759 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1760 	/* clear ongoing ipc reads */
1761 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1762 	data->conn->is_reading = 0;
1763 }
1764 
1765 /** add all soainfo to taskdb */
1766 static void
1767 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1768 {
1769 	struct radnode* n;
1770 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1771 	/* add all SOA INFO to mytask */
1772 	udb_ptr_init(&task_last, taskudb);
1773 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1774 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1775 	}
1776 	udb_ptr_unlink(&task_last, taskudb);
1777 }
1778 
1779 void
1780 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1781 {
1782 	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1783 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1784 	 *   then they exchange and process.
1785 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1786 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1787 	 *   expire notifications can be sent back via a normal reload later
1788 	 *   (xfrd will wait for current running reload to finish if any).
1789 	 */
1790 	sig_atomic_t cmd = 0;
1791 	pid_t mypid;
1792 	int xfrd_sock = nsd->xfrd_listener->fd;
1793 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1794 	udb_ptr t;
1795 	if(!shortsoa) {
1796 		if(nsd->signal_hint_shutdown) {
1797 		shutdown:
1798 			log_msg(LOG_WARNING, "signal received, shutting down...");
1799 			server_close_all_sockets(nsd->udp, nsd->ifs);
1800 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1801 			daemon_remote_close(nsd->rc);
1802 			/* Unlink it if possible... */
1803 			unlinkpid(nsd->pidfile);
1804 			unlink(nsd->task[0]->fname);
1805 			unlink(nsd->task[1]->fname);
1806 #ifdef USE_ZONE_STATS
1807 			unlink(nsd->zonestatfname[0]);
1808 			unlink(nsd->zonestatfname[1]);
1809 #endif
1810 #ifdef BIND8_STATS
1811 			server_stat_free(nsd);
1812 #endif
1813 			server_shutdown(nsd);
1814 			/* ENOTREACH */
1815 			exit(0);
1816 		}
1817 	}
1818 	if(shortsoa) {
1819 		/* put SOA in xfrd task because mytask may be in use */
1820 		taskudb = nsd->task[1-nsd->mytask];
1821 	}
1822 
1823 	add_all_soa_to_task(nsd, taskudb);
1824 	if(!shortsoa) {
1825 		/* wait for xfrd to signal task is ready, RELOAD signal */
1826 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1827 			cmd != NSD_RELOAD) {
1828 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1829 			exit(1);
1830 		}
1831 		if(nsd->signal_hint_shutdown) {
1832 			goto shutdown;
1833 		}
1834 	}
1835 	/* give xfrd our task, signal it with RELOAD_DONE */
1836 	task_process_sync(taskudb);
1837 	cmd = NSD_RELOAD_DONE;
1838 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1839 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1840 			(int)nsd->pid, strerror(errno));
1841 	}
1842 	mypid = getpid();
1843 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1844 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1845 			strerror(errno));
1846 	}
1847 
1848 	if(!shortsoa) {
1849 		/* process the xfrd task results (expiry data) */
1850 		nsd->mytask = 1 - nsd->mytask;
1851 		taskudb = nsd->task[nsd->mytask];
1852 		task_remap(taskudb);
1853 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1854 		while(!udb_ptr_is_null(&t)) {
1855 			task_process_expire(nsd->db, TASKLIST(&t));
1856 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1857 		}
1858 		udb_ptr_unlink(&t, taskudb);
1859 		task_clear(taskudb);
1860 
1861 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1862 		cmd = NSD_RELOAD_DONE;
1863 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1864 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1865 				(int)nsd->pid, strerror(errno));
1866 		}
1867 	}
1868 }
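
/*
 * The normal (!shortsoa) exchange above, shown as a message sequence
 * between the server parent and xfrd (reconstructed from the code):
 *
 *   parent                                 xfrd
 *     | fill mytask with SOA INFO           |
 *     |<------------- NSD_RELOAD -----------|  taskdb is ready
 *     | task_process_sync(taskudb)          |
 *     |------------ NSD_RELOAD_DONE ------->|  xfrd picks up taskudb
 *     |---------------- mypid ------------->|
 *     | swap mytask, process expiry tasks   |
 *     |------------ NSD_RELOAD_DONE ------->|  taskdb emptied
 */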
1869 
1870 #ifdef HAVE_SSL
1871 static void
1872 log_crypto_from_err(const char* str, unsigned long err)
1873 {
1874 	/* error:[error code]:[library name]:[function name]:[reason string] */
1875 	char buf[128];
1876 	unsigned long e;
1877 	ERR_error_string_n(err, buf, sizeof(buf));
1878 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1879 	while( (e=ERR_get_error()) ) {
1880 		ERR_error_string_n(e, buf, sizeof(buf));
1881 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1882 	}
1883 }
1884 
1885 void
1886 log_crypto_err(const char* str)
1887 {
1888 	log_crypto_from_err(str, ERR_get_error());
1889 }
1890 
1891 /** true if the ssl handshake error has to be squelched from the logs */
1892 static int
1893 squelch_err_ssl_handshake(unsigned long err)
1894 {
1895 	if(verbosity >= 3)
1896 		return 0; /* only squelch on low verbosity */
1897 	/* this is very specific, we could filter on ERR_GET_REASON()
1898 	 * (the third element in ERR_PACK) */
1899 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1900 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1901 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1902 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1903 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1904 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1905 #endif
1906 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1907 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1908 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1909 #  ifdef SSL_R_VERSION_TOO_LOW
1910 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1911 #  endif
1912 #endif
1913 		)
1914 		return 1;
1915 	return 0;
1916 }
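
/*
 * A sketch of how the squelch test is meant to be used in an accept
 * path (hypothetical caller; ERR_peek_error and ERR_clear_error are
 * standard OpenSSL calls):
 */
#if 0 /* illustration only */
	unsigned long err = ERR_peek_error();
	if(squelch_err_ssl_handshake(err))
		ERR_clear_error();	/* drop noise from scanners/probes */
	else
		log_crypto_from_err("ssl handshake failed", err);
#endif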
1917 
1918 void
1919 perform_openssl_init(void)
1920 {
1921 	/* init SSL library */
1922 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1923 	ERR_load_crypto_strings();
1924 #endif
1925 #if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
1926 	ERR_load_SSL_strings();
1927 #endif
1928 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1929 	OpenSSL_add_all_algorithms();
1930 #else
1931 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1932 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1933 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1934 #endif
1935 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1936 	(void)SSL_library_init();
1937 #else
1938 	OPENSSL_init_ssl(0, NULL);
1939 #endif
1940 
1941 	if(!RAND_status()) {
1942 		/* try to seed it */
1943 		unsigned char buf[256];
1944 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1945 		size_t i;
1946 		v = seed;
1947 		for(i=0; i<256/sizeof(v); i++) {
1948 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1949 			v = v*seed + (unsigned int)i;
1950 		}
1951 		RAND_seed(buf, 256);
1952 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1953 	}
1954 }
1955 
1956 static int
1957 get_ocsp(char *filename, unsigned char **ocsp)
1958 {
1959 	BIO *bio;
1960 	OCSP_RESPONSE *response;
1961 	int len = -1;
1962 	unsigned char *p, *buf;
1963 	assert(filename);
1964 
1965 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1966 		log_crypto_err("get_ocsp: BIO_new_file failed");
1967 		return -1;
1968 	}
1969 
1970 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1971 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1972 		BIO_free(bio);
1973 		return -1;
1974 	}
1975 
1976 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1977 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1978 		OCSP_RESPONSE_free(response);
1979 		BIO_free(bio);
1980 		return -1;
1981 	}
1982 
1983 	if ((buf = malloc((size_t) len)) == NULL) {
1984 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1985 		OCSP_RESPONSE_free(response);
1986 		BIO_free(bio);
1987 		return -1;
1988 	}
1989 
1990 	p = buf;
1991 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1992 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1993 		free(buf);
1994 		OCSP_RESPONSE_free(response);
1995 		BIO_free(bio);
1996 		return -1;
1997 	}
1998 
1999 	OCSP_RESPONSE_free(response);
2000 	BIO_free(bio);
2001 
2002 	*ocsp = buf;
2003 	return len;
2004 }
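
/*
 * get_ocsp() above uses the standard OpenSSL i2d two-pass idiom: call
 * i2d_TYPE(obj, NULL) to learn the DER length, allocate, then call it
 * again to fill the buffer.  The second call advances the output
 * pointer, which is why a copy (p) is passed rather than buf itself.
 * Generic sketch (TYPE stands in for a concrete ASN.1 type):
 */
#if 0 /* illustration only */
	int len = i2d_TYPE(obj, NULL);	/* pass 1: length only */
	unsigned char *buf = malloc(len), *p = buf;
	len = i2d_TYPE(obj, &p);	/* pass 2: fills buf, moves p */
#endif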
2005 
2006 /* further setup ssl ctx after the keys are loaded */
2007 static void
2008 listen_sslctx_setup_2(void* ctxt)
2009 {
2010 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
2011 	(void)ctx;
2012 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
2013 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
2014 		/* ENOTREACH */
2015 		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
2016 	}
2017 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
2018 	if(1) {
2019 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
2020 		if (!ecdh) {
2021 			log_crypto_err("could not find p256, not enabling ECDHE");
2022 		} else {
2023 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
2024 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
2025 			}
2026 			EC_KEY_free (ecdh);
2027 		}
2028 	}
2029 #endif
2030 }
2031 
2032 static int
2033 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
2034 {
2035 	if(ocspdata) {
2036 		unsigned char *p;
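		/* hand the library a fresh copy each time; on success
		 * OpenSSL takes ownership of p and frees it later
		 * (hence no free(p) on the success path) */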
2037 		if ((p=malloc(ocspdata_len)) == NULL) {
2038 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
2039 			return SSL_TLSEXT_ERR_NOACK;
2040 		}
2041 		memcpy(p, ocspdata, ocspdata_len);
2042 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
2043 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
2044 			free(p);
2045 			return SSL_TLSEXT_ERR_NOACK;
2046 		}
2047 		return SSL_TLSEXT_ERR_OK;
2048 	} else {
2049 		return SSL_TLSEXT_ERR_NOACK;
2050 	}
2051 }
2052 
2053 SSL_CTX*
2054 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
2055 {
2056 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
2057 	if(!ctx) {
2058 		log_crypto_err("could not SSL_CTX_new");
2059 		return NULL;
2060 	}
2061 	/* no SSLv2, SSLv3 because they have defects */
2062 #if SSL_OP_NO_SSLv2 != 0
2063 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
2064 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
2065 		SSL_CTX_free(ctx);
2066 		return NULL;
2067 	}
2068 #endif
2069 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
2070 		!= SSL_OP_NO_SSLv3){
2071 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
2072 		SSL_CTX_free(ctx);
2073 		return NULL;
2074 	}
2075 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
2076 	/* if we have tls 1.1 disable 1.0 */
2077 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
2078 		!= SSL_OP_NO_TLSv1){
2079 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
2080 		SSL_CTX_free(ctx);
2081 		return NULL;
2082 	}
2083 #endif
2084 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
2085 	/* if we have tls 1.2 disable 1.1 */
2086 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
2087 		!= SSL_OP_NO_TLSv1_1){
2088 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
2089 		SSL_CTX_free(ctx);
2090 		return NULL;
2091 	}
2092 #endif
2093 #if defined(SSL_OP_NO_RENEGOTIATION)
2094 	/* disable client renegotiation */
2095 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
2096 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
2097 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
2098 		SSL_CTX_free(ctx);
2099 		return NULL;
2100 	}
2101 #endif
2102 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2103 	/* if system-wide crypto policies are configured, defer to those */
2104 	if (access("/etc/crypto-policies/config", F_OK) != 0) {
2105 		/* if we have sha256, set the cipher list to have no known vulns */
2106 		if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2107 			log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2108 	}
2109 #endif
2110 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2111 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2112 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
2113 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2114 		SSL_CTX_free(ctx);
2115 		return NULL;
2116 	}
2117 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2118 	SSL_CTX_set_security_level(ctx, 0);
2119 #endif
2120 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2121 		log_msg(LOG_ERR, "error for cert file: %s", pem);
2122 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2123 		SSL_CTX_free(ctx);
2124 		return NULL;
2125 	}
2126 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2127 		log_msg(LOG_ERR, "error for private key file: %s", key);
2128 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2129 		SSL_CTX_free(ctx);
2130 		return NULL;
2131 	}
2132 	if(!SSL_CTX_check_private_key(ctx)) {
2133 		log_msg(LOG_ERR, "error for key file: %s", key);
2134 		log_crypto_err("Error in SSL_CTX check_private_key");
2135 		SSL_CTX_free(ctx);
2136 		return NULL;
2137 	}
2138 	listen_sslctx_setup_2(ctx);
2139 	if(verifypem && verifypem[0]) {
2140 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2141 			log_crypto_err("Error in SSL_CTX verify locations");
2142 			SSL_CTX_free(ctx);
2143 			return NULL;
2144 		}
2145 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2146 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2147 	}
2148 	return ctx;
2149 }
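
/*
 * Typical use of server_tls_ctx_setup(), as done by
 * server_tls_ctx_create() below; a minimal sketch with hypothetical
 * file names:
 */
#if 0 /* illustration only */
	SSL_CTX* ctx = server_tls_ctx_setup("/etc/nsd/tls.key",
		"/etc/nsd/tls.pem", NULL /* no client certificates */);
	if(!ctx)
		log_msg(LOG_ERR, "cannot setup TLS context");
#endif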
2150 
2151 SSL_CTX*
2152 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2153 {
2154 	char *key, *pem;
2155 	SSL_CTX *ctx;
2156 
2157 	key = nsd->options->tls_service_key;
2158 	pem = nsd->options->tls_service_pem;
2159 	if(!key || key[0] == 0) {
2160 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2161 		return NULL;
2162 	}
2163 	if(!pem || pem[0] == 0) {
2164 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2165 		return NULL;
2166 	}
2167 
2168 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting
2169 	 * SSL, but draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
2170 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2171 	if(!ctx) {
2172 		log_msg(LOG_ERR, "could not setup server TLS context");
2173 		return NULL;
2174 	}
2175 	if(ocspfile && ocspfile[0]) {
2176 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2177 			log_crypto_err("Error reading OCSPfile");
2178 			SSL_CTX_free(ctx);
2179 			return NULL;
2180 		} else {
2181 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2182 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2183 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2184 				SSL_CTX_free(ctx);
2185 				return NULL;
2186 			}
2187 		}
2188 	}
2189 	return ctx;
2190 }
2191 
2192 /* check if tcp_handler_accept_data created for TLS dedicated port */
2193 int
2194 using_tls_port(struct sockaddr* addr, const char* tls_port)
2195 {
2196 	in_port_t port = 0;
2197 
2198 	if (addr->sa_family == AF_INET)
2199 		port = ((struct sockaddr_in*)addr)->sin_port;
2200 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2201 	else
2202 		port = ((struct sockaddr_in6*)addr)->sin6_port;
2203 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2204 	if (atoi(tls_port) == ntohs(port))
2205 		return 1;
2206 
2207 	return 0;
2208 }
2209 #endif
2210 
2211 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
2212 ssize_t
2213 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2214 {
2215 	uint8_t* buf = (uint8_t*) p;
2216 	ssize_t total = 0;
2217 	struct pollfd fd;
2218 	memset(&fd, 0, sizeof(fd));
2219 	fd.fd = s;
2220 	fd.events = POLLIN;
2221 
2222 	while(total < sz) {
2223 		ssize_t ret;
2224 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2225 		if(ret == -1) {
2226 			if(errno == EAGAIN)
2227 				/* blocking read */
2228 				continue;
2229 			if(errno == EINTR) {
2230 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2231 					return -1;
2232 				/* other signals can be handled later */
2233 				continue;
2234 			}
2235 			/* some error */
2236 			return -1;
2237 		}
2238 		if(ret == 0) {
2239 			/* operation timed out */
2240 			return -2;
2241 		}
2242 		ret = read(s, buf+total, sz-total);
2243 		if(ret == -1) {
2244 			if(errno == EAGAIN)
2245 				/* blocking read */
2246 				continue;
2247 			if(errno == EINTR) {
2248 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2249 					return -1;
2250 				/* other signals can be handled later */
2251 				continue;
2252 			}
2253 			/* some error */
2254 			return -1;
2255 		}
2256 		if(ret == 0) {
2257 			/* closed connection! */
2258 			return 0;
2259 		}
2260 		total += ret;
2261 	}
2262 	return total;
2263 }
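
/*
 * Usage sketch for block_read(): read one IPC command with a 5 second
 * timeout, handling the size/0/-1/-2 return convention the way the
 * callers in this file do (read_one_command is hypothetical):
 */
#if 0 /* illustration only */
static int
read_one_command(struct nsd* nsd, int fd, sig_atomic_t* cmd)
{
	ssize_t r = block_read(nsd, fd, cmd, sizeof(*cmd), 5);
	if(r == -2) return 0;			/* timed out; retry later */
	if(r != (ssize_t)sizeof(*cmd)) return -1; /* error or closed */
	return 1;				/* got a complete command */
}
#endif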
2264 
2265 static void
2266 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2267 {
2268 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2269 	udb_ptr t, next;
2270 	udb_base* u = nsd->task[nsd->mytask];
2271 	udb_ptr_init(&next, u);
2272 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2273 	udb_base_set_userdata(u, 0);
2274 	while(!udb_ptr_is_null(&t)) {
2275 		/* store next in list so this one can be deleted or reused */
2276 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2277 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2278 
2279 		/* process task t */
2280 		/* append results for task t and update last_task */
2281 		task_process_in_reload(nsd, u, last_task, &t);
2282 
2283 		/* go to next */
2284 		udb_ptr_set_ptr(&t, u, &next);
2285 
2286 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2287 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2288 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2289 			if(cmd == NSD_QUIT) {
2290 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2291 				/* unlink files of remainder of tasks */
2292 				while(!udb_ptr_is_null(&t)) {
2293 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2294 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2295 					}
2296 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2297 				}
2298 				udb_ptr_unlink(&t, u);
2299 				udb_ptr_unlink(&next, u);
2300 				exit(0);
2301 			}
2302 		}
2303 
2304 	}
2305 	udb_ptr_unlink(&t, u);
2306 	udb_ptr_unlink(&next, u);
2307 }
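
/*
 * The list walk above follows the taskudb traversal pattern used
 * throughout this file (compare server_send_soa_xfrd); in its simplest
 * read-only form, with taskudb a struct udb_base*:
 */
#if 0 /* illustration only */
	udb_ptr t;
	udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
	while(!udb_ptr_is_null(&t)) {
		/* ... inspect TASKLIST(&t) ... */
		udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
	}
	udb_ptr_unlink(&t, taskudb);
#endif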
2308 
2309 void server_verify(struct nsd *nsd, int cmdsocket);
2310 
2311 /*
2312  * Reload the database, stop the parent, re-fork the children and
2313  * continue as server_main.
2314  */
2315 static void
2316 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2317 	int cmdsocket)
2318 {
2319 	pid_t mypid;
2320 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2321 	int ret;
2322 	udb_ptr last_task;
2323 	struct sigaction old_sigchld, ign_sigchld;
2324 	struct radnode* node;
2325 	zone_type* zone;
2326 	enum soainfo_hint hint;
2327 	/* ignore SIGCHLD from the previous server_main that used this pid */
2328 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2329 	ign_sigchld.sa_handler = SIG_IGN;
2330 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2331 
2332 #ifdef HAVE_SETPROCTITLE
2333 	setproctitle("main");
2334 #endif
2335 #ifdef HAVE_CPUSET_T
2336 	if(nsd->use_cpu_affinity) {
2337 		set_cpu_affinity(nsd->cpuset);
2338 	}
2339 #endif
2340 
2341 	/* see what tasks we got from xfrd */
2342 	task_remap(nsd->task[nsd->mytask]);
2343 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2344 	reload_process_tasks(nsd, &last_task, cmdsocket);
2345 
2346 #ifndef NDEBUG
2347 	if(nsd_debug_level >= 1)
2348 		region_log_stats(nsd->db->region);
2349 #endif /* NDEBUG */
2350 	initialize_dname_compression_tables(nsd);
2351 
2352 #ifdef BIND8_STATS
2353 	/* Restart dumping stats if required.  */
2354 	time(&nsd->st->boot);
2355 	set_bind8_alarm(nsd);
2356 	/* Switch to a different set of stat arrays for the new server
2357 	 * processes, because they can briefly coexist with the old
2358 	 * processes and must have their own stat structures. */
2359 	nsd->stat_current = (nsd->stat_current==0?1:0);
2360 #endif
2361 #ifdef USE_ZONE_STATS
2362 	server_zonestat_realloc(nsd); /* realloc for new children */
2363 	server_zonestat_switch(nsd);
2364 #endif
2365 
2366 	if(nsd->options->verify_enable) {
2367 #ifdef RATELIMIT
2368 		/* allocate resources for rate limiting. use a slot that is guaranteed
2369 		   not to be mapped to a file so no persistent data is overwritten */
2370 		rrl_init(nsd->child_count + 1);
2371 #endif
2372 
2373 		/* spin-up server and execute verifiers for each zone */
2374 		server_verify(nsd, cmdsocket);
2375 #ifdef RATELIMIT
2376 		/* deallocate rate limiting resources */
2377 		rrl_deinit(nsd->child_count + 1);
2378 #endif
2379 	}
2380 
2381 	for(node = radix_first(nsd->db->zonetree);
2382 	    node != NULL;
2383 	    node = radix_next(node))
2384 	{
2385 		zone = (zone_type *)node->elem;
2386 		if(zone->is_updated) {
2387 			if(zone->is_bad) {
2388 				nsd->mode = NSD_RELOAD_FAILED;
2389 				hint = soainfo_bad;
2390 			} else {
2391 				hint = soainfo_ok;
2392 			}
2393 			/* update(s), verified or not, possibly with subsequent
2394 			   skipped update(s). skipped update(s) are picked up
2395 			   by failed update check in xfrd */
2396 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2397 			                 zone, hint);
2398 		} else if(zone->is_skipped) {
2399 			/* corrupt or inconsistent update without preceding
2400 			   update(s), communicate soainfo_gone */
2401 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2402 			                 zone, soainfo_gone);
2403 		}
2404 		zone->is_updated = 0;
2405 		zone->is_skipped = 0;
2406 	}
2407 
2408 	if(nsd->mode == NSD_RELOAD_FAILED) {
2409 		exit(NSD_RELOAD_FAILED);
2410 	}
2411 
2412 	/* listen for the signals of failed children again */
2413 	sigaction(SIGCHLD, &old_sigchld, NULL);
2414 #ifdef USE_DNSTAP
2415 	if (nsd->dt_collector) {
2416 		int *swap_fd_send;
2417 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2418 		/* Swap fd_send with fd_swap so old serve children and new serve
2419 		 * children will not write to the same pipe ends simultaneously */
2420 		swap_fd_send = nsd->dt_collector_fd_send;
2421 		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2422 		nsd->dt_collector_fd_swap = swap_fd_send;
2423 
2424 	}
2425 #endif
2426 	/* Start new child processes */
2427 	if (server_start_children(nsd, server_region, netio, &nsd->
2428 		xfrd_listener->fd) != 0) {
2429 		send_children_quit(nsd);
2430 		exit(1);
2431 	}
2432 
2433 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2434 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2435 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2436 		if(cmd == NSD_QUIT) {
2437 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2438 			send_children_quit(nsd);
2439 			exit(0);
2440 		}
2441 	}
2442 
2443 	/* Send quit command to parent: blocking, wait for receipt. */
2444 	do {
2445 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2446 		cmd = NSD_QUIT_SYNC;
2447 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2448 		{
2449 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2450 				strerror(errno));
2451 		}
2452 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2453 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2454 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2455 			RELOAD_SYNC_TIMEOUT);
2456 		if(ret == -2) {
2457 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2458 		}
2459 	} while (ret == -2);
2460 	if(ret == -1) {
2461 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2462 			strerror(errno));
2463 	}
2464 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2465 	if(cmd == NSD_QUIT) {
2466 		/* small race condition possible here, parent got quit cmd. */
2467 		send_children_quit(nsd);
2468 		exit(1);
2469 	}
2470 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2471 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2472 	task_process_sync(nsd->task[nsd->mytask]);
2473 #ifdef USE_ZONE_STATS
2474 	server_zonestat_realloc(nsd); /* realloc for next children */
2475 #endif
2476 
2477 	/* send soainfo to the xfrd process, signal it that reload is done,
2478 	 * it picks up the taskudb */
2479 	cmd = NSD_RELOAD_DONE;
2480 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2481 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2482 			strerror(errno));
2483 	}
2484 	mypid = getpid();
2485 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2486 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2487 			strerror(errno));
2488 	}
2489 
2490 	/* try to reopen file */
2491 	if (nsd->file_rotation_ok)
2492 		log_reopen(nsd->log_filename, 1);
2493 	/* exit reload, continue as new server_main */
2494 }
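
/*
 * The hand-over at the end of server_reload, as a message sequence
 * (reconstructed from the code above; RELOAD_SYNC_TIMEOUT is 25s):
 *
 *   reload (new main)                     old main
 *     |------------ NSD_QUIT_SYNC ------->|
 *     |<------------- NSD_RELOAD ---------|  ack, old main quits
 *     |   (retry on -2 timeout)           |
 *
 *   reload (new main)                     xfrd
 *     |----------- NSD_RELOAD_DONE ------>|  picks up the taskudb
 *     |--------------- mypid ------------>|
 */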
2495 
2496 /*
2497  * Get the mode depending on the signal hints that have been received.
2498  * Multiple signal hints can be received and will be handled in turn.
2499  */
2500 static sig_atomic_t
2501 server_signal_mode(struct nsd *nsd)
2502 {
2503 	if(nsd->signal_hint_quit) {
2504 		nsd->signal_hint_quit = 0;
2505 		return NSD_QUIT;
2506 	}
2507 	else if(nsd->signal_hint_shutdown) {
2508 		nsd->signal_hint_shutdown = 0;
2509 		return NSD_SHUTDOWN;
2510 	}
2511 	else if(nsd->signal_hint_child) {
2512 		nsd->signal_hint_child = 0;
2513 		return NSD_REAP_CHILDREN;
2514 	}
2515 	else if(nsd->signal_hint_reload) {
2516 		nsd->signal_hint_reload = 0;
2517 		return NSD_RELOAD;
2518 	}
2519 	else if(nsd->signal_hint_reload_hup) {
2520 		nsd->signal_hint_reload_hup = 0;
2521 		return NSD_RELOAD_REQ;
2522 	}
2523 	else if(nsd->signal_hint_stats) {
2524 		nsd->signal_hint_stats = 0;
2525 #ifdef BIND8_STATS
2526 		set_bind8_alarm(nsd);
2527 #endif
2528 		return NSD_STATS;
2529 	}
2530 	else if(nsd->signal_hint_statsusr) {
2531 		nsd->signal_hint_statsusr = 0;
2532 		return NSD_STATS;
2533 	}
2534 	return NSD_RUN;
2535 }
2536 
2537 /*
2538  * The main server simply waits for signals and child processes to
2539  * terminate.  Child processes are restarted as necessary.
2540  */
2541 void
2542 server_main(struct nsd *nsd)
2543 {
2544 	region_type *server_region = region_create(xalloc, free);
2545 	netio_type *netio = netio_create(server_region);
2546 	netio_handler_type reload_listener;
2547 	int reload_sockets[2] = {-1, -1};
2548 	struct timespec timeout_spec;
2549 	int status;
2550 	pid_t child_pid;
2551 	pid_t reload_pid = -1;
2552 	sig_atomic_t mode;
2553 
2554 	/* Ensure we are the main process */
2555 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2556 
2557 	/* Add listener for the XFRD process */
2558 	netio_add_handler(netio, nsd->xfrd_listener);
2559 
2560 #ifdef BIND8_STATS
2561 	nsd->st = &nsd->stat_map[0];
2562 	nsd->st->db_disk = 0;
2563 	nsd->st->db_mem = region_get_mem(nsd->db->region);
2564 #endif
2565 
2566 	/* Start the child processes that handle incoming queries */
2567 	if (server_start_children(nsd, server_region, netio,
2568 		&nsd->xfrd_listener->fd) != 0) {
2569 		send_children_quit(nsd);
2570 		exit(1);
2571 	}
2572 	reload_listener.fd = -1;
2573 
2574 	/* This_child MUST be 0, because this is the parent process */
2575 	assert(nsd->this_child == 0);
2576 
2577 	/* Run the server until we get a shutdown signal */
2578 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2579 		/* Did we receive a signal that changes our mode? */
2580 		if(mode == NSD_RUN) {
2581 			nsd->mode = mode = server_signal_mode(nsd);
2582 		}
2583 
2584 		switch (mode) {
2585 		case NSD_RUN:
2586 			/* see if any child processes terminated */
2587 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2588 				int is_child = delete_child_pid(nsd, child_pid);
2589 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2590 					if(nsd->children[is_child].child_fd == -1)
2591 						nsd->children[is_child].has_exited = 1;
2592 					parent_check_all_children_exited(nsd);
2593 				} else if(is_child != -1) {
2594 					log_msg(LOG_WARNING,
2595 					       "server %d died unexpectedly with status %d, restarting",
2596 					       (int) child_pid, status);
2597 					restart_child_servers(nsd, server_region, netio,
2598 						&nsd->xfrd_listener->fd);
2599 				} else if (child_pid == reload_pid) {
2600 					sig_atomic_t cmd = NSD_RELOAD_FAILED;
2601 					pid_t mypid;
2602 					log_msg(LOG_WARNING,
2603 					       "Reload process %d failed with status %d, continuing with old database",
2604 					       (int) child_pid, status);
2605 					reload_pid = -1;
2606 					if(reload_listener.fd != -1) close(reload_listener.fd);
2607 					netio_remove_handler(netio, &reload_listener);
2608 					reload_listener.fd = -1;
2609 					reload_listener.event_types = NETIO_EVENT_NONE;
2610 					task_process_sync(nsd->task[nsd->mytask]);
2611 					/* inform xfrd reload attempt ended */
2612 					if(!write_socket(nsd->xfrd_listener->fd,
2613 						&cmd, sizeof(cmd))) {
2614 						log_msg(LOG_ERR, "problems "
2615 						  "sending SOAEND to xfrd: %s",
2616 						  strerror(errno));
2617 					}
2618 					mypid = getpid();
2619 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2620 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2621 							strerror(errno));
2622 					}
2623 #ifdef USE_DNSTAP
2624 				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2625 					log_msg(LOG_WARNING,
2626 					       "dnstap-collector %d terminated with status %d",
2627 					       (int) child_pid, status);
2628 					if(nsd->dt_collector) {
2629 						dt_collector_close(nsd->dt_collector, nsd);
2630 						dt_collector_destroy(nsd->dt_collector, nsd);
2631 						nsd->dt_collector = NULL;
2632 					}
2633 					/* Only respawn a crashed (or exited)
2634 					 * dnstap-collector when not reloading,
2635 					 * to not induce a reload during a
2636 					 * reload (which would seriously
2637 					 * disrupt nsd procedures and lead to
2638 					 * unpredictable results)!
2639 					 *
2640 					 * This will *leave* a dnstap-collector
2641 					 * process terminated, but because
2642 					 * signalling of the reload process to
2643 					 * the main process to respawn in this
2644 					 * situation will be cumbersome, and
2645 					 * because this situation is so
2646 					 * specific (and therefore hopefully
2647 					 * extremely rare or non-existing at
2648 					 * all), plus the fact that we are left
2649 					 * with a perfectly function NSD
2650 					 * with a perfectly functioning NSD
2651 					 * messages), I consider it acceptable
2652 					 * to leave this unresolved.
2653 					 */
2654 					if(reload_pid == -1 && nsd->options->dnstap_enable) {
2655 						nsd->dt_collector = dt_collector_create(nsd);
2656 						dt_collector_start(nsd->dt_collector, nsd);
2657 						nsd->mode = NSD_RELOAD_REQ;
2658 					}
2659 #endif
2660 				} else if(status != 0) {
2661 					/* check the status, because we also
2662 					 * reap the old server main here
2663 					 * (reload is the process parent of
2664 					 * old main) and older server
2665 					 * processes that exit after a reload */
2666 					log_msg(LOG_WARNING,
2667 					       "process %d terminated with status %d",
2668 					       (int) child_pid, status);
2669 				}
2670 			}
2671 			if (child_pid == -1) {
2672 				if (errno == EINTR) {
2673 					continue;
2674 				}
2675 				if (errno != ECHILD)
2676 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2677 			}
2678 			if (nsd->mode != NSD_RUN)
2679 				break;
2680 
2681 			/* timeout to collect processes. In case no sigchild happens. */
2682 			timeout_spec.tv_sec = 60;
2683 			timeout_spec.tv_nsec = 0;
2684 
2685 			/* listen on ports, timeout for collecting terminated children */
2686 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2687 				if (errno != EINTR) {
2688 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2689 				}
2690 			}
2691 			if(nsd->restart_children) {
2692 				restart_child_servers(nsd, server_region, netio,
2693 					&nsd->xfrd_listener->fd);
2694 				nsd->restart_children = 0;
2695 			}
2696 			if(nsd->reload_failed) {
2697 				sig_atomic_t cmd = NSD_RELOAD_FAILED;
2698 				pid_t mypid;
2699 				nsd->reload_failed = 0;
2700 				log_msg(LOG_WARNING,
2701 				       "Reload process %d failed, continuing with old database",
2702 				       (int) reload_pid);
2703 				reload_pid = -1;
2704 				if(reload_listener.fd != -1) close(reload_listener.fd);
2705 				netio_remove_handler(netio, &reload_listener);
2706 				reload_listener.fd = -1;
2707 				reload_listener.event_types = NETIO_EVENT_NONE;
2708 				task_process_sync(nsd->task[nsd->mytask]);
2709 				/* inform xfrd reload attempt ended */
2710 				if(!write_socket(nsd->xfrd_listener->fd,
2711 					&cmd, sizeof(cmd))) {
2712 					log_msg(LOG_ERR, "problems "
2713 					  "sending SOAEND to xfrd: %s",
2714 					  strerror(errno));
2715 				}
2716 				mypid = getpid();
2717 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2718 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2719 						strerror(errno));
2720 				}
2721 			}
2722 
2723 			break;
2724 		case NSD_RELOAD_REQ: {
2725 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2726 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2727 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2728 				"main: ipc send reload_req to xfrd"));
2729 			if(!write_socket(nsd->xfrd_listener->fd,
2730 				&cmd, sizeof(cmd))) {
2731 				log_msg(LOG_ERR, "server_main: could not send "
2732 				"reload_req to xfrd: %s", strerror(errno));
2733 			}
2734 			nsd->mode = NSD_RUN;
2735 			} break;
2736 		case NSD_RELOAD:
2737 			/* Continue to run nsd after reload */
2738 			nsd->mode = NSD_RUN;
2739 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2740 			if (reload_pid != -1) {
2741 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2742 				       (int) reload_pid);
2743 				break;
2744 			}
2745 
2746 			/* switch mytask to keep track of who owns the taskdb */
2747 			nsd->mytask = 1 - nsd->mytask;
2748 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2749 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2750 				reload_pid = -1;
2751 				break;
2752 			}
2753 
2754 			/* Do actual reload */
2755 			reload_pid = fork();
2756 			switch (reload_pid) {
2757 			case -1:
2758 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2759 				break;
2760 			default:
2761 				/* PARENT */
2762 				close(reload_sockets[0]);
2763 				server_reload(nsd, server_region, netio,
2764 					reload_sockets[1]);
2765 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2766 				close(reload_sockets[1]);
2767 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2768 				/* drop stale xfrd ipc data */
2769 				((struct ipc_handler_conn_data*)nsd->
2770 					xfrd_listener->user_data)
2771 					->conn->is_reading = 0;
2772 				reload_pid = -1;
2773 				reload_listener.fd = -1;
2774 				reload_listener.event_types = NETIO_EVENT_NONE;
2775 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2776 				break;
2777 			case 0:
2778 				/* CHILD */
2779 				/* server_main keep running until NSD_QUIT_SYNC
2780 				 * received from reload. */
2781 				close(reload_sockets[1]);
2782 				reload_listener.fd = reload_sockets[0];
2783 				reload_listener.timeout = NULL;
2784 				reload_listener.user_data = nsd;
2785 				reload_listener.event_types = NETIO_EVENT_READ;
2786 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2787 				netio_add_handler(netio, &reload_listener);
2788 				reload_pid = getppid();
2789 				break;
2790 			}
2791 			break;
2792 		case NSD_QUIT_SYNC:
2793 			/* synchronisation of xfrd, parent and reload */
2794 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2795 				sig_atomic_t cmd = NSD_RELOAD;
2796 				/* stop xfrd ipc writes in progress */
2797 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2798 					"main: ipc send indication reload"));
2799 				if(!write_socket(nsd->xfrd_listener->fd,
2800 					&cmd, sizeof(cmd))) {
2801 					log_msg(LOG_ERR, "server_main: could not send reload "
2802 					"indication to xfrd: %s", strerror(errno));
2803 				}
2804 				/* wait for ACK from xfrd */
2805 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2806 				nsd->quit_sync_done = 1;
2807 			}
2808 			nsd->mode = NSD_RUN;
2809 			break;
2810 		case NSD_QUIT:
2811 			/* silent shutdown during reload */
2812 			if(reload_listener.fd != -1) {
2813 				/* acknowledge the quit, to sync reload that we will really quit now */
2814 				sig_atomic_t cmd = NSD_RELOAD;
2815 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2816 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2817 					log_msg(LOG_ERR, "server_main: "
2818 						"could not ack quit: %s", strerror(errno));
2819 				}
2820 				close(reload_listener.fd);
2821 			}
2822 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2823 			/* only quit children after xfrd has acked */
2824 			send_children_quit(nsd);
2825 
2826 #ifdef MEMCLEAN /* OS collects memory pages */
2827 			region_destroy(server_region);
2828 #endif
2829 			server_shutdown(nsd);
2830 
2831 			/* ENOTREACH */
2832 			break;
2833 		case NSD_SHUTDOWN:
2834 			break;
2835 		case NSD_REAP_CHILDREN:
2836 			/* continue; wait for child in run loop */
2837 			nsd->mode = NSD_RUN;
2838 			break;
2839 		case NSD_STATS:
2840 #ifdef BIND8_STATS
2841 			set_children_stats(nsd);
2842 #endif
2843 			nsd->mode = NSD_RUN;
2844 			break;
2845 		default:
2846 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2847 			nsd->mode = NSD_RUN;
2848 			break;
2849 		}
2850 	}
2851 	log_msg(LOG_WARNING, "signal received, shutting down...");
2852 
2853 	/* close opened ports to avoid race with restart of nsd */
2854 	server_close_all_sockets(nsd->udp, nsd->ifs);
2855 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2856 	daemon_remote_close(nsd->rc);
2857 	send_children_quit_and_wait(nsd);
2858 
2859 	/* Unlink it if possible... */
2860 	unlinkpid(nsd->pidfile);
2861 	unlink(nsd->task[0]->fname);
2862 	unlink(nsd->task[1]->fname);
2863 #ifdef USE_ZONE_STATS
2864 	unlink(nsd->zonestatfname[0]);
2865 	unlink(nsd->zonestatfname[1]);
2866 #endif
2867 #ifdef BIND8_STATS
2868 	server_stat_free(nsd);
2869 #endif
2870 #ifdef USE_DNSTAP
2871 	dt_collector_close(nsd->dt_collector, nsd);
2872 #endif
2873 
2874 	if(reload_listener.fd != -1) {
2875 		sig_atomic_t cmd = NSD_QUIT;
2876 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2877 			"main: ipc send quit to reload-process"));
2878 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2879 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2880 				strerror(errno));
2881 		}
2882 		fsync(reload_listener.fd);
2883 		close(reload_listener.fd);
2884 		/* wait for reload to finish processing */
2885 		while(1) {
2886 			if(waitpid(reload_pid, NULL, 0) == -1) {
2887 				if(errno == EINTR) continue;
2888 				if(errno == ECHILD) break;
2889 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2890 					(int)reload_pid, strerror(errno));
2891 			}
2892 			break;
2893 		}
2894 	}
2895 	if(nsd->xfrd_listener->fd != -1) {
2896 		/* complete quit, stop xfrd */
2897 		sig_atomic_t cmd = NSD_QUIT;
2898 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2899 			"main: ipc send quit to xfrd"));
2900 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2901 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2902 				strerror(errno));
2903 		}
2904 		fsync(nsd->xfrd_listener->fd);
2905 		close(nsd->xfrd_listener->fd);
2906 		(void)kill(nsd->pid, SIGTERM);
2907 	}
2908 
2909 #ifdef MEMCLEAN /* OS collects memory pages */
2910 	region_destroy(server_region);
2911 #endif
2912 	server_shutdown(nsd);
2913 }
2914 
2915 static query_state_type
2916 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
2917 {
2918 	return query_process(query, nsd, now_p);
2919 }
2920 
2921 static query_state_type
2922 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
2923 {
2924 #ifdef RATELIMIT
2925 	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
2926 		if(query->edns.cookie_status != COOKIE_VALID
2927 		&& query->edns.cookie_status != COOKIE_VALID_REUSE
2928 		&& rrl_process_query(query))
2929 			return rrl_slip(query);
2930 		else	return QUERY_PROCESSED;
2931 	}
2932 	return QUERY_DISCARDED;
2933 #else
2934 	return query_process(query, nsd, now_p);
2935 #endif
2936 }
2937 
2938 const char*
2939 nsd_event_vs(void)
2940 {
2941 #ifdef USE_MINI_EVENT
2942 	return "";
2943 #else
2944 	return event_get_version();
2945 #endif
2946 }
2947 
2948 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2949 static const char* ub_ev_backend2str(int b)
2950 {
2951 	switch(b) {
2952 	case EVBACKEND_SELECT:	return "select";
2953 	case EVBACKEND_POLL:	return "poll";
2954 	case EVBACKEND_EPOLL:	return "epoll";
2955 	case EVBACKEND_KQUEUE:	return "kqueue";
2956 	case EVBACKEND_DEVPOLL: return "devpoll";
2957 	case EVBACKEND_PORT:	return "evport";
2958 	}
2959 	return "unknown";
2960 }
2961 #endif
2962 
2963 const char*
2964 nsd_event_method(void)
2965 {
2966 #ifdef USE_MINI_EVENT
2967 	return "select";
2968 #else
2969 	struct event_base* b = nsd_child_event_base();
2970 	const char* m;
2971 #  ifdef EV_FEATURE_BACKENDS
2972 	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
2973 #  elif defined(HAVE_EVENT_BASE_GET_METHOD)
2974 	m = event_base_get_method(b);
2975 #  else
2976 	m = "?";
2977 #  endif
2978 #  ifdef MEMCLEAN
2979 	event_base_free(b);
2980 #  endif
2981 	return m;
2982 #endif
2983 }
2984 
2985 struct event_base*
2986 nsd_child_event_base(void)
2987 {
2988 	struct event_base* base;
2989 #ifdef USE_MINI_EVENT
2990 	static time_t secs;
2991 	static struct timeval now;
2992 	base = event_init(&secs, &now);
2993 #else
2994 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2995 	/* libev */
2996 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2997 #  else
2998 	/* libevent */
2999 #    ifdef HAVE_EVENT_BASE_NEW
3000 	base = event_base_new();
3001 #    else
3002 	base = event_init();
3003 #    endif
3004 #  endif
3005 #endif
3006 	return base;
3007 }
3008 
3009 static void
3010 add_udp_handler(
3011 	struct nsd *nsd,
3012 	struct nsd_socket *sock,
3013 	struct udp_handler_data *data)
3014 {
3015 	struct event *handler = &data->event;
3016 
3017 	data->nsd = nsd;
3018 	data->socket = sock;
3019 
3020 	if(nsd->options->proxy_protocol_port &&
3021 		sockaddr_uses_proxy_protocol_port(nsd->options,
3022 		(struct sockaddr *)&sock->addr.ai_addr)) {
3023 		data->pp2_enabled = 1;
3024 	}
3025 
3026 	memset(handler, 0, sizeof(*handler));
3027 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
3028 	if(event_base_set(nsd->event_base, handler) != 0)
3029 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
3030 	if(event_add(handler, NULL) != 0)
3031 		log_msg(LOG_ERR, "nsd udp: event_add failed");
3032 }
3033 
3034 void
3035 add_tcp_handler(
3036 	struct nsd *nsd,
3037 	struct nsd_socket *sock,
3038 	struct tcp_accept_handler_data *data)
3039 {
3040 	struct event *handler = &data->event;
3041 
3042 	data->nsd = nsd;
3043 	data->socket = sock;
3044 
3045 	if(nsd->options->proxy_protocol_port &&
3046 		sockaddr_uses_proxy_protocol_port(nsd->options,
3047 		(struct sockaddr *)&sock->addr.ai_addr)) {
3048 		data->pp2_enabled = 1;
3049 	}
3050 
3051 #ifdef HAVE_SSL
3052 	if (nsd->tls_ctx &&
3053 	    nsd->options->tls_port &&
3054 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
3055 	{
3056 		data->tls_accept = 1;
3057 		if(verbosity >= 4) {
3058 			char buf[48];
3059 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
3060 			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
3061 		}
3062 	} else {
3063 		data->tls_accept = 0;
3064 	}
3065 #endif
3066 
3067 	memset(handler, 0, sizeof(*handler));
3068 	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
3069 	if(event_base_set(nsd->event_base, handler) != 0)
3070 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
3071 	if(event_add(handler, NULL) != 0)
3072 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
3073 	data->event_added = 1;
3074 }
3075 
3076 /*
3077  * Serve DNS request to verifiers (short-lived)
3078  */
3079 void server_verify(struct nsd *nsd, int cmdsocket)
3080 {
3081 	size_t size = 0;
3082 	struct event cmd_event, signal_event, exit_event;
3083 	struct zone *zone;
3084 
3085 	assert(nsd != NULL);
3086 
3087 	zone = verify_next_zone(nsd, NULL);
3088 	if(zone == NULL)
3089 		return;
3090 
3091 	nsd->server_region = region_create(xalloc, free);
3092 	nsd->event_base = nsd_child_event_base();
3093 
3094 	nsd->next_zone_to_verify = zone;
3095 	nsd->verifier_count = 0;
3096 	nsd->verifier_limit = nsd->options->verifier_count;
3097 	size = sizeof(struct verifier) * nsd->verifier_limit;
3098 	if(pipe(nsd->verifier_pipe) == -1) {
3099 		log_msg(LOG_ERR, "verify: could not create pipe: %s",
3100 				strerror(errno));
3101 		goto fail_pipe;
3102 	}
3103 	fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC);
3104 	fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC);
3105 	nsd->verifiers = region_alloc_zero(nsd->server_region, size);
3106 
3107 	for(size_t i = 0; i < nsd->verifier_limit; i++) {
3108 		nsd->verifiers[i].nsd = nsd;
3109 		nsd->verifiers[i].zone = NULL;
3110 		nsd->verifiers[i].pid = -1;
3111 		nsd->verifiers[i].output_stream.fd = -1;
3112 		nsd->verifiers[i].output_stream.priority = LOG_INFO;
3113 		nsd->verifiers[i].error_stream.fd = -1;
3114 		nsd->verifiers[i].error_stream.priority = LOG_ERR;
3115 	}
3116 
3117 	event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd);
3118 	if(event_base_set(nsd->event_base, &cmd_event) != 0 ||
3119 	   event_add(&cmd_event, NULL) != 0)
3120 	{
3121 		log_msg(LOG_ERR, "verify: could not add command event");
3122 		goto fail;
3123 	}
3124 
3125 	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd);
3126 	if(event_base_set(nsd->event_base, &signal_event) != 0 ||
3127 	   signal_add(&signal_event, NULL) != 0)
3128 	{
3129 		log_msg(LOG_ERR, "verify: could not add signal event");
3130 		goto fail;
3131 	}
3132 
3133 	event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd);
3134 	if(event_base_set(nsd->event_base, &exit_event) != 0 ||
3135 	   event_add(&exit_event, NULL) != 0)
3136 	{
3137 		log_msg(LOG_ERR, "verify: could not add exit event");
3138 		goto fail;
3139 	}
3140 
3141 	memset(msgs, 0, sizeof(msgs));
3142 	for (int i = 0; i < NUM_RECV_PER_SELECT; i++) {
3143 		queries[i] = query_create(nsd->server_region,
3144 			compressed_dname_offsets,
3145 			compression_table_size, compressed_dnames);
3146 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3147 		iovecs[i].iov_base = buffer_begin(queries[i]->packet);
3148 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3149 		msgs[i].msg_hdr.msg_iov = &iovecs[i];
3150 		msgs[i].msg_hdr.msg_iovlen = 1;
3151 		msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr;
3152 		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3153 	}
3154 
3155 	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3156 		struct udp_handler_data *data;
3157 		data = region_alloc_zero(
3158 			nsd->server_region, sizeof(*data));
3159 		add_udp_handler(nsd, &nsd->verify_udp[i], data);
3160 	}
3161 
3162 	tcp_accept_handler_count = nsd->verify_ifs;
3163 	tcp_accept_handlers = region_alloc_array(nsd->server_region,
3164 		nsd->verify_ifs, sizeof(*tcp_accept_handlers));
3165 
3166 	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3167 		struct tcp_accept_handler_data *data;
3168 		data = &tcp_accept_handlers[i];
3169 		memset(data, 0, sizeof(*data));
3170 		add_tcp_handler(nsd, &nsd->verify_tcp[i], data);
3171 	}
3172 
3173 	while(nsd->next_zone_to_verify != NULL &&
3174 	      nsd->verifier_count < nsd->verifier_limit)
3175 	{
3176 		verify_zone(nsd, nsd->next_zone_to_verify);
3177 		nsd->next_zone_to_verify
3178 			= verify_next_zone(nsd, nsd->next_zone_to_verify);
3179 	}
3180 
3181 	/* short-lived main loop */
3182 	event_base_dispatch(nsd->event_base);
3183 
3184 	/* remove command and exit event handlers */
3185 	event_del(&exit_event);
3186 	event_del(&signal_event);
3187 	event_del(&cmd_event);
3188 
3189 	assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT);
3190 	assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT);
3191 fail:
3192 	close(nsd->verifier_pipe[0]);
3193 	close(nsd->verifier_pipe[1]);
3194 fail_pipe:
3195 	event_base_free(nsd->event_base);
3196 	region_destroy(nsd->server_region);
3197 
3198 	nsd->event_base = NULL;
3199 	nsd->server_region = NULL;
3200 	nsd->verifier_limit = 0;
3201 	nsd->verifier_pipe[0] = -1;
3202 	nsd->verifier_pipe[1] = -1;
3203 	nsd->verifiers = NULL;
3204 }
3205 
3206 /*
3207  * Serve DNS requests.
3208  */
3209 void
3210 server_child(struct nsd *nsd)
3211 {
3212 	size_t i, from, numifs;
3213 	region_type *server_region = region_create(xalloc, free);
3214 	struct event_base* event_base = nsd_child_event_base();
3215 	sig_atomic_t mode;
3216 
3217 	if(!event_base) {
3218 		log_msg(LOG_ERR, "nsd server could not create event base");
3219 		exit(1);
3220 	}
3221 	nsd->event_base = event_base;
3222 	nsd->server_region = server_region;
3223 
3224 #ifdef RATELIMIT
3225 	rrl_init(nsd->this_child->child_num);
3226 #endif
3227 
3228 	assert(nsd->server_kind != NSD_SERVER_MAIN);
3229 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
3230 
3231 #ifdef HAVE_SETPROCTITLE
3232 	setproctitle("server %d", nsd->this_child->child_num + 1);
3233 #endif
3234 #ifdef HAVE_CPUSET_T
3235 	if(nsd->use_cpu_affinity) {
3236 		set_cpu_affinity(nsd->this_child->cpuset);
3237 	}
3238 #endif
3239 #ifdef BIND8_STATS
3240 	nsd->st = &nsd->stats_per_child[nsd->stat_current]
3241 		[nsd->this_child->child_num];
3242 	nsd->st->boot = nsd->stat_map[0].boot;
3243 	memcpy(&nsd->stat_proc, nsd->st, sizeof(nsd->stat_proc));
3244 #endif
3245 
3246 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
3247 		server_close_all_sockets(nsd->tcp, nsd->ifs);
3248 	}
3249 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
3250 		server_close_all_sockets(nsd->udp, nsd->ifs);
3251 	}
3252 
3253 	if (nsd->this_child->parent_fd != -1) {
3254 		struct event *handler;
3255 		struct ipc_handler_conn_data* user_data =
3256 			(struct ipc_handler_conn_data*)region_alloc(
3257 			server_region, sizeof(struct ipc_handler_conn_data));
3258 		user_data->nsd = nsd;
3259 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
3260 
3261 		handler = (struct event*) region_alloc(
3262 			server_region, sizeof(*handler));
3263 		memset(handler, 0, sizeof(*handler));
3264 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
3265 			EV_READ, child_handle_parent_command, user_data);
3266 		if(event_base_set(event_base, handler) != 0)
3267 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
3268 		if(event_add(handler, NULL) != 0)
3269 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
3270 	}
3271 
3272 	if(nsd->reuseport) {
3273 		numifs = nsd->ifs / nsd->reuseport;
3274 		from = numifs * nsd->this_child->child_num;
3275 		if(from+numifs > nsd->ifs) { /* should not happen */
3276 			from = 0;
3277 			numifs = nsd->ifs;
3278 		}
3279 	} else {
3280 		from = 0;
3281 		numifs = nsd->ifs;
3282 	}
3283 
3284 	if (nsd->server_kind & NSD_SERVER_UDP) {
3285 		int child = nsd->this_child->child_num;
3286 		memset(msgs, 0, sizeof(msgs));
3287 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3288 			queries[i] = query_create(server_region,
3289 				compressed_dname_offsets,
3290 				compression_table_size, compressed_dnames);
3291 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3292 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
3293 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
3294 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
3295 			msgs[i].msg_hdr.msg_iovlen  = 1;
3296 			msgs[i].msg_hdr.msg_name    = &queries[i]->remote_addr;
3297 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3298 		}
3299 
3300 		for (i = 0; i < nsd->ifs; i++) {
3301 			int listen;
3302 			struct udp_handler_data *data;
3303 
3304 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
3305 
3306 			if(i >= from && i < (from + numifs) && listen) {
3307 				data = region_alloc_zero(
3308 					nsd->server_region, sizeof(*data));
3309 				add_udp_handler(nsd, &nsd->udp[i], data);
3310 			} else {
3311 				/* close sockets intended for other servers */
3312 				server_close_socket(&nsd->udp[i]);
3313 			}
3314 		}
3315 	}
3316 
3317 	/*
3318 	 * Keep track of all the TCP accept handlers so we can enable
3319 	 * and disable them based on the current number of active TCP
3320 	 * connections.
3321 	 */
3322 	if (nsd->server_kind & NSD_SERVER_TCP) {
3323 		int child = nsd->this_child->child_num;
3324 		tcp_accept_handler_count = numifs;
3325 		tcp_accept_handlers = region_alloc_array(server_region,
3326 			numifs, sizeof(*tcp_accept_handlers));
3327 
3328 		for (i = 0; i < nsd->ifs; i++) {
3329 			int listen;
3330 			struct tcp_accept_handler_data *data;
3331 
3332 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
3333 
3334 			if(i >= from && i < (from + numifs) && listen) {
3335 				data = &tcp_accept_handlers[i-from];
3336 				memset(data, 0, sizeof(*data));
3337 				add_tcp_handler(nsd, &nsd->tcp[i], data);
3338 			} else {
3339 				/* close sockets intended for other servers */
3340 				/*
3341 				 * uncomment this once tcp servers are no
3342 				 * longer copied in the tcp fd copy line
3343 				 * in server_init().
3344 				server_close_socket(&nsd->tcp[i]);
3345 				*/
3346 				/* close sockets not meant for this server */
3347 				if(!listen)
3348 					server_close_socket(&nsd->tcp[i]);
3349 			}
3350 		}
3351 	} else {
3352 		tcp_accept_handler_count = 0;
3353 	}
3354 
3355 	/* The main loop... */
3356 	while ((mode = nsd->mode) != NSD_QUIT) {
3357 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
3358 
3359 		/* Do we need to do the statistics... */
3360 		if (mode == NSD_STATS) {
3361 #ifdef BIND8_STATS
3362 			int p = nsd->st_period;
3363 			nsd->st_period = 1; /* force stats printout */
3364 			/* Dump the statistics */
3365 			bind8_stats(nsd);
3366 			nsd->st_period = p;
3367 #else /* !BIND8_STATS */
3368 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3369 #endif /* BIND8_STATS */
3370 
3371 			nsd->mode = NSD_RUN;
3372 		}
3373 		else if (mode == NSD_REAP_CHILDREN) {
3374 			/* got signal, notify parent. parent reaps terminated children. */
3375 			if (nsd->this_child->parent_fd != -1) {
3376 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3377 				if (write(nsd->this_child->parent_fd,
3378 				    &parent_notify,
3379 				    sizeof(parent_notify)) == -1)
3380 				{
3381 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3382 						(int) nsd->this_child->pid, strerror(errno));
3383 				}
3384 			} else /* no parent, so reap 'em */
3385 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3386 			nsd->mode = NSD_RUN;
3387 		}
3388 		else if(mode == NSD_RUN) {
3389 			/* Wait for a query... */
3390 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3391 				if (errno != EINTR) {
3392 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3393 					break;
3394 				}
3395 			}
3396 		} else if(mode == NSD_QUIT) {
3397 			/* ignore here, quit */
3398 		} else {
3399 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
3400 				(int)mode);
3401 			nsd->mode = NSD_RUN;
3402 		}
3403 	}
3404 
3405 	service_remaining_tcp(nsd);
3406 #ifdef	BIND8_STATS
3407 	bind8_stats(nsd);
3408 #endif /* BIND8_STATS */
3409 
3410 #ifdef MEMCLEAN /* OS collects memory pages */
3411 #ifdef RATELIMIT
3412 	rrl_deinit(nsd->this_child->child_num);
3413 #endif
3414 	event_base_free(event_base);
3415 	region_destroy(server_region);
3416 #endif
3417 	server_shutdown(nsd);
3418 }
3419 
3420 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3421 {
3422 	int* timed_out = (int*)arg;
3423 	assert(event & EV_TIMEOUT); (void)event;
3424 	/* wake up the service tcp thread, note event is no longer
3425 	 * registered */
3426 	*timed_out = 1;
3427 }
3428 
3429 void
3430 service_remaining_tcp(struct nsd* nsd)
3431 {
3432 	struct tcp_handler_data* p;
3433 	struct event_base* event_base;
3434 	/* check if it is needed */
3435 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3436 		return;
3437 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3438 #ifdef USE_DNSTAP
3439 	/* remove dnstap collector, we cannot write there because the new
3440 	 * child process is using the file descriptor, or the child
3441 	 * process after that. */
3442 	dt_collector_destroy(nsd->dt_collector, nsd);
3443 	nsd->dt_collector = NULL;
3444 #endif
3445 	/* setup event base */
3446 	event_base = nsd_child_event_base();
3447 	if(!event_base) {
3448 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3449 		return;
3450 	}
3451 	/* register tcp connections */
3452 	for(p = tcp_active_list; p != NULL; p = p->next) {
3453 		struct timeval timeout;
3454 		int fd = p->event.ev_fd;
3455 #ifdef USE_MINI_EVENT
3456 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3457 #else
3458 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3459 #endif
3460 		void (*fn)(int, short, void*);
3461 #ifdef HAVE_SSL
3462 		if(p->tls) {
3463 			if((event&EV_READ))
3464 				fn = handle_tls_reading;
3465 			else	fn = handle_tls_writing;
3466 		} else {
3467 #endif
3468 			if((event&EV_READ))
3469 				fn = handle_tcp_reading;
3470 			else	fn = handle_tcp_writing;
3471 #ifdef HAVE_SSL
3472 		}
3473 #endif
3474 
3475 		p->tcp_no_more_queries = 1;
3476 		/* cap the timeout at 3 seconds (previously 1/10 second) */
3477 		if(p->tcp_timeout > 3000)
3478 			p->tcp_timeout = 3000;
3479 		timeout.tv_sec = p->tcp_timeout / 1000;
3480 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3481 		event_del(&p->event);
3482 		memset(&p->event, 0, sizeof(p->event));
3483 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3484 			fn, p);
3485 		if(event_base_set(event_base, &p->event) != 0)
3486 			log_msg(LOG_ERR, "event base set failed");
3487 		if(event_add(&p->event, &timeout) != 0)
3488 			log_msg(LOG_ERR, "event add failed");
3489 	}
3490 
3491 	/* handle it */
3492 	while(nsd->current_tcp_count > 0) {
3493 		mode_t m = server_signal_mode(nsd);
3494 		struct event timeout;
3495 		struct timeval tv;
3496 		int timed_out = 0;
3497 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3498 			m == NSD_REAP_CHILDREN) {
3499 			/* quit */
3500 			break;
3501 		}
3502 		/* timer */
3503 		/* if nothing happens within 3 seconds, stop servicing */
3504 		tv.tv_sec = 3;
3505 		tv.tv_usec = 0;
3506 		memset(&timeout, 0, sizeof(timeout));
3507 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3508 			&timed_out);
3509 		if(event_base_set(event_base, &timeout) != 0)
3510 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3511 		if(event_add(&timeout, &tv) != 0)
3512 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3513 
3514 		/* service loop */
3515 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3516 			if (errno != EINTR) {
3517 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3518 				break;
3519 			}
3520 		}
3521 		if(!timed_out) {
3522 			event_del(&timeout);
3523 		} else {
3524 			/* timed out, quit */
3525 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3526 			break;
3527 		}
3528 	}
3529 #ifdef MEMCLEAN
3530 	event_base_free(event_base);
3531 #endif
3532 	/* continue to quit after return */
3533 }
3534 
3535 /* Implement recvmmsg and sendmmsg if the platform does not provide them.
3536  * These functions are always used, even if nonblocking operations are
3537  * broken, in which case NUM_RECV_PER_SELECT is defined to 1 (one).
3538  */
3539 #if defined(HAVE_RECVMMSG)
3540 #define nsd_recvmmsg recvmmsg
3541 #else /* !HAVE_RECVMMSG */
3542 
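/*
 * Minimal recvmmsg() emulation: call recvfrom() once per msgvec entry
 * until a call fails or would block. Partial success is reported as the
 * number of messages received; an error after the first message stays
 * in errno and is picked up on the next call.
 */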
3543 static int
3544 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3545              int flags, struct timespec *timeout)
3546 {
3547 	unsigned int vpos = 0;
3548 	ssize_t rcvd;
3549 
3550 	/* timeout is ignored, ensure caller does not expect it to work */
3551 	assert(timeout == NULL); (void)timeout;
3552 
3553 	while(vpos < vlen) {
3554 		rcvd = recvfrom(sockfd,
3555 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3556 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3557 		                flags,
3558 		                msgvec[vpos].msg_hdr.msg_name,
3559 		               &msgvec[vpos].msg_hdr.msg_namelen);
3560 		if(rcvd < 0) {
3561 			break;
3562 		} else {
3563 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3564 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3565 			vpos++;
3566 		}
3567 	}
3568 
3569 	if(vpos) {
3570 		/* error will be picked up next time */
3571 		return (int)vpos;
3572 	} else if(errno == 0) {
3573 		return 0;
3574 	} else if(errno == EAGAIN) {
3575 		return 0;
3576 	}
3577 
3578 	return -1;
3579 }
3580 #endif /* HAVE_RECVMMSG */
3581 
3582 #ifdef HAVE_SENDMMSG
3583 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3584 #else /* !HAVE_SENDMMSG */
3585 
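/*
 * Minimal sendmmsg() emulation, mirroring nsd_recvmmsg() above: one
 * sendto() per message (each entry must carry exactly one iovec), with
 * partial success reported as the number of messages sent.
 */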
3586 static int
3587 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3588 {
3589 	unsigned int vpos = 0;
3590 	ssize_t snd;
3591 
3592 	while(vpos < vlen) {
3593 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3594 		snd = sendto(sockfd,
3595 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3596 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3597 		             flags,
3598 		             msgvec[vpos].msg_hdr.msg_name,
3599 		             msgvec[vpos].msg_hdr.msg_namelen);
3600 		if(snd < 0) {
3601 			break;
3602 		} else {
3603 			msgvec[vpos].msg_len = (unsigned int)snd;
3604 			vpos++;
3605 		}
3606 	}
3607 
3608 	if(vpos) {
3609 		return (int)vpos;
3610 	} else if(errno == 0) {
3611 		return 0;
3612 	}
3613 
3614 	return -1;
3615 }
3616 #endif /* HAVE_SENDMMSG */
3617 
3618 static int
3619 port_is_zero(
3620 #ifdef INET6
3621         struct sockaddr_storage *addr
3622 #else
3623         struct sockaddr_in *addr
3624 #endif
3625 	)
3626 {
3627 #ifdef INET6
3628 	if(addr->ss_family == AF_INET6) {
3629 		return (((struct sockaddr_in6 *)addr)->sin6_port) == 0;
3630 	} else if(addr->ss_family == AF_INET) {
3631 		return (((struct sockaddr_in *)addr)->sin_port) == 0;
3632 	}
3633 	return 0;
3634 #else
3635 	if(addr->sin_family == AF_INET) {
3636 		return addr->sin_port == 0;
3637 	}
3638 	return 0;
3639 #endif
3640 }
3641 
3642 /* Parses the PROXYv2 header from buf and updates the query with the
3643  * proxied client address. Returns 1 on success, 0 on failure. */
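/* PROXYv2 wire format, for reference: a 12 byte signature, a
 * version/command byte, a family/protocol byte and a 16 bit network
 * order length (together the PP2_HEADER_SIZE fixed part), followed by
 * `len' bytes of addresses: source/destination address and port for
 * INET or INET6. */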
3644 static int
3645 consume_pp2_header(struct buffer* buf, struct query* q, int stream)
3646 {
3647 	size_t size;
3648 	struct pp2_header* header;
3649 	int err = pp2_read_header(buffer_begin(buf), buffer_remaining(buf));
3650 	if(err) {
3651 		VERBOSITY(4, (LOG_ERR, "proxy-protocol: could not parse "
3652 			"PROXYv2 header: %s", pp_lookup_error(err)));
3653 		return 0;
3654 	}
3655 	header = (struct pp2_header*)buffer_begin(buf);
3656 	size = PP2_HEADER_SIZE + read_uint16(&header->len);
3657 	if(size > buffer_limit(buf)) {
3658 		VERBOSITY(4, (LOG_ERR, "proxy-protocol: not enough buffer "
3659 			"size to read PROXYv2 header"));
3660 		return 0;
3661 	}
3662 	if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) {
3663 		/* A connection from the proxy itself.
3664 		 * No need to do anything with addresses. */
3665 		goto done;
3666 	}
3667 	if(header->fam_prot == PP2_UNSPEC_UNSPEC) {
3668 		/* Unspecified family and protocol. This could be used for
3669 		 * health checks by proxies.
3670 		 * No need to do anything with addresses. */
3671 		goto done;
3672 	}
3673 	/* Read the proxied address */
3674 	switch(header->fam_prot) {
3675 		case PP2_INET_STREAM:
3676 		case PP2_INET_DGRAM:
3677 			{
3678 			struct sockaddr_in* addr =
3679 				(struct sockaddr_in*)&q->client_addr;
3680 			addr->sin_family = AF_INET;
3681 			memmove(&addr->sin_addr.s_addr,
3682 				&header->addr.addr4.src_addr, 4);
3683 			memmove(&addr->sin_port, &header->addr.addr4.src_port,
3684 				2);
3685 			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in);
3686 			}
3687 			/* Ignore the destination address; it should be us. */
3688 			break;
3689 #ifdef INET6
3690 		case PP2_INET6_STREAM:
3691 		case PP2_INET6_DGRAM:
3692 			{
3693 			struct sockaddr_in6* addr =
3694 				(struct sockaddr_in6*)&q->client_addr;
3695 			memset(addr, 0, sizeof(*addr));
3696 			addr->sin6_family = AF_INET6;
3697 			memmove(&addr->sin6_addr,
3698 				header->addr.addr6.src_addr, 16);
3699 			memmove(&addr->sin6_port, &header->addr.addr6.src_port,
3700 				2);
3701 			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6);
3702 			}
3703 			/* Ignore the destination address; it should be us. */
3704 			break;
3705 #endif /* INET6 */
3706 		default:
3707 			VERBOSITY(2, (LOG_ERR, "proxy-protocol: unsupported "
3708 				"family and protocol 0x%x",
3709 				(int)header->fam_prot));
3710 			return 0;
3711 	}
3712 	q->is_proxied = 1;
3713 done:
3714 	if(!stream) {
3715 		/* We are reading a whole packet;
3716 		 * move the rest of the data to overwrite the PROXYv2 header */
3717 		/* XXX can we do better to avoid memmove? */
3718 		memmove(header, ((char*)header)+size, buffer_limit(buf)-size);
3719 		buffer_set_limit(buf, buffer_limit(buf)-size);
3720 	}
3721 	return 1;
3722 }
3723 
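/* Handle readiness on a UDP socket: receive a batch of packets, process
 * each query, compact dropped queries out of the batch, and send the
 * answers back with a batched send. */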
3724 static void
3725 handle_udp(int fd, short event, void* arg)
3726 {
3727 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3728 	int received, sent, recvcount, i;
3729 	struct query *q;
3730 	uint32_t now = 0;
3731 
3732 	if (!(event & EV_READ)) {
3733 		return;
3734 	}
3735 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3736 	/* this printf strangely gave a performance increase on Linux */
3737 	/* printf("recvcount %d \n", recvcount); */
3738 	if (recvcount == -1) {
3739 		if (errno != EAGAIN && errno != EINTR) {
3740 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3741 			STATUP(data->nsd, rxerr);
3742 			/* No zone statup */
3743 		}
3744 		/* Simply no data available */
3745 		return;
3746 	}
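	/* Process each received message. Dropped queries are swapped to the
	 * tail of the arrays and recvcount is decreased, so the batched
	 * send below only covers entries that hold answers. */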
3747 	for (i = 0; i < recvcount; i++) {
3748 	loopstart:
3749 		received = msgs[i].msg_len;
3750 		queries[i]->remote_addrlen = msgs[i].msg_hdr.msg_namelen;
3751 		queries[i]->client_addrlen = (socklen_t)sizeof(queries[i]->client_addr);
3752 		queries[i]->is_proxied = 0;
3753 		q = queries[i];
3754 		if (received == -1) {
3755 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3756 #if defined(HAVE_RECVMMSG)
3757 				msgs[i].msg_hdr.msg_flags
3758 #else
3759 				errno
3760 #endif
3761 				));
3762 			STATUP(data->nsd, rxerr);
3763 			/* No zone statup */
3764 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3765 			iovecs[i].iov_len = buffer_remaining(q->packet);
3766 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3767 			goto swap_drop;
3768 		}
3769 
3770 		/* Account... */
3771 #ifdef BIND8_STATS
3772 		if (data->socket->addr.ai_family == AF_INET) {
3773 			STATUP(data->nsd, qudp);
3774 		} else if (data->socket->addr.ai_family == AF_INET6) {
3775 			STATUP(data->nsd, qudp6);
3776 		}
3777 #endif
3778 
3779 		buffer_skip(q->packet, received);
3780 		buffer_flip(q->packet);
3781 		if(data->pp2_enabled && !consume_pp2_header(q->packet, q, 0)) {
3782 			VERBOSITY(2, (LOG_ERR, "proxy-protocol: could not "
3783 				"consume PROXYv2 header"));
3784 			goto swap_drop;
3785 		}
3786 		if(!q->is_proxied) {
3787 			q->client_addrlen = q->remote_addrlen;
3788 			memmove(&q->client_addr, &q->remote_addr,
3789 				q->remote_addrlen);
3790 		}
3791 #ifdef USE_DNSTAP
3792 		/*
3793 		 * send the UDP query, with the server (local) address and the client address, to the dnstap process
3794 		 */
3795 		log_addr("query from client", &q->client_addr);
3796 		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3797 		if(verbosity >= 6 && q->is_proxied)
3798 			log_addr("query via proxy", &q->remote_addr);
3799 		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->client_addr, q->client_addrlen,
3800 			q->tcp, q->packet);
3801 #endif /* USE_DNSTAP */
3802 
3803 		/* Process and answer the query... */
3804 		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
3805 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3806 				STATUP(data->nsd, nona);
3807 				ZTATUP(data->nsd, q->zone, nona);
3808 			}
3809 
3810 #ifdef USE_ZONE_STATS
3811 			if (data->socket->addr.ai_family == AF_INET) {
3812 				ZTATUP(data->nsd, q->zone, qudp);
3813 			} else if (data->socket->addr.ai_family == AF_INET6) {
3814 				ZTATUP(data->nsd, q->zone, qudp6);
3815 			}
3816 #endif
3817 
3818 			/* Add EDNS0 and TSIG info if necessary.  */
3819 			query_add_optional(q, data->nsd, &now);
3820 
3821 			buffer_flip(q->packet);
3822 			iovecs[i].iov_len = buffer_remaining(q->packet);
3823 #ifdef BIND8_STATS
3824 			/* Account the rcode & TC... */
3825 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3826 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3827 			if (TC(q->packet)) {
3828 				STATUP(data->nsd, truncated);
3829 				ZTATUP(data->nsd, q->zone, truncated);
3830 			}
3831 #endif /* BIND8_STATS */
3832 #ifdef USE_DNSTAP
3833 			 * send the UDP response, with the server (local) address and the client address, to the dnstap process
3834 			 * sending UDP-response with server address (local) and client address to dnstap process
3835 			 */
3836 			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3837 			log_addr("response to client", &q->client_addr);
3838 			if(verbosity >= 6 && q->is_proxied)
3839 				log_addr("response via proxy", &q->remote_addr);
3840 			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
3841 				&q->client_addr, q->client_addrlen, q->tcp, q->packet,
3842 				q->zone);
3843 #endif /* USE_DNSTAP */
3844 		} else {
3845 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3846 			iovecs[i].iov_len = buffer_remaining(q->packet);
3847 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3848 		swap_drop:
3849 			STATUP(data->nsd, dropped);
3850 			ZTATUP(data->nsd, q->zone, dropped);
3851 			if(i != recvcount-1) {
3852 				/* swap with last and decrease recvcount */
3853 				struct mmsghdr mtmp = msgs[i];
3854 				struct iovec iotmp = iovecs[i];
3855 				recvcount--;
3856 				msgs[i] = msgs[recvcount];
3857 				iovecs[i] = iovecs[recvcount];
3858 				queries[i] = queries[recvcount];
3859 				msgs[recvcount] = mtmp;
3860 				iovecs[recvcount] = iotmp;
3861 				queries[recvcount] = q;
3862 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3863 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3864 				goto loopstart;
3865 			} else { recvcount --; }
3866 		}
3867 	}
3868 
3869 	/* send until all are sent */
3870 	i = 0;
3871 	while(i<recvcount) {
3872 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3873 		if(sent == -1) {
3874 			if(errno == ENOBUFS ||
3875 #ifdef EWOULDBLOCK
3876 				errno == EWOULDBLOCK ||
3877 #endif
3878 				errno == EAGAIN) {
3879 				/* block to wait until the send buffer is available */
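				/* temporarily clear O_NONBLOCK so the
				 * retried send blocks until buffer space
				 * frees up, then restore the flag */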
3880 				int flag, errstore;
3881 				if((flag = fcntl(fd, F_GETFL)) == -1) {
3882 					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
3883 					flag = 0;
3884 				}
3885 				flag &= ~O_NONBLOCK;
3886 				if(fcntl(fd, F_SETFL, flag) == -1)
3887 					log_msg(LOG_ERR, "cannot fcntl F_SETFL to clear O_NONBLOCK: %s", strerror(errno));
3888 				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3889 				errstore = errno;
3890 				flag |= O_NONBLOCK;
3891 				if(fcntl(fd, F_SETFL, flag) == -1)
3892 					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
3893 				if(sent != -1) {
3894 					i += sent;
3895 					continue;
3896 				}
3897 				errno = errstore;
3898 			}
3899 			if(errno == EINVAL) {
3900 				/* skip the invalid argument entry,
3901 				 * send the remaining packets in the list */
3902 				if(!(port_is_zero((void*)&queries[i]->remote_addr) &&
3903 					verbosity < 3)) {
3904 					const char* es = strerror(errno);
3905 					char a[64];
3906 					addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
3907 					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3908 				}
3909 				i += 1;
3910 				continue;
3911 			}
3912 			/* don't log transient network full errors, unless
3913 			 * on higher verbosity */
3914 			if(!(errno == ENOBUFS && verbosity < 1) &&
3915 #ifdef EWOULDBLOCK
3916 			   errno != EWOULDBLOCK &&
3917 #endif
3918 			   errno != EAGAIN) {
3919 				const char* es = strerror(errno);
3920 				char a[64];
3921 				addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
3922 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3923 			}
3924 #ifdef BIND8_STATS
3925 			data->nsd->st->txerr += recvcount-i;
3926 #endif /* BIND8_STATS */
3927 			break;
3928 		}
3929 		i += sent;
3930 	}
3931 	for(i=0; i<recvcount; i++) {
3932 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3933 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3934 		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3935 	}
3936 }
3937 
3938 #ifdef HAVE_SSL
3939 /*
3940  * Setup an event for the tcp handler.
3941  */
3942 static void
3943 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3944        int fd, short event)
3945 {
3946 	struct timeval timeout;
3947 	struct event_base* ev_base;
3948 
3949 	timeout.tv_sec = data->nsd->tcp_timeout;
3950 	timeout.tv_usec = 0L;
3951 
3952 	ev_base = data->event.ev_base;
3953 	event_del(&data->event);
3954 	memset(&data->event, 0, sizeof(data->event));
3955 	event_set(&data->event, fd, event, fn, data);
3956 	if(event_base_set(ev_base, &data->event) != 0)
3957 		log_msg(LOG_ERR, "event base set failed");
3958 	if(event_add(&data->event, &timeout) != 0)
3959 		log_msg(LOG_ERR, "event add failed");
3960 }
3961 #endif /* HAVE_SSL */
3962 
3963 static void
3964 cleanup_tcp_handler(struct tcp_handler_data* data)
3965 {
3966 	event_del(&data->event);
3967 #ifdef HAVE_SSL
3968 	if(data->tls) {
3969 		SSL_shutdown(data->tls);
3970 		SSL_free(data->tls);
3971 		data->tls = NULL;
3972 	}
3973 #endif
3974 	data->pp2_header_state = pp2_header_none;
3975 	close(data->event.ev_fd);
3976 	if(data->prev)
3977 		data->prev->next = data->next;
3978 	else	tcp_active_list = data->next;
3979 	if(data->next)
3980 		data->next->prev = data->prev;
3981 
3982 	/*
3983 	 * Enable the TCP accept handlers when the current number of
3984 	 * TCP connections is about to drop below the maximum number
3985 	 * of TCP connections.
3986 	 */
3987 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3988 		configure_handler_event_types(EV_READ|EV_PERSIST);
3989 		if(slowaccept) {
3990 			event_del(&slowaccept_event);
3991 			slowaccept = 0;
3992 		}
3993 	}
3994 	--data->nsd->current_tcp_count;
3995 	assert(data->nsd->current_tcp_count >= 0);
3996 
3997 	region_destroy(data->region);
3998 }
3999 
4000 /* Read more data into the buffer for tcp read. Pass the amount of additional
4001  * data required. Returns false if nothing more can be done this event (would
4002  * block, or the handler was cleaned up on error/EOF); true if data was read. */
4003 static int
4004 more_read_buf_tcp(int fd, struct tcp_handler_data* data, void* bufpos,
4005 	size_t add_amount, ssize_t* received)
4006 {
4007 	*received = read(fd, bufpos, add_amount);
4008 	if (*received == -1) {
4009 		if (errno == EAGAIN || errno == EINTR) {
4010 			/*
4011 			 * Read would block, wait until more
4012 			 * data is available.
4013 			 */
4014 			return 0;
4015 		} else {
4016 			char buf[48];
4017 			addr2str(&data->query->remote_addr, buf, sizeof(buf));
4018 #ifdef ECONNRESET
4019 			if (verbosity >= 2 || errno != ECONNRESET)
4020 #endif /* ECONNRESET */
4021 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
4022 			cleanup_tcp_handler(data);
4023 			return 0;
4024 		}
4025 	} else if (*received == 0) {
4026 		/* EOF */
4027 		cleanup_tcp_handler(data);
4028 		return 0;
4029 	}
4030 	return 1;
4031 }
4032 
4033 static void
4034 handle_tcp_reading(int fd, short event, void* arg)
4035 {
4036 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4037 	ssize_t received;
4038 	struct event_base* ev_base;
4039 	struct timeval timeout;
4040 	uint32_t now = 0;
4041 
4042 	if ((event & EV_TIMEOUT)) {
4043 		/* Connection timed out.  */
4044 		cleanup_tcp_handler(data);
4045 		return;
4046 	}
4047 
4048 	if ((data->nsd->tcp_query_count > 0 &&
4049 	     data->query_count >= data->nsd->tcp_query_count) ||
4050 	    (data->query_count > 0 && data->tcp_no_more_queries))
4051 	{
4052 		/* No more queries allowed on this tcp connection. */
4053 		cleanup_tcp_handler(data);
4054 		return;
4055 	}
4056 
4057 	assert((event & EV_READ));
4058 
4059 	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
4060 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4061 		data->query_needs_reset = 0;
4062 	}
4063 
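	/* If configured, first consume the PROXYv2 header. It arrives in
	 * two parts tracked by pp2_header_state: the fixed part, then the
	 * variable part whose size the fixed part declares. The reads may
	 * span multiple read events. */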
4064 	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
4065 		struct pp2_header* header = NULL;
4066 		size_t want_read_size = 0;
4067 		size_t current_read_size = 0;
4068 		if(data->pp2_header_state == pp2_header_none) {
4069 			want_read_size = PP2_HEADER_SIZE;
4070 			if(buffer_remaining(data->query->packet) <
4071 				want_read_size) {
4072 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4073 				cleanup_tcp_handler(data);
4074 				return;
4075 			}
4076 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4077 			current_read_size = want_read_size;
4078 			if(data->bytes_transmitted < current_read_size) {
4079 				if(!more_read_buf_tcp(fd, data,
4080 					(void*)buffer_at(data->query->packet,
4081 						data->bytes_transmitted),
4082 					current_read_size - data->bytes_transmitted,
4083 					&received))
4084 					return;
4085 				data->bytes_transmitted += received;
4086 				buffer_skip(data->query->packet, received);
4087 				if(data->bytes_transmitted != current_read_size)
4088 					return;
4089 				data->pp2_header_state = pp2_header_init;
4090 			}
4091 		}
4092 		if(data->pp2_header_state == pp2_header_init) {
4093 			int err;
4094 			err = pp2_read_header(buffer_begin(data->query->packet),
4095 				buffer_limit(data->query->packet));
4096 			if(err) {
4097 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
4098 				cleanup_tcp_handler(data);
4099 				return;
4100 			}
4101 			header = (struct pp2_header*)buffer_begin(data->query->packet);
4102 			want_read_size = ntohs(header->len);
4103 			if(buffer_limit(data->query->packet) <
4104 				PP2_HEADER_SIZE + want_read_size) {
4105 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4106 				cleanup_tcp_handler(data);
4107 				return;
4108 			}
4109 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4110 			current_read_size = PP2_HEADER_SIZE + want_read_size;
4111 			if(want_read_size == 0) {
4112 				/* nothing more to read; header is complete */
4113 				data->pp2_header_state = pp2_header_done;
4114 			} else if(data->bytes_transmitted < current_read_size) {
4115 				if(!more_read_buf_tcp(fd, data,
4116 					(void*)buffer_at(data->query->packet,
4117 						data->bytes_transmitted),
4118 					current_read_size - data->bytes_transmitted,
4119 					&received))
4120 					return;
4121 				data->bytes_transmitted += received;
4122 				buffer_skip(data->query->packet, received);
4123 				if(data->bytes_transmitted != current_read_size)
4124 					return;
4125 				data->pp2_header_state = pp2_header_done;
4126 			}
4127 		}
4128 		if(data->pp2_header_state != pp2_header_done || !header) {
4129 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
4130 
4131 			cleanup_tcp_handler(data);
4132 			return;
4133 		}
4134 		buffer_flip(data->query->packet);
4135 		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
4136 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
4137 
4138 			cleanup_tcp_handler(data);
4139 			return;
4140 		}
4141 		/* Clear and reset the buffer to read the following
4142 		 * DNS packet(s). */
4143 		buffer_clear(data->query->packet);
4144 		data->bytes_transmitted = 0;
4145 	}
4146 
4147 	/*
4148 	 * Check if we received the leading packet length bytes yet.
4149 	 */
4150 	if (data->bytes_transmitted < sizeof(uint16_t)) {
4151 		if(!more_read_buf_tcp(fd, data,
4152 			(char*) &data->query->tcplen + data->bytes_transmitted,
4153 			sizeof(uint16_t) - data->bytes_transmitted, &received))
4154 			return;
4155 		data->bytes_transmitted += received;
4156 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4157 			/*
4158 			 * Not done with the tcplen yet, wait for more
4159 			 * data to become available.
4160 			 */
4161 			return;
4162 		}
4163 		assert(data->bytes_transmitted == sizeof(uint16_t));
4164 
4165 		data->query->tcplen = ntohs(data->query->tcplen);
4166 
4167 		/*
4168 		 * Minimum query size is:
4169 		 *
4170 		 *     Size of the header (12)
4171 		 *   + Root domain name   (1)
4172 		 *   + Query class        (2)
4173 		 *   + Query type         (2)
4174 		 */
4175 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4176 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4177 			cleanup_tcp_handler(data);
4178 			return;
4179 		}
4180 
4181 		if (data->query->tcplen > data->query->maxlen) {
4182 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4183 			cleanup_tcp_handler(data);
4184 			return;
4185 		}
4186 
4187 		buffer_set_limit(data->query->packet, data->query->tcplen);
4188 	}
4189 
4190 	assert(buffer_remaining(data->query->packet) > 0);
4191 
4192 	/* Read the (remaining) query data.  */
4193 	if(!more_read_buf_tcp(fd, data, buffer_current(data->query->packet),
4194 		buffer_remaining(data->query->packet), &received))
4195 		return;
4196 	data->bytes_transmitted += received;
4197 	buffer_skip(data->query->packet, received);
4198 	if (buffer_remaining(data->query->packet) > 0) {
4199 		/*
4200 		 * Message not yet complete, wait for more data to
4201 		 * become available.
4202 		 */
4203 		return;
4204 	}
4205 
4206 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4207 
4208 	/* Account... */
4209 #ifdef BIND8_STATS
4210 #ifndef INET6
4211 	STATUP(data->nsd, ctcp);
4212 #else
4213 	if (data->query->remote_addr.ss_family == AF_INET) {
4214 		STATUP(data->nsd, ctcp);
4215 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4216 		STATUP(data->nsd, ctcp6);
4217 	}
4218 #endif
4219 #endif /* BIND8_STATS */
4220 
4221 	/* We have a complete query, process it.  */
4222 
4223 	/* tcp-query-count: increment the per-connection query counter */
4224 	data->query_count++;
4225 
4226 	buffer_flip(data->query->packet);
4227 #ifdef USE_DNSTAP
4228 	/*
4229 	 * send the TCP query, with the server (local) address and the client address, to the dnstap process
4230 	 */
4231 	log_addr("query from client", &data->query->client_addr);
4232 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4233 	if(verbosity >= 6 && data->query->is_proxied)
4234 		log_addr("query via proxy", &data->query->remote_addr);
4235 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4236 		data->query->client_addrlen, data->query->tcp, data->query->packet);
4237 #endif /* USE_DNSTAP */
4238 	data->query_state = server_process_query(data->nsd, data->query, &now);
4239 	if (data->query_state == QUERY_DISCARDED) {
4240 		/* Drop the packet and the entire connection... */
4241 		STATUP(data->nsd, dropped);
4242 		ZTATUP(data->nsd, data->query->zone, dropped);
4243 		cleanup_tcp_handler(data);
4244 		return;
4245 	}
4246 
4247 #ifdef BIND8_STATS
4248 	if (RCODE(data->query->packet) == RCODE_OK
4249 	    && !AA(data->query->packet))
4250 	{
4251 		STATUP(data->nsd, nona);
4252 		ZTATUP(data->nsd, data->query->zone, nona);
4253 	}
4254 #endif /* BIND8_STATS */
4255 
4256 #ifdef USE_ZONE_STATS
4257 #ifndef INET6
4258 	ZTATUP(data->nsd, data->query->zone, ctcp);
4259 #else
4260 	if (data->query->remote_addr.ss_family == AF_INET) {
4261 		ZTATUP(data->nsd, data->query->zone, ctcp);
4262 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4263 		ZTATUP(data->nsd, data->query->zone, ctcp6);
4264 	}
4265 #endif
4266 #endif /* USE_ZONE_STATS */
4267 
4268 	query_add_optional(data->query, data->nsd, &now);
4269 
4270 	/* Switch to the tcp write handler.  */
4271 	buffer_flip(data->query->packet);
4272 	data->query->tcplen = buffer_remaining(data->query->packet);
4273 #ifdef BIND8_STATS
4274 	/* Account the rcode & TC... */
4275 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4276 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4277 	if (TC(data->query->packet)) {
4278 		STATUP(data->nsd, truncated);
4279 		ZTATUP(data->nsd, data->query->zone, truncated);
4280 	}
4281 #endif /* BIND8_STATS */
4282 #ifdef USE_DNSTAP
4283 	/*
4284 	 * send the TCP response, with the server (local) address found earlier and the client address, to the dnstap process
4285 	 */
4286 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4287 	log_addr("response to client", &data->query->client_addr);
4288 	if(verbosity >= 6 && data->query->is_proxied)
4289 		log_addr("response via proxy", &data->query->remote_addr);
4290 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4291 		data->query->client_addrlen, data->query->tcp, data->query->packet,
4292 		data->query->zone);
4293 #endif /* USE_DNSTAP */
4294 	data->bytes_transmitted = 0;
4295 
4296 	timeout.tv_sec = data->tcp_timeout / 1000;
4297 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4298 
4299 	ev_base = data->event.ev_base;
4300 	event_del(&data->event);
4301 	memset(&data->event, 0, sizeof(data->event));
4302 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4303 		handle_tcp_writing, data);
4304 	if(event_base_set(ev_base, &data->event) != 0)
4305 		log_msg(LOG_ERR, "event base set tcpr failed");
4306 	if(event_add(&data->event, &timeout) != 0)
4307 		log_msg(LOG_ERR, "event add tcpr failed");
4308 	/* see if we can write the answer right away (usually so; EAGAIN if not) */
4309 	handle_tcp_writing(fd, EV_WRITE, data);
4310 }
4311 
4312 static void
4313 handle_tcp_writing(int fd, short event, void* arg)
4314 {
4315 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4316 	ssize_t sent;
4317 	struct query *q = data->query;
4318 	struct timeval timeout;
4319 	struct event_base* ev_base;
4320 	uint32_t now = 0;
4321 
4322 	if ((event & EV_TIMEOUT)) {
4323 		/* Connection timed out.  */
4324 		cleanup_tcp_handler(data);
4325 		return;
4326 	}
4327 
4328 	assert((event & EV_WRITE));
4329 
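	/* The response is prefixed with its 16 bit length. With writev the
	 * length and the packet body are gathered into a single write;
	 * without it, the length bytes are written on their own first. */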
4330 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
4331 		/* Writing the response packet length.  */
4332 		uint16_t n_tcplen = htons(q->tcplen);
4333 #ifdef HAVE_WRITEV
4334 		struct iovec iov[2];
4335 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
4336 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
4337 		iov[1].iov_base = buffer_begin(q->packet);
4338 		iov[1].iov_len = buffer_limit(q->packet);
4339 		sent = writev(fd, iov, 2);
4340 #else /* HAVE_WRITEV */
4341 		sent = write(fd,
4342 			     (const char *) &n_tcplen + data->bytes_transmitted,
4343 			     sizeof(n_tcplen) - data->bytes_transmitted);
4344 #endif /* HAVE_WRITEV */
4345 		if (sent == -1) {
4346 			if (errno == EAGAIN || errno == EINTR) {
4347 				/*
4348 				 * Write would block, wait until
4349 				 * socket becomes writable again.
4350 				 */
4351 				return;
4352 			} else {
4353 #ifdef ECONNRESET
4354 				if(verbosity >= 2 || errno != ECONNRESET)
4355 #endif /* ECONNRESET */
4356 #ifdef EPIPE
4357 				  if(verbosity >= 2 || errno != EPIPE)
4358 #endif /* EPIPE 'broken pipe' */
4359 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4360 				cleanup_tcp_handler(data);
4361 				return;
4362 			}
4363 		}
4364 
4365 		data->bytes_transmitted += sent;
4366 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
4367 			/*
4368 			 * Writing not complete, wait until socket
4369 			 * becomes writable again.
4370 			 */
4371 			return;
4372 		}
4373 
4374 #ifdef HAVE_WRITEV
4375 		sent -= sizeof(n_tcplen);
4376 		/* handle potential 'packet done' code */
4377 		goto packet_could_be_done;
4378 #endif
4379 	}
4380 
4381 	sent = write(fd,
4382 		     buffer_current(q->packet),
4383 		     buffer_remaining(q->packet));
4384 	if (sent == -1) {
4385 		if (errno == EAGAIN || errno == EINTR) {
4386 			/*
4387 			 * Write would block, wait until
4388 			 * socket becomes writable again.
4389 			 */
4390 			return;
4391 		} else {
4392 #ifdef ECONNRESET
4393 			if(verbosity >= 2 || errno != ECONNRESET)
4394 #endif /* ECONNRESET */
4395 #ifdef EPIPE
4396 			  if(verbosity >= 2 || errno != EPIPE)
4397 #endif /* EPIPE 'broken pipe' */
4398 			    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4399 			cleanup_tcp_handler(data);
4400 			return;
4401 		}
4402 	}
4403 
4404 	data->bytes_transmitted += sent;
4405 #ifdef HAVE_WRITEV
4406   packet_could_be_done:
4407 #endif
4408 	buffer_skip(q->packet, sent);
4409 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4410 		/*
4411 		 * Still more data to write when socket becomes
4412 		 * writable again.
4413 		 */
4414 		return;
4415 	}
4416 
4417 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4418 
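	/* Zone transfers are produced and written in chunks: after each
	 * chunk is sent, the next one is generated and the write handler
	 * is re-armed, until the query state becomes QUERY_PROCESSED. */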
4419 	if (data->query_state == QUERY_IN_AXFR ||
4420 		data->query_state == QUERY_IN_IXFR) {
4421 		/* Continue processing AXFR and writing back results.  */
4422 		buffer_clear(q->packet);
4423 		if(data->query_state == QUERY_IN_AXFR)
4424 			data->query_state = query_axfr(data->nsd, q, 0);
4425 		else data->query_state = query_ixfr(data->nsd, q);
4426 		if (data->query_state != QUERY_PROCESSED) {
4427 			query_add_optional(data->query, data->nsd, &now);
4428 
4429 			/* Reset data. */
4430 			buffer_flip(q->packet);
4431 			q->tcplen = buffer_remaining(q->packet);
4432 			data->bytes_transmitted = 0;
4433 			/* Reset timeout.  */
4434 			timeout.tv_sec = data->tcp_timeout / 1000;
4435 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4436 			ev_base = data->event.ev_base;
4437 			event_del(&data->event);
4438 			memset(&data->event, 0, sizeof(data->event));
4439 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4440 				handle_tcp_writing, data);
4441 			if(event_base_set(ev_base, &data->event) != 0)
4442 				log_msg(LOG_ERR, "event base set tcpw failed");
4443 			if(event_add(&data->event, &timeout) != 0)
4444 				log_msg(LOG_ERR, "event add tcpw failed");
4445 
4446 			/*
4447 			 * Write data if/when the socket is writable
4448 			 * again.
4449 			 */
4450 			return;
4451 		}
4452 	}
4453 
4454 	/*
4455 	 * Done sending, wait for the next request to arrive on the
4456 	 * TCP socket by installing the TCP read handler.
4457 	 */
4458 	if ((data->nsd->tcp_query_count > 0 &&
4459 		data->query_count >= data->nsd->tcp_query_count) ||
4460 		data->tcp_no_more_queries) {
4461 
4462 		(void) shutdown(fd, SHUT_WR);
4463 	}
4464 
4465 	data->bytes_transmitted = 0;
4466 	data->query_needs_reset = 1;
4467 
4468 	timeout.tv_sec = data->tcp_timeout / 1000;
4469 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4470 	ev_base = data->event.ev_base;
4471 	event_del(&data->event);
4472 	memset(&data->event, 0, sizeof(data->event));
4473 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4474 		handle_tcp_reading, data);
4475 	if(event_base_set(ev_base, &data->event) != 0)
4476 		log_msg(LOG_ERR, "event base set tcpw failed");
4477 	if(event_add(&data->event, &timeout) != 0)
4478 		log_msg(LOG_ERR, "event add tcpw failed");
4479 }
4480 
4481 #ifdef HAVE_SSL
4482 /** create SSL object and associate fd */
4483 static SSL*
4484 incoming_ssl_fd(SSL_CTX* ctx, int fd)
4485 {
4486 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
4487 	if(!ssl) {
4488 		log_crypto_err("could not SSL_new");
4489 		return NULL;
4490 	}
4491 	SSL_set_accept_state(ssl);
4492 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
4493 	if(!SSL_set_fd(ssl, fd)) {
4494 		log_crypto_err("could not SSL_set_fd");
4495 		SSL_free(ssl);
4496 		return NULL;
4497 	}
4498 	return ssl;
4499 }
4500 
4501 /** TLS handshake to upgrade TCP connection */
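/* SSL_do_handshake() may need the socket to be readable or writable;
 * shake_state records which condition is awaited, so the event
 * registration can be flipped between the read and write handlers and
 * restored once the handshake completes. */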
4502 static int
4503 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
4504 {
4505 	int r;
4506 	if(data->shake_state == tls_hs_read_event) {
4507 		/* read condition satisfied; back to writing */
4508 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4509 		data->shake_state = tls_hs_none;
4510 		return 1;
4511 	}
4512 	if(data->shake_state == tls_hs_write_event) {
4513 		/* write condition satisfied; back to reading */
4514 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4515 		data->shake_state = tls_hs_none;
4516 		return 1;
4517 	}
4518 
4519 	/* (continue to) setup the TLS connection */
4520 	ERR_clear_error();
4521 	r = SSL_do_handshake(data->tls);
4522 
4523 	if(r != 1) {
4524 		int want = SSL_get_error(data->tls, r);
4525 		if(want == SSL_ERROR_WANT_READ) {
4526 			if(data->shake_state == tls_hs_read) {
4527 				/* try again later */
4528 				return 1;
4529 			}
4530 			data->shake_state = tls_hs_read;
4531 			/* switch back to reading mode */
4532 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4533 			return 1;
4534 		} else if(want == SSL_ERROR_WANT_WRITE) {
4535 			if(data->shake_state == tls_hs_write) {
4536 				/* try again later */
4537 				return 1;
4538 			}
4539 			data->shake_state = tls_hs_write;
4540 			/* switch back to writing mode */
4541 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4542 			return 1;
4543 		} else {
4544 			if(r == 0)
4545 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
4546 			else {
4547 				unsigned long err = ERR_get_error();
4548 				if(!squelch_err_ssl_handshake(err)) {
4549 					char a[64], s[256];
4550 					addr2str(&data->query->remote_addr, a, sizeof(a));
4551 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
4552 					log_crypto_from_err(s, err);
4553 				}
4554 			}
4555 			cleanup_tcp_handler(data);
4556 			return 0;
4557 		}
4558 	}
4559 
4560 	/* Used to log a successful upgrade, for testing - could be removed */
4561 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
4562 	/* set back to the event we need to have when reading (or writing) */
4563 	if(data->shake_state == tls_hs_read && writing) {
4564 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4565 	} else if(data->shake_state == tls_hs_write && !writing) {
4566 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4567 	}
4568 	data->shake_state = tls_hs_none;
4569 	return 1;
4570 }
4571 
4572 /* Read more data into the buffer for tls read. Pass the amount of additional
4573  * data required. Returns false if nothing more can be done this event (would
4574  * block, or the handler was cleaned up on error/EOF); true if data was read. */
4575 static int
4576 more_read_buf_tls(int fd, struct tcp_handler_data* data, void* bufpos,
4577 	size_t add_amount, ssize_t* received)
4578 {
4579 	ERR_clear_error();
4580 	if((*received=SSL_read(data->tls, bufpos, add_amount)) <= 0) {
4581 		int want = SSL_get_error(data->tls, *received);
4582 		if(want == SSL_ERROR_ZERO_RETURN) {
4583 			cleanup_tcp_handler(data);
4584 			return 0; /* shutdown, closed */
4585 		} else if(want == SSL_ERROR_WANT_READ) {
4586 			/* wants to be called again */
4587 			return 0;
4588 		}
4589 		else if(want == SSL_ERROR_WANT_WRITE) {
4590 			/* switch to writing */
4591 			data->shake_state = tls_hs_write_event;
4592 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4593 			return 0;
4594 		}
4595 		cleanup_tcp_handler(data);
4596 		log_crypto_err("could not SSL_read");
4597 		return 0;
4598 	}
4599 	return 1;
4600 }
4601 
4602 /** handle TLS reading of incoming query */
4603 static void
4604 handle_tls_reading(int fd, short event, void* arg)
4605 {
4606 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4607 	ssize_t received;
4608 	uint32_t now = 0;
4609 
4610 	if ((event & EV_TIMEOUT)) {
4611 		/* Connection timed out.  */
4612 		cleanup_tcp_handler(data);
4613 		return;
4614 	}
4615 
4616 	if ((data->nsd->tcp_query_count > 0 &&
4617 	     data->query_count >= data->nsd->tcp_query_count) ||
4618 	    (data->query_count > 0 && data->tcp_no_more_queries))
4619 	{
4620 		/* No more queries allowed on this tcp connection. */
4621 		cleanup_tcp_handler(data);
4622 		return;
4623 	}
4624 
4625 	assert((event & EV_READ));
4626 
4627 	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
4628 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4629 		data->query_needs_reset = 0;
4630 	}
4631 
4632 	if(data->shake_state != tls_hs_none) {
4633 		if(!tls_handshake(data, fd, 0))
4634 			return;
4635 		if(data->shake_state != tls_hs_none)
4636 			return;
4637 	}
4638 
4639 	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
4640 		struct pp2_header* header = NULL;
4641 		size_t want_read_size = 0;
4642 		size_t current_read_size = 0;
4643 		if(data->pp2_header_state == pp2_header_none) {
4644 			want_read_size = PP2_HEADER_SIZE;
4645 			if(buffer_remaining(data->query->packet) <
4646 				want_read_size) {
4647 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4648 				cleanup_tcp_handler(data);
4649 				return;
4650 			}
4651 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4652 			current_read_size = want_read_size;
4653 			if(data->bytes_transmitted < current_read_size) {
4654 				if(!more_read_buf_tls(fd, data,
4655 					buffer_at(data->query->packet,
4656 						data->bytes_transmitted),
4657 					current_read_size - data->bytes_transmitted,
4658 					&received))
4659 					return;
4660 				data->bytes_transmitted += received;
4661 				buffer_skip(data->query->packet, received);
4662 				if(data->bytes_transmitted != current_read_size)
4663 					return;
4664 				data->pp2_header_state = pp2_header_init;
4665 			}
4666 		}
4667 		if(data->pp2_header_state == pp2_header_init) {
4668 			int err;
4669 			err = pp2_read_header(buffer_begin(data->query->packet),
4670 				buffer_limit(data->query->packet));
4671 			if(err) {
4672 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
4673 				cleanup_tcp_handler(data);
4674 				return;
4675 			}
4676 			header = (struct pp2_header*)buffer_begin(data->query->packet);
4677 			want_read_size = ntohs(header->len);
4678 			if(buffer_limit(data->query->packet) <
4679 				PP2_HEADER_SIZE + want_read_size) {
4680 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4681 				cleanup_tcp_handler(data);
4682 				return;
4683 			}
4684 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4685 			current_read_size = PP2_HEADER_SIZE + want_read_size;
4686 			if(want_read_size == 0) {
4687 				/* nothing more to read; header is complete */
4688 				data->pp2_header_state = pp2_header_done;
4689 			} else if(data->bytes_transmitted < current_read_size) {
4690 				if(!more_read_buf_tls(fd, data,
4691 					buffer_at(data->query->packet,
4692 						data->bytes_transmitted),
4693 					current_read_size - data->bytes_transmitted,
4694 					&received))
4695 					return;
4696 				data->bytes_transmitted += received;
4697 				buffer_skip(data->query->packet, received);
4698 				if(data->bytes_transmitted != current_read_size)
4699 					return;
4700 				data->pp2_header_state = pp2_header_done;
4701 			}
4702 		}
4703 		if(data->pp2_header_state != pp2_header_done || !header) {
4704 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
4705 			cleanup_tcp_handler(data);
4706 			return;
4707 		}
4708 		buffer_flip(data->query->packet);
4709 		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
4710 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
4711 			cleanup_tcp_handler(data);
4712 			return;
4713 		}
4714 		/* Clear and reset the buffer to read the following
4715 		 * DNS packet(s). */
4716 		buffer_clear(data->query->packet);
4717 		data->bytes_transmitted = 0;
4718 	}
4719 	/*
4720 	 * Check if we received the leading packet length bytes yet.
4721 	 */
4722 	if(data->bytes_transmitted < sizeof(uint16_t)) {
4723 		if(!more_read_buf_tls(fd, data,
4724 		    (char *) &data->query->tcplen + data->bytes_transmitted,
4725 		    sizeof(uint16_t) - data->bytes_transmitted, &received))
4726 			return;
4727 		data->bytes_transmitted += received;
4728 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4729 			/*
4730 			 * Not done with the tcplen yet, wait for more
4731 			 * data to become available.
4732 			 */
4733 			return;
4734 		}
4735 
4736 		assert(data->bytes_transmitted == sizeof(uint16_t));
4737 
4738 		data->query->tcplen = ntohs(data->query->tcplen);
4739 
4740 		/*
4741 		 * Minimum query size is:
4742 		 *
4743 		 *     Size of the header (12)
4744 		 *   + Root domain name   (1)
4745 		 *   + Query class        (2)
4746 		 *   + Query type         (2)
4747 		 */
4748 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4749 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4750 			cleanup_tcp_handler(data);
4751 			return;
4752 		}
4753 
4754 		if (data->query->tcplen > data->query->maxlen) {
4755 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4756 			cleanup_tcp_handler(data);
4757 			return;
4758 		}
4759 
4760 		buffer_set_limit(data->query->packet, data->query->tcplen);
4761 	}
4762 
4763 	assert(buffer_remaining(data->query->packet) > 0);
4764 
4765 	/* Read the (remaining) query data.  */
4766 	if(!more_read_buf_tls(fd, data, buffer_current(data->query->packet),
4767 		buffer_remaining(data->query->packet), &received))
4768 		return;
4769 	data->bytes_transmitted += received;
4770 	buffer_skip(data->query->packet, received);
4771 	if (buffer_remaining(data->query->packet) > 0) {
4772 		/*
4773 		 * Message not yet complete, wait for more data to
4774 		 * become available.
4775 		 */
4776 		return;
4777 	}
4778 
4779 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4780 
4781 	/* Account... */
4782 #ifndef INET6
4783 	STATUP(data->nsd, ctls);
4784 #else
4785 	if (data->query->remote_addr.ss_family == AF_INET) {
4786 		STATUP(data->nsd, ctls);
4787 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4788 		STATUP(data->nsd, ctls6);
4789 	}
4790 #endif
4791 
4792 	/* We have a complete query, process it.  */
4793 
4794 	/* tcp-query-count: handle query counter ++ */
4795 	/* tcp-query-count: increment the per-connection query counter */
4796 
4797 	buffer_flip(data->query->packet);
4798 #ifdef USE_DNSTAP
4799 	/*
4800 	 * send the TCP query, with the server (local) address and the client address, to the dnstap process
4801 	 */
4802 	log_addr("query from client", &data->query->client_addr);
4803 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4804 	if(verbosity >= 6 && data->query->is_proxied)
4805 		log_addr("query via proxy", &data->query->remote_addr);
4806 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4807 		data->query->client_addrlen, data->query->tcp, data->query->packet);
4808 #endif /* USE_DNSTAP */
4809 	data->query_state = server_process_query(data->nsd, data->query, &now);
4810 	if (data->query_state == QUERY_DISCARDED) {
4811 		/* Drop the packet and the entire connection... */
4812 		STATUP(data->nsd, dropped);
4813 		ZTATUP(data->nsd, data->query->zone, dropped);
4814 		cleanup_tcp_handler(data);
4815 		return;
4816 	}
4817 
4818 #ifdef BIND8_STATS
4819 	if (RCODE(data->query->packet) == RCODE_OK
4820 	    && !AA(data->query->packet))
4821 	{
4822 		STATUP(data->nsd, nona);
4823 		ZTATUP(data->nsd, data->query->zone, nona);
4824 	}
4825 #endif /* BIND8_STATS */
4826 
4827 #ifdef USE_ZONE_STATS
4828 #ifndef INET6
4829 	ZTATUP(data->nsd, data->query->zone, ctls);
4830 #else
4831 	if (data->query->remote_addr.ss_family == AF_INET) {
4832 		ZTATUP(data->nsd, data->query->zone, ctls);
4833 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4834 		ZTATUP(data->nsd, data->query->zone, ctls6);
4835 	}
4836 #endif
4837 #endif /* USE_ZONE_STATS */
4838 
4839 	query_add_optional(data->query, data->nsd, &now);
4840 
4841 	/* Switch to the tcp write handler.  */
4842 	buffer_flip(data->query->packet);
4843 	data->query->tcplen = buffer_remaining(data->query->packet);
4844 #ifdef BIND8_STATS
4845 	/* Account the rcode & TC... */
4846 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4847 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4848 	if (TC(data->query->packet)) {
4849 		STATUP(data->nsd, truncated);
4850 		ZTATUP(data->nsd, data->query->zone, truncated);
4851 	}
4852 #endif /* BIND8_STATS */
4853 #ifdef USE_DNSTAP
4854 	/*
4855 	 * send the TCP response, with the server (local) address found earlier and the client address, to the dnstap process
4856 	 */
4857 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4858 	log_addr("response to client", &data->query->client_addr);
4859 	if(verbosity >= 6 && data->query->is_proxied)
4860 		log_addr("response via proxy", &data->query->remote_addr);
4861 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4862 		data->query->client_addrlen, data->query->tcp, data->query->packet,
4863 		data->query->zone);
4864 #endif /* USE_DNSTAP */
4865 	data->bytes_transmitted = 0;
4866 
4867 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4868 
4869 	/* see if we can write the answer right away (usually so; EAGAIN if not) */
4870 	handle_tls_writing(fd, EV_WRITE, data);
4871 }
4872 
4873 /** handle TLS writing of outgoing response */
4874 static void
4875 handle_tls_writing(int fd, short event, void* arg)
4876 {
4877 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4878 	ssize_t sent;
4879 	struct query *q = data->query;
4880 	/* static variable that holds reassembly buffer used to put the
4881 	 * TCP length in front of the packet, like writev. */
4882 	static buffer_type* global_tls_temp_buffer = NULL;
4883 	buffer_type* write_buffer;
4884 	uint32_t now = 0;
4885 
4886 	if ((event & EV_TIMEOUT)) {
4887 		/* Connection timed out.  */
4888 		cleanup_tcp_handler(data);
4889 		return;
4890 	}
4891 
4892 	assert((event & EV_WRITE));
4893 
4894 	if(data->shake_state != tls_hs_none) {
4895 		if(!tls_handshake(data, fd, 1))
4896 			return;
4897 		if(data->shake_state != tls_hs_none)
4898 			return;
4899 	}
4900 
4901 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
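	/* Partial write mode lets SSL_write() succeed after writing only
	 * part of the buffer, so a short write can be resumed from the
	 * advanced buffer position on a later event. */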
4902 
4903 	/* If we are writing the start of a message, we must include the length;
4904 	 * this is done with a copy into write_buffer. */
4905 	write_buffer = NULL;
4906 	if (data->bytes_transmitted == 0) {
4907 		if(!global_tls_temp_buffer) {
4908 			/* gets deallocated when nsd shuts down from
4909 			 * nsd.region */
4910 			global_tls_temp_buffer = buffer_create(nsd.region,
4911 				QIOBUFSZ + sizeof(q->tcplen));
4912 			if (!global_tls_temp_buffer) {
4913 				return;
4914 			}
4915 		}
4916 		write_buffer = global_tls_temp_buffer;
4917 		buffer_clear(write_buffer);
4918 		buffer_write_u16(write_buffer, q->tcplen);
4919 		buffer_write(write_buffer, buffer_current(q->packet),
4920 			(int)buffer_remaining(q->packet));
4921 		buffer_flip(write_buffer);
4922 	} else {
4923 		write_buffer = q->packet;
4924 	}
4925 
4926 	/* Write the response */
4927 	ERR_clear_error();
4928 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4929 	if(sent <= 0) {
4930 		int want = SSL_get_error(data->tls, sent);
4931 		if(want == SSL_ERROR_ZERO_RETURN) {
4932 			cleanup_tcp_handler(data);
4933 			/* closed */
4934 		} else if(want == SSL_ERROR_WANT_READ) {
4935 			/* switch back to reading */
4936 			data->shake_state = tls_hs_read_event;
4937 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4938 		} else if(want != SSL_ERROR_WANT_WRITE) {
4939 			cleanup_tcp_handler(data);
4940 			log_crypto_err("could not SSL_write");
4941 		}
4942 		return;
4943 	}
4944 
4945 	buffer_skip(write_buffer, sent);
4946 	if(buffer_remaining(write_buffer) != 0) {
4947 		/* If not all was sent, sync up the real buffer when the temporary buffer was used. */
4948 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4949 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4950 		}
4951 	}
4952 
4953 	data->bytes_transmitted += sent;
4954 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4955 		/*
4956 		 * Still more data to write when socket becomes
4957 		 * writable again.
4958 		 */
4959 		return;
4960 	}
4961 
4962 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4963 
4964 	if (data->query_state == QUERY_IN_AXFR ||
4965 		data->query_state == QUERY_IN_IXFR) {
4966 		/* Continue processing AXFR and writing back results.  */
4967 		buffer_clear(q->packet);
4968 		if(data->query_state == QUERY_IN_AXFR)
4969 			data->query_state = query_axfr(data->nsd, q, 0);
4970 		else data->query_state = query_ixfr(data->nsd, q);
4971 		if (data->query_state != QUERY_PROCESSED) {
4972 			query_add_optional(data->query, data->nsd, &now);
4973 
4974 			/* Reset data. */
4975 			buffer_flip(q->packet);
4976 			q->tcplen = buffer_remaining(q->packet);
4977 			data->bytes_transmitted = 0;
4978 			/* Reset to writing mode.  */
4979 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4980 
4981 			/*
4982 			 * Write data if/when the socket is writable
4983 			 * again.
4984 			 */
4985 			return;
4986 		}
4987 	}
4988 
4989 	/*
4990 	 * Done sending, wait for the next request to arrive on the
4991 	 * TCP socket by installing the TCP read handler.
4992 	 */
4993 	if ((data->nsd->tcp_query_count > 0 &&
4994 		data->query_count >= data->nsd->tcp_query_count) ||
4995 		data->tcp_no_more_queries) {
4996 
4997 		(void) shutdown(fd, SHUT_WR);
4998 	}
4999 
5000 	data->bytes_transmitted = 0;
5001 	data->query_needs_reset = 1;
5002 
5003 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
5004 }
5005 #endif
5006 
5007 static void
5008 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
5009 	void* ATTR_UNUSED(arg))
5010 {
5011 	if(slowaccept) {
5012 		configure_handler_event_types(EV_PERSIST | EV_READ);
5013 		slowaccept = 0;
5014 	}
5015 }
5016 
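/* accept() a connection and make the new socket non-blocking: with
 * accept4() this is one atomic call, otherwise accept() is followed by
 * an fcntl(). */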
5017 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
5018 {
5019 #ifndef HAVE_ACCEPT4
5020 	int s = accept(fd, addr, addrlen);
5021 	if (s != -1) {
5022 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
5023 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
5024 			close(s);
5025 			s = -1;
5026 			errno=EINTR; /* setting this errno suppresses the
5027 				error printout in the later code that checks
5028 				the result of this accept emulation */
5029 		}
5030 	}
5031 	return s;
5032 #else
5033 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
5034 #endif /* HAVE_ACCEPT4 */
5035 }
5036 
5037 /*
5038  * Handle an incoming TCP connection.  The connection is accepted and
5039  * a new TCP reader event handler is added.  The TCP handler
5040  * is responsible for cleanup when the connection is closed.
5041  */
5042 static void
5043 handle_tcp_accept(int fd, short event, void* arg)
5044 {
5045 	struct tcp_accept_handler_data *data
5046 		= (struct tcp_accept_handler_data *) arg;
5047 	int s;
5048 	int reject = 0;
5049 	struct tcp_handler_data *tcp_data;
5050 	region_type *tcp_region;
5051 #ifdef INET6
5052 	struct sockaddr_storage addr;
5053 #else
5054 	struct sockaddr_in addr;
5055 #endif
5056 	socklen_t addrlen;
5057 	struct timeval timeout;
5058 
5059 	if (!(event & EV_READ)) {
5060 		return;
5061 	}
5062 
5063 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
5064 		reject = data->nsd->options->tcp_reject_overflow;
5065 		if (!reject) {
5066 			return;
5067 		}
5068 	}
5069 
5070 	/* Accept it... */
5071 	addrlen = sizeof(addr);
5072 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
5073 	if (s == -1) {
5074 		/**
5075 		 * EMFILE and ENFILE signal that the limit of open
5076 		 * file descriptors has been reached. Pause accept().
5077 		 * EINTR is a signal interrupt. The others are various OS ways
5078 		 * of saying that the client has closed the connection.
5079 		 */
5080 		if (errno == EMFILE || errno == ENFILE) {
5081 			if (!slowaccept) {
5082 				/* disable accept events */
5083 				struct timeval tv;
5084 				configure_handler_event_types(0);
5085 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
5086 				tv.tv_usec = 0L;
5087 				memset(&slowaccept_event, 0,
5088 					sizeof(slowaccept_event));
5089 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
5090 					handle_slowaccept_timeout, NULL);
5091 				(void)event_base_set(data->event.ev_base,
5092 					&slowaccept_event);
5093 				(void)event_add(&slowaccept_event, &tv);
5094 				slowaccept = 1;
5095 				/* We don't want to spam the logs here */
5096 			}
5097 		} else if (errno != EINTR
5098 			&& errno != EWOULDBLOCK
5099 #ifdef ECONNABORTED
5100 			&& errno != ECONNABORTED
5101 #endif /* ECONNABORTED */
5102 #ifdef EPROTO
5103 			&& errno != EPROTO
5104 #endif /* EPROTO */
5105 			) {
5106 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
5107 		}
5108 		return;
5109 	}
5110 
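	/* The connection limit was exceeded and tcp-reject-overflow is
	 * set: tear the accepted connection down again right away. */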
5111 	if (reject) {
5112 		shutdown(s, SHUT_RDWR);
5113 		close(s);
5114 		return;
5115 	}
5116 
5117 	/*
5118 	 * This region is deallocated when the TCP connection is
5119 	 * closed by the TCP handler.
5120 	 */
5121 	tcp_region = region_create(xalloc, free);
5122 	tcp_data = (struct tcp_handler_data *) region_alloc(
5123 		tcp_region, sizeof(struct tcp_handler_data));
5124 	tcp_data->region = tcp_region;
5125 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
5126 		compression_table_size, compressed_dnames);
5127 	tcp_data->nsd = data->nsd;
5128 	tcp_data->query_count = 0;
5129 #ifdef HAVE_SSL
5130 	tcp_data->shake_state = tls_hs_none;
5131 	tcp_data->tls = NULL;
5132 #endif
5133 	tcp_data->query_needs_reset = 1;
5134 	tcp_data->pp2_enabled = data->pp2_enabled;
5135 	tcp_data->pp2_header_state = pp2_header_none;
5136 	tcp_data->prev = NULL;
5137 	tcp_data->next = NULL;
5138 
5139 	tcp_data->query_state = QUERY_PROCESSED;
5140 	tcp_data->bytes_transmitted = 0;
5141 	memcpy(&tcp_data->query->remote_addr, &addr, addrlen);
5142 	tcp_data->query->remote_addrlen = addrlen;
5143 	/* Copy remote_addr to client_addr; for stream connections
5144 	 * this is the simplest point and time to do so. */
5145 	memcpy(&tcp_data->query->client_addr, &addr, addrlen);
5146 	tcp_data->query->client_addrlen = addrlen;
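	/* not proxied until a PROXYv2 header, if enabled, is read later
	 * and replaces the client address */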
5147 	tcp_data->query->is_proxied = 0;
5148 
5149 	tcp_data->tcp_no_more_queries = 0;
5150 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
5151 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
5152 		/* very busy, give smaller timeout */
5153 		tcp_data->tcp_timeout = 200;
5154 	}
5155 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
5156 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
5157 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
5158 
5159 #ifdef USE_DNSTAP
5160 	/* save the address of the connection */
5161 	tcp_data->socket = data->socket;
5162 #endif /* USE_DNSTAP */
5163 
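	/* For a TLS socket, wrap the fd in an SSL object and start with
	 * the TLS read handler, which also drives the handshake;
	 * otherwise install the plain TCP read handler. */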
5164 #ifdef HAVE_SSL
5165 	if (data->tls_accept) {
5166 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
5167 		if(!tcp_data->tls) {
5168 			close(s);
5169 			return;
5170 		}
5171 		tcp_data->shake_state = tls_hs_read;
5172 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
5173 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
5174 			  handle_tls_reading, tcp_data);
5175 	} else {
5176 #endif
5177 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
5178 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
5179 			  handle_tcp_reading, tcp_data);
5180 #ifdef HAVE_SSL
5181 	}
5182 #endif
5183 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
5184 		log_msg(LOG_ERR, "cannot set tcp event base");
5185 		close(s);
5186 		region_destroy(tcp_region);
5187 		return;
5188 	}
5189 	if(event_add(&tcp_data->event, &timeout) != 0) {
5190 		log_msg(LOG_ERR, "cannot add tcp to event base");
5191 		close(s);
5192 		region_destroy(tcp_region);
5193 		return;
5194 	}
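	/* prepend to the doubly-linked list of active TCP handlers so
	 * all open connections can be walked later */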
5195 	if(tcp_active_list) {
5196 		tcp_active_list->prev = tcp_data;
5197 		tcp_data->next = tcp_active_list;
5198 	}
5199 	tcp_active_list = tcp_data;
5200 
5201 	/*
5202 	 * Keep track of the total number of TCP handlers installed so
5203 	 * we can stop accepting connections when the maximum number
5204 	 * of simultaneous TCP connections is reached.
5205 	 *
5206 	 * If tcp-reject-overflow is enabled, however, then we do not
5207 	 * change the handler event type; we keep it as-is and accept
5208 	 * overflow TCP connections only so that we can forcibly kill
5209 	 * them off.
5210 	 */
5211 	++data->nsd->current_tcp_count;
5212 	if (!data->nsd->options->tcp_reject_overflow &&
5213 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
5214 	{
5215 		configure_handler_event_types(0);
5216 	}
5217 }
5218 
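/*
 * Send a command over each child's IPC socket.  With a positive
 * timeout, block up to that many seconds for the child's reply,
 * then close the channel.
 */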
5219 static void
5220 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
5221 {
5222 	size_t i;
5223 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
5224 	for (i = 0; i < nsd->child_count; ++i) {
5225 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
5226 			if (write(nsd->children[i].child_fd,
5227 				&command,
5228 				sizeof(command)) == -1)
5229 			{
5230 				if(errno != EAGAIN && errno != EINTR)
5231 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
5232 					(int) command,
5233 					(int) nsd->children[i].pid,
5234 					strerror(errno));
5235 			} else if (timeout > 0) {
5236 				(void)block_read(NULL,
5237 					nsd->children[i].child_fd,
5238 					&command, sizeof(command), timeout);
5239 			}
5240 			fsync(nsd->children[i].child_fd);
5241 			close(nsd->children[i].child_fd);
5242 			nsd->children[i].child_fd = -1;
5243 		}
5244 	}
5245 }
5246 
5247 static void
5248 send_children_quit(struct nsd* nsd)
5249 {
5250 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
5251 	send_children_command(nsd, NSD_QUIT, 0);
5252 }
5253 
5254 static void
5255 send_children_quit_and_wait(struct nsd* nsd)
5256 {
5257 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
5258 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
5259 }
5260 
5261 #ifdef BIND8_STATS
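/*
 * Flag every child so that the next write pass of the parent's
 * event loop sends it a STATS command.
 */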
5262 static void
5263 set_children_stats(struct nsd* nsd)
5264 {
5265 	size_t i;
5266 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
5267 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
5268 	for (i = 0; i < nsd->child_count; ++i) {
5269 		nsd->children[i].need_to_send_STATS = 1;
5270 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
5271 	}
5272 }
5273 #endif /* BIND8_STATS */
5274 
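/*
 * Switch the TCP accept handlers on or off.  A non-zero event mask
 * re-registers every accept handler with those events; a zero mask
 * removes them so no new TCP connections are accepted.
 */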
5275 static void
5276 configure_handler_event_types(short event_types)
5277 {
5278 	size_t i;
5279 
5280 	for (i = 0; i < tcp_accept_handler_count; ++i) {
5281 		struct event* handler = &tcp_accept_handlers[i].event;
5282 		if(event_types) {
5283 			/* reassign */
5284 			int fd = handler->ev_fd;
5285 			struct event_base* base = handler->ev_base;
5286 			if(tcp_accept_handlers[i].event_added)
5287 				event_del(handler);
5288 			memset(handler, 0, sizeof(*handler));
5289 			event_set(handler, fd, event_types,
5290 				handle_tcp_accept, &tcp_accept_handlers[i]);
5291 			if(event_base_set(base, handler) != 0)
5292 				log_msg(LOG_ERR, "conhand: cannot set event base");
5293 			if(event_add(handler, NULL) != 0)
5294 				log_msg(LOG_ERR, "conhand: cannot add event");
5295 			tcp_accept_handlers[i].event_added = 1;
5296 		} else {
5297 			/* remove */
5298 			if(tcp_accept_handlers[i].event_added) {
5299 				event_del(handler);
5300 				tcp_accept_handlers[i].event_added = 0;
5301 			}
5302 		}
5303 	}
5304 }
5305