xref: /openbsd/usr.sbin/nsd/server.c (revision bf87c3c0)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <limits.h>
15 #include <sys/socket.h>
16 #include <sys/uio.h>
17 #include <sys/wait.h>
18 
19 #include <netinet/in.h>
20 #ifdef USE_TCP_FASTOPEN
21   #include <netinet/tcp.h>
22 #endif
23 #include <arpa/inet.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <signal.h>
36 #include <netdb.h>
37 #include <poll.h>
38 #ifdef HAVE_SYS_RANDOM_H
39 #include <sys/random.h>
40 #endif
41 #ifndef SHUT_WR
42 #define SHUT_WR 1
43 #endif
44 #ifdef HAVE_MMAP
45 #include <sys/mman.h>
46 #endif /* HAVE_MMAP */
47 #ifdef HAVE_OPENSSL_RAND_H
48 #include <openssl/rand.h>
49 #endif
50 #ifdef HAVE_OPENSSL_SSL_H
51 #include <openssl/ssl.h>
52 #endif
53 #ifdef HAVE_OPENSSL_ERR_H
54 #include <openssl/err.h>
55 #endif
56 #ifdef HAVE_OPENSSL_OCSP_H
57 #include <openssl/ocsp.h>
58 #endif
59 #ifndef USE_MINI_EVENT
60 #  ifdef HAVE_EVENT_H
61 #    include <event.h>
62 #  else
63 #    include <event2/event.h>
64 #    include "event2/event_struct.h"
65 #    include "event2/event_compat.h"
66 #  endif
67 #else
68 #  include "mini_event.h"
69 #endif
70 
71 #include "axfr.h"
72 #include "namedb.h"
73 #include "netio.h"
74 #include "xfrd.h"
75 #include "xfrd-tcp.h"
76 #include "xfrd-disk.h"
77 #include "difffile.h"
78 #include "nsec3.h"
79 #include "ipc.h"
80 #include "udb.h"
81 #include "remote.h"
82 #include "lookup3.h"
83 #include "rrl.h"
84 #include "ixfr.h"
85 #ifdef USE_DNSTAP
86 #include "dnstap/dnstap_collector.h"
87 #endif
88 #include "verify.h"
89 #include "util/proxy_protocol.h"
90 
91 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
92 
93 #ifdef USE_DNSTAP
94 /*
95  * log_addr() - print the content of a sockaddr_in/sockaddr_in6 structure,
96  * just as it is done in Unbound, via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
97  */
98 static void
99 log_addr(const char* descr,
100 #ifdef INET6
101 	struct sockaddr_storage* addr
102 #else
103 	struct sockaddr_in* addr
104 #endif
105 	)
106 {
107 	char str_buf[64];
108 	if(verbosity < 6)
109 		return;
110 	if(
111 #ifdef INET6
112 		addr->ss_family == AF_INET
113 #else
114 		addr->sin_family == AF_INET
115 #endif
116 		) {
117 		struct sockaddr_in* s = (struct sockaddr_in*)addr;
118 		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
119 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
120 #ifdef INET6
121 	} else {
122 		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
123 		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
124 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
125 #endif
126 	}
127 }
128 #endif /* USE_DNSTAP */
129 
130 #ifdef USE_TCP_FASTOPEN
131   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
132   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
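  /* On Linux the tcp_fastopen sysctl is a bitmask: 0x1 enables
   * client-side TFO and 0x2 (the server bit checked below) enables
   * server-side TFO, hence the suggested values 2 and 3 in the
   * warnings printed by report_tcp_fastopen_config(). */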
133 #endif
134 
135 /* header state for the PROXYv2 header (for TCP) */
136 enum pp2_header_state {
137 	/* no header encountered yet */
138 	pp2_header_none = 0,
139 	/* read the static part of the header */
140 	pp2_header_init,
141 	/* read the full header */
142 	pp2_header_done
143 };
144 
145 /*
146  * Data for the UDP handlers.
147  */
148 struct udp_handler_data
149 {
150 	struct nsd        *nsd;
151 	struct nsd_socket *socket;
152 	struct event       event;
153 	/* if set, PROXYv2 is expected on this connection */
154 	int pp2_enabled;
155 };
156 
157 struct tcp_accept_handler_data {
158 	struct nsd        *nsd;
159 	struct nsd_socket *socket;
160 	int                event_added;
161 	struct event       event;
162 #ifdef HAVE_SSL
163 	/* handler accepts TLS connections on the dedicated port */
164 	int                tls_accept;
165 #endif
166 	/* if set, PROXYv2 is expected on this connection */
167 	int pp2_enabled;
168 };
169 
170 /*
171  * These globals are used to enable the TCP accept handlers
172  * when the number of TCP connections drops below the maximum
173  * number of TCP connections.
174  */
175 static size_t tcp_accept_handler_count;
176 static struct tcp_accept_handler_data *tcp_accept_handlers;
177 
178 static struct event slowaccept_event;
179 static int slowaccept;
180 
181 #ifdef HAVE_SSL
182 static unsigned char *ocspdata = NULL;
183 static long ocspdata_len = 0;
184 #endif
185 
186 #ifdef NONBLOCKING_IS_BROKEN
187 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
188    read multiple times from a socket when reported ready by select. */
189 # define NUM_RECV_PER_SELECT (1)
190 #else /* !NONBLOCKING_IS_BROKEN */
191 # define NUM_RECV_PER_SELECT (100)
192 #endif /* NONBLOCKING_IS_BROKEN */
193 
194 #ifndef HAVE_MMSGHDR
195 struct mmsghdr {
196 	struct msghdr msg_hdr;
197 	unsigned int  msg_len;
198 };
199 #endif
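/* These batches let the UDP handlers receive and answer up to
 * NUM_RECV_PER_SELECT datagrams per wakeup, using recvmmsg()/sendmmsg()
 * where available; the struct mmsghdr fallback above mirrors the Linux
 * definition for platforms that lack it. */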
200 
201 static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
202 static struct iovec iovecs[NUM_RECV_PER_SELECT];
203 static struct query *queries[NUM_RECV_PER_SELECT];
204 
205 /*
206  * Data for the TCP connection handlers.
207  *
208  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
209  * blocking the entire server on a slow TCP connection, but does make
210  * reading from and writing to the socket more complicated.
211  *
212  * Basically, whenever a read/write would block (indicated by the
213  * EAGAIN errno variable) we remember the position we were reading
214  * from/writing to and return from the TCP reading/writing event
215  * handler.  When the socket becomes readable/writable again we
216  * continue from the same position.
217  */
218 struct tcp_handler_data
219 {
220 	/*
221 	 * The region used to allocate all TCP connection related
222 	 * data, including this structure.  This region is destroyed
223 	 * when the connection is closed.
224 	 */
225 	region_type*		region;
226 
227 	/*
228 	 * The global nsd structure.
229 	 */
230 	struct nsd*			nsd;
231 
232 	/*
233 	 * The current query data for this TCP connection.
234 	 */
235 	query_type*			query;
236 
237 	/*
238 	 * The query_state is used to remember if we are performing an
239 	 * AXFR, if we're done processing, or if we should discard the
240 	 * query and connection.
241 	 */
242 	query_state_type	query_state;
243 
244 	/*
245 	 * The event for the file descriptor and tcp timeout
246 	 */
247 	struct event event;
248 
249 	/*
250 	 * The bytes_transmitted field is used to remember the number
251 	 * of bytes transmitted when receiving or sending a DNS
252 	 * packet.  The count includes the two additional bytes used
253 	 * to specify the packet length on a TCP connection.
254 	 */
255 	size_t				bytes_transmitted;
256 
257 	/* If the query is restarted and needs a reset */
258 	int query_needs_reset;
259 
260 	/*
261 	 * The number of queries handled by this specific TCP connection.
262 	 */
263 	int					query_count;
264 
265 	/*
266 	 * The timeout in msec for this tcp connection
267 	 */
268 	int	tcp_timeout;
269 
270 	/*
271 	 * If the connection is allowed to have further queries on it.
272 	 */
273 	int tcp_no_more_queries;
274 
275 #ifdef USE_DNSTAP
276 	/* the accept socket, used to find the proper service (local) address this connection's socket is bound to. */
277 	struct nsd_socket *socket;
278 #endif /* USE_DNSTAP */
279 
280 	/* if set, PROXYv2 is expected on this connection */
281 	int pp2_enabled;
282 
283 	/* header state for the PROXYv2 header (for TCP) */
284 	enum pp2_header_state pp2_header_state;
285 
286 #ifdef HAVE_SSL
287 	/*
288 	 * TLS object.
289 	 */
290 	SSL* tls;
291 
292 	/*
293 	 * TLS handshake state.
294 	 */
295 	enum { tls_hs_none, tls_hs_read, tls_hs_write,
296 		tls_hs_read_event, tls_hs_write_event } shake_state;
297 #endif
298 	/* list of connections, for service of remaining tcp channels */
299 	struct tcp_handler_data *prev, *next;
300 };
301 /* global that is the list of active tcp channels */
302 static struct tcp_handler_data *tcp_active_list = NULL;
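/* Accepted connections are linked into this list through the prev/next
 * pointers in tcp_handler_data above, so that the remaining tcp
 * channels can be found and serviced later. */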
303 
304 /*
305  * Handle incoming queries on the UDP server sockets.
306  */
307 static void handle_udp(int fd, short event, void* arg);
308 
309 /*
310  * Handle incoming connections on the TCP sockets.  These handlers
311  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
312  * connection) but are disabled when the number of current TCP
313  * connections is equal to the maximum number of TCP connections.
314  * Disabling is done by changing the handler to wait for the
315  * NETIO_EVENT_NONE type.  This is done using the function
316  * configure_tcp_accept_handlers.
317  */
318 static void handle_tcp_accept(int fd, short event, void* arg);
319 
320 /*
321  * Handle incoming queries on a TCP connection.  The TCP connections
322  * are configured to be non-blocking and the handler may be called
323  * multiple times before a complete query is received.
324  */
325 static void handle_tcp_reading(int fd, short event, void* arg);
326 
327 /*
328  * Handle outgoing responses on a TCP connection.  The TCP connections
329  * are configured to be non-blocking and the handler may be called
330  * multiple times before a complete response is sent.
331  */
332 static void handle_tcp_writing(int fd, short event, void* arg);
333 
334 #ifdef HAVE_SSL
335 /* Create SSL object and associate fd */
336 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
337 /*
338  * Handle TLS handshake. May be called multiple times if incomplete.
339  */
340 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
341 
342 /*
343  * Handle incoming queries on a TLS over TCP connection.  The TLS
344  * connections are configured to be non-blocking and the handler may
345  * be called multiple times before a complete query is received.
346  */
347 static void handle_tls_reading(int fd, short event, void* arg);
348 
349 /*
350  * Handle outgoing responses on a TLS over TCP connection.  The TLS
351  * connections are configured to be non-blocking and the handler may
352  * be called multiple times before a complete response is sent.
353  */
354 static void handle_tls_writing(int fd, short event, void* arg);
355 #endif
356 
357 /*
358  * Send all children the quit command (nonblocking), then close the pipes.
359  */
360 static void send_children_quit(struct nsd* nsd);
361 /* same, for shutdown time, waits for child to exit to avoid restart issues */
362 static void send_children_quit_and_wait(struct nsd* nsd);
363 
364 /* set children's flags to send NSD_STATS to them */
365 #ifdef BIND8_STATS
366 static void set_children_stats(struct nsd* nsd);
367 #endif /* BIND8_STATS */
368 
369 /*
370  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
371  */
372 static void configure_handler_event_types(short event_types);
373 
374 static uint16_t *compressed_dname_offsets = 0;
375 static uint32_t compression_table_capacity = 0;
376 static uint32_t compression_table_size = 0;
377 static domain_type* compressed_dnames[MAXRRSPP];
378 
379 #ifdef USE_TCP_FASTOPEN
380 /* Checks to see if the kernel value must be manually changed in order for
381    TCP Fast Open to support server mode */
382 static void report_tcp_fastopen_config() {
383 
384 	int tcp_fastopen_fp;
385 	uint8_t tcp_fastopen_value;
386 
387 	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
388 		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return;
389 	}
390 	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
391 		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
392 		close(tcp_fastopen_fp);
		return;
393 	}
394 	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
395 		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
396 		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
397 		log_msg(LOG_WARNING, "To enable TFO use the command:");
398 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
399 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
400 		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
401 		close(tcp_fastopen_fp);
		return;
402 	}
403 	close(tcp_fastopen_fp);
404 }
405 #endif
406 
407 /*
408  * Remove the specified pid from the list of child pids.  Returns -1 if
409  * the pid is not in the list, the child number otherwise.  The pid field is set to 0.
410  */
411 static int
412 delete_child_pid(struct nsd *nsd, pid_t pid)
413 {
414 	size_t i;
415 	for (i = 0; i < nsd->child_count; ++i) {
416 		if (nsd->children[i].pid == pid) {
417 			nsd->children[i].pid = 0;
418 			if(!nsd->children[i].need_to_exit) {
419 				if(nsd->children[i].child_fd != -1)
420 					close(nsd->children[i].child_fd);
421 				nsd->children[i].child_fd = -1;
422 				if(nsd->children[i].handler)
423 					nsd->children[i].handler->fd = -1;
424 			}
425 			return i;
426 		}
427 	}
428 	return -1;
429 }
430 
431 /*
432  * Restart child servers if necessary.
433  */
434 static int
435 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
436 	int* xfrd_sock_p)
437 {
438 	struct main_ipc_handler_data *ipc_data;
439 	size_t i;
440 	int sv[2];
441 
442 	/* Fork the child processes... */
443 	for (i = 0; i < nsd->child_count; ++i) {
444 		if (nsd->children[i].pid <= 0) {
445 			if (nsd->children[i].child_fd != -1)
446 				close(nsd->children[i].child_fd);
447 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
448 				log_msg(LOG_ERR, "socketpair: %s",
449 					strerror(errno));
450 				return -1;
451 			}
452 			nsd->children[i].child_fd = sv[0];
453 			nsd->children[i].parent_fd = sv[1];
454 			nsd->children[i].pid = fork();
455 			switch (nsd->children[i].pid) {
456 			default: /* SERVER MAIN */
457 				close(nsd->children[i].parent_fd);
458 				nsd->children[i].parent_fd = -1;
459 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
460 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
461 				}
462 				if(!nsd->children[i].handler)
463 				{
464 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
465 						region, sizeof(struct main_ipc_handler_data));
466 					ipc_data->nsd = nsd;
467 					ipc_data->child = &nsd->children[i];
468 					ipc_data->child_num = i;
469 					ipc_data->xfrd_sock = xfrd_sock_p;
470 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
471 					ipc_data->forward_mode = 0;
472 					ipc_data->got_bytes = 0;
473 					ipc_data->total_bytes = 0;
474 					ipc_data->acl_num = 0;
475 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
476 						region, sizeof(struct netio_handler));
477 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
478 					nsd->children[i].handler->timeout = NULL;
479 					nsd->children[i].handler->user_data = ipc_data;
480 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
481 					nsd->children[i].handler->event_handler = parent_handle_child_command;
482 					netio_add_handler(netio, nsd->children[i].handler);
483 				}
484 				/* clear any ongoing ipc */
485 				ipc_data = (struct main_ipc_handler_data*)
486 					nsd->children[i].handler->user_data;
487 				ipc_data->forward_mode = 0;
488 				/* restart - update fd */
489 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
490 				break;
491 			case 0: /* CHILD */
492 #ifdef MEMCLEAN /* OS collects memory pages */
493 				region_destroy(region);
494 #endif
495 
496 				if (pledge("stdio rpath inet", NULL) == -1) {
497 					log_msg(LOG_ERR, "pledge");
498 					exit(1);
499 				}
500 
501 				nsd->pid = 0;
502 				nsd->child_count = 0;
503 				nsd->server_kind = nsd->children[i].kind;
504 				nsd->this_child = &nsd->children[i];
505 				nsd->this_child->child_num = i;
506 				/* remove signal flags inherited from parent
507 				   the parent will handle them. */
508 				nsd->signal_hint_reload_hup = 0;
509 				nsd->signal_hint_reload = 0;
510 				nsd->signal_hint_child = 0;
511 				nsd->signal_hint_quit = 0;
512 				nsd->signal_hint_shutdown = 0;
513 				nsd->signal_hint_stats = 0;
514 				nsd->signal_hint_statsusr = 0;
515 				close(*xfrd_sock_p);
516 				close(nsd->this_child->child_fd);
517 				nsd->this_child->child_fd = -1;
518 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
519 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
520 				}
521 				server_child(nsd);
522 				/* NOTREACHED */
523 				exit(0);
524 			case -1:
525 				log_msg(LOG_ERR, "fork failed: %s",
526 					strerror(errno));
527 				return -1;
528 			}
529 		}
530 	}
531 	return 0;
532 }
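/* To summarize the IPC wiring above: each child gets a socketpair of
 * which the parent keeps sv[0] as child_fd, watched by a netio handler
 * that runs parent_handle_child_command, while the child keeps sv[1]
 * as parent_fd; both ends are set non-blocking. */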
533 
534 #ifdef BIND8_STATS
535 static void set_bind8_alarm(struct nsd* nsd)
536 {
537 	/* resync so that the next alarm is on the next whole minute */
538 	if(nsd->st_period > 0) /* % by 0 gives divbyzero error */
539 		alarm(nsd->st_period - (time(NULL) % nsd->st_period));
540 }
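/* For example, with st_period set to 60 and time(NULL) % 60 == 23,
 * set_bind8_alarm() calls alarm(37), so the alarm fires at the start
 * of the next whole minute. */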
541 #endif
542 
543 /* set zone stat ids for zones initially read in */
544 static void
545 zonestatid_tree_set(struct nsd* nsd)
546 {
547 	struct radnode* n;
548 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
549 		zone_type* zone = (zone_type*)n->elem;
550 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
551 	}
552 }
553 
554 #ifdef USE_ZONE_STATS
555 void
556 server_zonestat_alloc(struct nsd* nsd)
557 {
558 	size_t num = (nsd->options->zonestatnames->count==0?1:
559 			nsd->options->zonestatnames->count);
560 	size_t sz = sizeof(struct nsdst)*num;
561 	char tmpfile[256];
562 	uint8_t z = 0;
563 
564 	/* file names */
565 	nsd->zonestatfname[0] = 0;
566 	nsd->zonestatfname[1] = 0;
567 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
568 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
569 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
570 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
571 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
572 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
573 
574 	/* file descriptors */
575 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
576 	if(nsd->zonestatfd[0] == -1) {
577 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
578 			strerror(errno));
579 		exit(1);
580 	}
581 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
582 	if(nsd->zonestatfd[1] == -1) {
583 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
584 			strerror(errno));
585 		close(nsd->zonestatfd[0]);
586 		unlink(nsd->zonestatfname[0]);
587 		exit(1);
588 	}
589 
590 #ifdef HAVE_MMAP
591 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
592 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
593 			strerror(errno));
594 		exit(1);
595 	}
596 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
597 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
598 			nsd->zonestatfname[0], strerror(errno));
599 		exit(1);
600 	}
601 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
602 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
603 			strerror(errno));
604 		exit(1);
605 	}
606 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
607 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
608 			nsd->zonestatfname[1], strerror(errno));
609 		exit(1);
610 	}
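	/* The lseek() to sz-1 plus the one-byte write above grow each file
	 * to sz bytes before it is mapped; accessing a mapping beyond the
	 * end of the underlying file would fault, so the file must be
	 * extended first. */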
611 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
612 		MAP_SHARED, nsd->zonestatfd[0], 0);
613 	if(nsd->zonestat[0] == MAP_FAILED) {
614 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
615 		unlink(nsd->zonestatfname[0]);
616 		unlink(nsd->zonestatfname[1]);
617 		exit(1);
618 	}
619 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
620 		MAP_SHARED, nsd->zonestatfd[1], 0);
621 	if(nsd->zonestat[1] == MAP_FAILED) {
622 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
623 		unlink(nsd->zonestatfname[0]);
624 		unlink(nsd->zonestatfname[1]);
625 		exit(1);
626 	}
627 	memset(nsd->zonestat[0], 0, sz);
628 	memset(nsd->zonestat[1], 0, sz);
629 	nsd->zonestatsize[0] = num;
630 	nsd->zonestatsize[1] = num;
631 	nsd->zonestatdesired = num;
632 	nsd->zonestatsizenow = num;
633 	nsd->zonestatnow = nsd->zonestat[0];
634 #endif /* HAVE_MMAP */
635 }
636 
637 void
638 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
639 {
640 #ifdef HAVE_MMAP
641 #ifdef MREMAP_MAYMOVE
642 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
643 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
644 		MREMAP_MAYMOVE);
645 	if(nsd->zonestat[idx] == MAP_FAILED) {
646 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
647 		exit(1);
648 	}
649 #else /* !MREMAP_MAYMOVE */
650 	if(msync(nsd->zonestat[idx],
651 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
652 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
653 	if(munmap(nsd->zonestat[idx],
654 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
655 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
656 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
657 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
658 	if(nsd->zonestat[idx] == MAP_FAILED) {
659 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
660 		exit(1);
661 	}
662 #endif /* MREMAP_MAYMOVE */
663 #endif /* HAVE_MMAP */
664 }
665 
666 /* realloc the zonestat array for the one that is not currently in use,
667  * to match the desired new size of the array (if applicable) */
668 void
669 server_zonestat_realloc(struct nsd* nsd)
670 {
671 #ifdef HAVE_MMAP
672 	uint8_t z = 0;
673 	size_t sz;
674 	int idx = 0; /* index of the zonestat array that is not in use */
675 	if(nsd->zonestatnow == nsd->zonestat[0])
676 		idx = 1;
677 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
678 		return;
679 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
680 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
681 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
682 			strerror(errno));
683 		exit(1);
684 	}
685 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
686 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
687 			nsd->zonestatfname[idx], strerror(errno));
688 		exit(1);
689 	}
690 	zonestat_remap(nsd, idx, sz);
691 	/* zero the newly allocated region */
692 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
693 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
694 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
695 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
696 	}
697 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
698 #endif /* HAVE_MMAP */
699 }
700 
701 /* switch over to the other array for the new children, which
702  * briefly coexist with the old children; this avoids both sets of
703  * children writing to the same statistics array. */
704 void
705 server_zonestat_switch(struct nsd* nsd)
706 {
707 	if(nsd->zonestatnow == nsd->zonestat[0]) {
708 		nsd->zonestatnow = nsd->zonestat[1];
709 		nsd->zonestatsizenow = nsd->zonestatsize[1];
710 	} else {
711 		nsd->zonestatnow = nsd->zonestat[0];
712 		nsd->zonestatsizenow = nsd->zonestatsize[0];
713 	}
714 }
715 #endif /* USE_ZONE_STATS */
716 
717 #ifdef BIND8_STATS
718 void
719 server_stat_alloc(struct nsd* nsd)
720 {
721 	char tmpfile[256];
722 	size_t sz = sizeof(struct nsdst) * nsd->child_count * 2;
723 	uint8_t z = 0;
724 
725 	/* file name */
726 	nsd->statfname = 0;
727 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.stat",
728 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
729 	nsd->statfname = region_strdup(nsd->region, tmpfile);
730 
731 	/* file descriptor */
732 	nsd->statfd = open(nsd->statfname, O_CREAT|O_RDWR, 0600);
733 	if(nsd->statfd == -1) {
734 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->statfname,
735 			strerror(errno));
736 		unlink(nsd->zonestatfname[0]);
737 		unlink(nsd->zonestatfname[1]);
738 		exit(1);
739 	}
740 
741 #ifdef HAVE_MMAP
742 	if(lseek(nsd->statfd, (off_t)sz-1, SEEK_SET) == -1) {
743 		log_msg(LOG_ERR, "lseek %s: %s", nsd->statfname,
744 			strerror(errno));
745 		goto fail_exit;
746 	}
747 	if(write(nsd->statfd, &z, 1) == -1) {
748 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
749 			nsd->statfname, strerror(errno));
750 		goto fail_exit;
751 	}
752 	nsd->stat_map = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
753 		MAP_SHARED, nsd->statfd, 0);
754 	if(nsd->stat_map == MAP_FAILED) {
755 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
756 fail_exit:
757 		close(nsd->statfd);
758 		unlink(nsd->statfname);
759 		unlink(nsd->zonestatfname[0]);
760 		unlink(nsd->zonestatfname[1]);
761 		exit(1);
762 	}
763 	memset(nsd->stat_map, 0, sz);
764 	nsd->stats_per_child[0] = nsd->stat_map;
765 	nsd->stats_per_child[1] = &nsd->stat_map[nsd->child_count];
766 	nsd->stat_current = 0;
767 	nsd->st = &nsd->stats_per_child[nsd->stat_current][0];
768 #endif /* HAVE_MMAP */
769 }
770 #endif /* BIND8_STATS */
771 
772 #ifdef BIND8_STATS
773 void
774 server_stat_free(struct nsd* nsd)
775 {
776 	unlink(nsd->statfname);
777 }
778 #endif /* BIND8_STATS */
779 
780 static void
781 cleanup_dname_compression_tables(void *ptr)
782 {
783 	free(ptr);
784 	compressed_dname_offsets = NULL;
785 	compression_table_capacity = 0;
786 }
787 
788 static void
789 initialize_dname_compression_tables(struct nsd *nsd)
790 {
791 	size_t needed = domain_table_count(nsd->db->domains) + 1;
792 	needed += EXTRA_DOMAIN_NUMBERS;
793 	if(compression_table_capacity < needed) {
794 		if(compressed_dname_offsets) {
795 			region_remove_cleanup(nsd->db->region,
796 				cleanup_dname_compression_tables,
797 				compressed_dname_offsets);
798 			free(compressed_dname_offsets);
799 		}
800 		compressed_dname_offsets = (uint16_t *) xmallocarray(
801 			needed, sizeof(uint16_t));
802 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
803 			compressed_dname_offsets);
804 		compression_table_capacity = needed;
805 		compression_table_size=domain_table_count(nsd->db->domains)+1;
806 	}
807 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
808 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
809 }
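/* The offsets table set up above backs DNS name compression (RFC 1035
 * section 4.1.4): indexed by domain number, an entry records the packet
 * offset where that name was already written, so later occurrences can
 * be encoded as a pointer; slot 0 holds QHEADERSZ, the offset of the
 * query name. */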
810 
811 static int
812 set_cloexec(struct nsd_socket *sock)
813 {
814 	assert(sock != NULL);
815 
816 	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
817 		const char *socktype =
818 			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
819 		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
820 			socktype, strerror(errno));
821 		return -1;
822 	}
823 
824 	return 1;
825 }
826 
827 static int
828 set_reuseport(struct nsd_socket *sock)
829 {
830 #ifdef SO_REUSEPORT
831 	int on = 1;
832 #ifdef SO_REUSEPORT_LB
833 	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
834 	 * SO_REUSEPORT on Linux, which is what users of the nsd.conf option
835 	 * want. If they actually need local address and port reuse, they
836 	 * will have to set SO_REUSEPORT themselves as well, so assume it is
837 	 * _LB they want.
838 	 */
839 	int opt = SO_REUSEPORT_LB;
840 	static const char optname[] = "SO_REUSEPORT_LB";
841 #else /* !SO_REUSEPORT_LB */
842 	int opt = SO_REUSEPORT;
843 	static const char optname[] = "SO_REUSEPORT";
844 #endif /* SO_REUSEPORT_LB */
845 
846 	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
847 		return 1;
848 	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
849 		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
850 			optname, strerror(errno));
851 	}
852 	return -1;
853 #else
854 	(void)sock;
855 #endif /* SO_REUSEPORT */
856 
857 	return 0;
858 }
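/* Note the return convention shared by the set_* helpers here: 1 means
 * the option was set, 0 means it is unsupported or not applicable on
 * this platform, and -1 means setting it failed. */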
859 
860 static int
861 set_reuseaddr(struct nsd_socket *sock)
862 {
863 #ifdef SO_REUSEADDR
864 	int on = 1;
865 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
866 		return 1;
867 	}
868 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
869 		strerror(errno));
870 	return -1;
871 #endif /* SO_REUSEADDR */
872 	return 0;
873 }
874 
875 static int
876 set_rcvbuf(struct nsd_socket *sock, int rcv)
877 {
878 #ifdef SO_RCVBUF
879 #ifdef SO_RCVBUFFORCE
880 	if(0 == setsockopt(
881 		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
882 	{
883 		return 1;
884 	}
885 	if(errno == EPERM || errno == ENOBUFS) {
886 		return 0;
887 	}
888 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
889 		strerror(errno));
890 	return -1;
891 #else /* !SO_RCVBUFFORCE */
892 	if (0 == setsockopt(
893 		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
894 	{
895 		return 1;
896 	}
897 	if(errno == ENOSYS || errno == ENOBUFS) {
898 		return 0;
899 	}
900 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
901 		strerror(errno));
902 	return -1;
903 #endif /* SO_RCVBUFFORCE */
904 #endif /* SO_RCVBUF */
905 
906 	return 0;
907 }
908 
909 static int
910 set_sndbuf(struct nsd_socket *sock, int snd)
911 {
912 #ifdef SO_SNDBUF
913 #ifdef SO_SNDBUFFORCE
914 	if(0 == setsockopt(
915 		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
916 	{
917 		return 1;
918 	}
919 	if(errno == EPERM || errno == ENOBUFS) {
920 		return 0;
921 	}
922 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
923 		strerror(errno));
924 	return -1;
925 #else /* !SO_SNDBUFFORCE */
926 	if(0 == setsockopt(
927 		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
928 	{
929 		return 1;
930 	}
931 	if(errno == ENOSYS || errno == ENOBUFS) {
932 		return 0;
933 	}
934 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
935 		strerror(errno));
936 	return -1;
937 #endif /* SO_SNDBUFFORCE */
938 #endif /* SO_SNDBUF */
939 
940 	return 0;
941 }
942 
943 static int
944 set_nonblock(struct nsd_socket *sock)
945 {
946 	const char *socktype =
947 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
948 
949 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
950 		log_msg(LOG_ERR, "fcntl(..., O_NONBLOCK) failed for %s: %s",
951 			socktype, strerror(errno));
952 		return -1;
953 	}
954 
955 	return 1;
956 }
957 
958 #ifdef INET6
959 static int
960 set_ipv6_v6only(struct nsd_socket *sock)
961 {
962 #ifdef IPV6_V6ONLY
963 	int on = 1;
964 	const char *socktype =
965 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
966 
967 	if(0 == setsockopt(
968 		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
969 	{
970 		return 1;
971 	}
972 
973 	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
974 		socktype, strerror(errno));
975 	return -1;
976 #else
977 	(void)sock;
978 #endif /* IPV6_V6ONLY */
979 
980 	return 0;
981 }
982 #endif /* INET6 */
983 
984 #ifdef INET6
985 static int
986 set_ipv6_use_min_mtu(struct nsd_socket *sock)
987 {
988 #if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
989 #if defined(IPV6_USE_MIN_MTU)
990 	/* There is no fragmentation of IPv6 datagrams during forwarding in the
991 	 * network. Therefore we do not send UDP datagrams larger than the
992 	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
993 	 * larger if the network stack supports IPV6_USE_MIN_MTU.
994 	 */
995 	int opt = IPV6_USE_MIN_MTU;
996 	int optval = 1;
997 	static const char optname[] = "IPV6_USE_MIN_MTU";
998 #elif defined(IPV6_MTU)
999 	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
1000 	 * to the MIN MTU to get the same.
1001 	 */
1002 	int opt = IPV6_MTU;
1003 	int optval = IPV6_MIN_MTU;
1004 	static const char optname[] = "IPV6_MTU";
1005 #endif
1006 	if(0 == setsockopt(
1007 		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
1008 	{
1009 		return 1;
1010 	}
1011 
1012 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
1013 		optname, strerror(errno));
1014 	return -1;
1015 #else
1016 	(void)sock;
1017 #endif /* IPV6_USE_MIN_MTU || IPV6_MTU */
1018 
1019 	return 0;
1020 }
1021 #endif /* INET6 */
1022 
1023 static int
1024 set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
1025 {
1026 	int ret = 0;
1027 
1028 #if defined(IP_MTU_DISCOVER)
1029 	int opt = IP_MTU_DISCOVER;
1030 	int optval;
1031 # if defined(IP_PMTUDISC_OMIT)
1032 	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
1033 	 * information and send packets with DF=0. Fragmentation is allowed if
1034 	 * and only if the packet size exceeds the outgoing interface MTU or
1035 	 * the packet encounters smaller MTU link in network. This mitigates
1036 	 * DNS fragmentation attacks by preventing forged PMTU information.
1037 	 * FreeBSD already has the same semantics without setting the option.
1038 	 */
1039 	optval = IP_PMTUDISC_OMIT;
1040 	if(0 == setsockopt(
1041 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
1042 	{
1043 		return 1;
1044 	}
1045 
1046 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1047 		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
1048 # endif /* IP_PMTUDISC_OMIT */
1049 # if defined(IP_PMTUDISC_DONT)
1050 	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
1051 	optval = IP_PMTUDISC_DONT;
1052 	if(0 == setsockopt(
1053 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
1054 	{
1055 		return 1;
1056 	}
1057 
1058 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1059 		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
1060 # endif
1061 	ret = -1;
1062 #elif defined(IP_DONTFRAG)
1063 	int off = 0;
1064 	if (0 == setsockopt(
1065 		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
1066 	{
1067 		return 1;
1068 	}
1069 
1070 	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
1071 		strerror(errno));
1072 	ret = -1;
1073 #else
1074 	(void)sock;
1075 #endif
1076 
1077 	return ret;
1078 }
1079 
1080 static int
1081 set_ip_freebind(struct nsd_socket *sock)
1082 {
1083 #ifdef IP_FREEBIND
1084 	int on = 1;
1085 	const char *socktype =
1086 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1087 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
1088 	{
1089 		return 1;
1090 	}
1091 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
1092 		socktype, strerror(errno));
1093 	return -1;
1094 #else
1095 	(void)sock;
1096 #endif /* IP_FREEBIND */
1097 
1098 	return 0;
1099 }
1100 
1101 static int
1102 set_ip_transparent(struct nsd_socket *sock)
1103 {
1104 	/*
1105 	The scandalous preprocessor blob here calls for some explanation :)
1106 	POSIX does not specify an option to bind non-local IPs, so
1107 	platforms developed several implementation-specific options,
1108 	all set in the same way, but with different names.
1109 	For additional complexity, some platforms manage this setting
1110 	differently for different address families (IPv4 vs IPv6).
1111 	This scandalous preprocessor blob below abstracts such variability
1112 	in a way that leaves the C code as lean and clear as possible.
1113 	*/
1114 
1115 #if defined(IP_TRANSPARENT)
1116 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
1117 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1118 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
1119 // as of 2020-01, Linux does not support this on IPv6 programmatically
1120 #elif defined(SO_BINDANY)
1121 #	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
1122 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
1123 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
1124 #elif defined(IP_BINDANY)
1125 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
1126 #	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
1127 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1128 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
1129 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
1130 #endif
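	/* For reference: IP_TRANSPARENT is the Linux spelling, SO_BINDANY
	 * the OpenBSD one, and IP_BINDANY/IPV6_BINDANY the FreeBSD pair
	 * with per-family option levels. */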
1131 
1132 #ifndef NSD_SOCKET_OPTION_TRANSPARENT
1133 	(void)sock;
1134 #else
1135 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
1136 #		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
1137 #	endif
1138 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
1139 #		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
1140 #	endif
1141 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
1142 #		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
1143 #	endif
1144 
1145 	int on = 1;
1146 	const char *socktype =
1147 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1148 	const int is_ip6 = (sock->addr.ai_family == AF_INET6);
1149 
1150 	if(0 == setsockopt(
1151 		sock->s,
1152 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
1153 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
1154 		&on, sizeof(on)))
1155 	{
1156 		return 1;
1157 	}
1158 
1159 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
1160 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
1161 	return -1;
1162 #endif
1163 
1164 	return 0;
1165 }
1166 
1167 static int
1168 set_tcp_maxseg(struct nsd_socket *sock, int mss)
1169 {
1170 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
1171 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
1172 		return 1;
1173 	}
1174 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
1175 		strerror(errno));
1176 	return -1;
1177 #else
1178 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
1179 #endif
1180 	return 0;
1181 }
1182 
1183 #ifdef USE_TCP_FASTOPEN
1184 static int
1185 set_tcp_fastopen(struct nsd_socket *sock)
1186 {
1187 	/* qlen specifies how many outstanding TFO requests to allow. Limit is
1188 	 * a defense against IP spoofing attacks as suggested in RFC7413.
1189 	 */
1190 	int qlen;
1191 
1192 #ifdef __APPLE__
1193 	/* The macOS implementation only supports a qlen of 1 via this call. The
1194 	 * actual value is configured by the net.inet.tcp.fastopen_backlog
1195 	 * kernel parameter.
1196 	 */
1197 	qlen = 1;
1198 #else
1199 	/* 5 is recommended on Linux. */
1200 	qlen = 5;
1201 #endif
1202 	if (0 == setsockopt(
1203 		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
1204 	{
1205 		return 1;
1206 	}
1207 
1208 	if (errno == EPERM) {
1209 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
1210 				 "; this could likely be because sysctl "
1211 				 "net.inet.tcp.fastopen.enabled, "
1212 				 "net.inet.tcp.fastopen.server_enable, or "
1213 				 "net.ipv4.tcp_fastopen is disabled",
1214 			strerror(errno));
1215 	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
1216 	 * disabled, except when verbosity enabled for debugging
1217 	 */
1218 	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
1219 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
1220 			strerror(errno));
1221 	}
1222 
1223 	return (errno == ENOPROTOOPT ? 0 : -1);
1224 }
1225 #endif /* USE_TCP_FASTOPEN */
1226 
1227 static int
1228 set_bindtodevice(struct nsd_socket *sock)
1229 {
1230 #if defined(SO_BINDTODEVICE)
1231 	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
1232 		sock->device, strlen(sock->device)) == -1)
1233 	{
1234 		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1235 		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
1236 		return -1;
1237 	}
1238 
1239 	return 1;
1240 #else
1241 	(void)sock;
1242 	return 0;
1243 #endif
1244 }
1245 
1246 static int
1247 set_setfib(struct nsd_socket *sock)
1248 {
1249 #if defined(SO_SETFIB)
1250 	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
1251 	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
1252 	{
1253 		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
1254 		                 "SO_SETFIB", sock->fib, strerror(errno));
1255 		return -1;
1256 	}
1257 
1258 	return 1;
1259 #else
1260 	(void)sock;
1261 	return 0;
1262 #endif
1263 }
1264 
1265 static int
1266 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1267 {
1268 	int rcv = 1*1024*1024, snd = 1*1024*1024;
1269 
1270 	if(-1 == (sock->s = socket(
1271 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1272 	{
1273 #ifdef INET6
1274 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1275 		   (sock->addr.ai_family == AF_INET6) &&
1276 		   (errno == EAFNOSUPPORT))
1277 		{
1278 			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1279 				"not supported");
1280 			return 0;
1281 		}
1282 #endif
1283 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1284 		return -1;
1285 	}
1286 
1287 	set_cloexec(sock);
1288 
1289 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1290 		*reuseport_works = (set_reuseport(sock) == 1);
1291 
1292 	if(nsd->options->receive_buffer_size > 0)
1293 		rcv = nsd->options->receive_buffer_size;
1294 	if(set_rcvbuf(sock, rcv) == -1)
1295 		return -1;
1296 
1297 	if(nsd->options->send_buffer_size > 0)
1298 		snd = nsd->options->send_buffer_size;
1299 	if(set_sndbuf(sock, snd) == -1)
1300 		return -1;
1301 #ifdef INET6
1302 	if(sock->addr.ai_family == AF_INET6) {
1303 		if(set_ipv6_v6only(sock) == -1 ||
1304 		   set_ipv6_use_min_mtu(sock) == -1)
1305 			return -1;
1306 	} else
1307 #endif /* INET6 */
1308 	if(sock->addr.ai_family == AF_INET) {
1309 		if(set_ipv4_no_pmtu_disc(sock) == -1)
1310 			return -1;
1311 	}
1312 
1313 	/* Set socket to non-blocking. Otherwise, on operating systems
1314 	 * with thundering herd problems, the UDP recv could block
1315 	 * after select returns readable.
1316 	 */
1317 	set_nonblock(sock);
1318 
1319 	if(nsd->options->ip_freebind)
1320 		(void)set_ip_freebind(sock);
1321 	if(nsd->options->ip_transparent)
1322 		(void)set_ip_transparent(sock);
1323 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1324 		return -1;
1325 	if(sock->fib != -1 && set_setfib(sock) == -1)
1326 		return -1;
1327 
1328 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1329 		char buf[256];
1330 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1331 		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1332 			buf, strerror(errno));
1333 		return -1;
1334 	}
1335 
1336 	return 1;
1337 }
1338 
1339 static int
1340 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1341 {
1342 #ifdef USE_TCP_FASTOPEN
1343 	report_tcp_fastopen_config();
1344 #endif
1345 
1346 	(void)reuseport_works;
1347 
1348 	if(-1 == (sock->s = socket(
1349 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1350 	{
1351 #ifdef INET6
1352 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1353 		   (sock->addr.ai_family == AF_INET6) &&
1354 		   (errno == EAFNOSUPPORT))
1355 		{
1356 			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1357 			                     "not supported");
1358 			return 0;
1359 		}
1360 #endif /* INET6 */
1361 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1362 		return -1;
1363 	}
1364 
1365 	set_cloexec(sock);
1366 
1367 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1368 		*reuseport_works = (set_reuseport(sock) == 1);
1369 
1370 	(void)set_reuseaddr(sock);
1371 
1372 #ifdef INET6
1373 	if(sock->addr.ai_family == AF_INET6) {
1374 		if (set_ipv6_v6only(sock) == -1 ||
1375 		    set_ipv6_use_min_mtu(sock) == -1)
1376 			return -1;
1377 	}
1378 #endif
1379 
1380 	if(nsd->tcp_mss > 0)
1381 		set_tcp_maxseg(sock, nsd->tcp_mss);
1382 	/* (Stevens UNP p463): if the TCP listening socket is blocking, it
1383 	   may block in accept(), even if select() says readable. */
1384 	(void)set_nonblock(sock);
1385 	if(nsd->options->ip_freebind)
1386 		(void)set_ip_freebind(sock);
1387 	if(nsd->options->ip_transparent)
1388 		(void)set_ip_transparent(sock);
1389 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1390 		return -1;
1391 	if(sock->fib != -1 && set_setfib(sock) == -1)
1392 		return -1;
1393 
1394 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1395 		char buf[256];
1396 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1397 		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1398 			buf, strerror(errno));
1399 		return -1;
1400 	}
1401 
1402 #ifdef USE_TCP_FASTOPEN
1403 	(void)set_tcp_fastopen(sock);
1404 #endif
1405 
1406 	if(listen(sock->s, TCP_BACKLOG) == -1) {
1407 		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1408 		return -1;
1409 	}
1410 
1411 	return 1;
1412 }
1413 
1414 /*
1415  * Initialize the server, reuseport, create and bind the sockets.
1416  */
1417 int
1418 server_init(struct nsd *nsd)
1419 {
1420 	size_t i;
1421 	int reuseport = 1; /* Determine if REUSEPORT works. */
1422 
1423 	/* open server interface ports */
1424 	for(i = 0; i < nsd->ifs; i++) {
1425 		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1426 		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1427 		{
1428 			return -1;
1429 		}
1430 	}
1431 
1432 	if(nsd->reuseport && reuseport) {
1433 		size_t ifs = nsd->ifs * nsd->reuseport;
1434 
1435 		/* increase the size of the interface arrays, there are going
1436 		 * to be separate interface file descriptors for every server
1437 		 * instance */
1438 		region_remove_cleanup(nsd->region, free, nsd->udp);
1439 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1440 
1441 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1442 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1443 		region_add_cleanup(nsd->region, free, nsd->udp);
1444 		region_add_cleanup(nsd->region, free, nsd->tcp);
1445 		if(ifs > nsd->ifs) {
1446 			memset(&nsd->udp[nsd->ifs], 0,
1447 				(ifs-nsd->ifs)*sizeof(*nsd->udp));
1448 			memset(&nsd->tcp[nsd->ifs], 0,
1449 				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
1450 		}
1451 
1452 		for(i = nsd->ifs; i < ifs; i++) {
1453 			nsd->udp[i] = nsd->udp[i%nsd->ifs];
1454 			nsd->udp[i].s = -1;
1455 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1456 				return -1;
1457 			}
1458 			/* Turn off REUSEPORT for TCP by copying the socket
1459 			 * file descriptor.
1460 			 * This means we should not close TCP used by
1461 			 * other servers in reuseport enabled mode, in
1462 			 * server_child().
1463 			 */
1464 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1465 		}
1466 
1467 		nsd->ifs = ifs;
1468 	} else {
1469 		nsd->reuseport = 0;
1470 	}
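	/* For example, with two listening addresses and reuseport: 4, the
	 * arrays grow to eight entries; entries 2..7 each get a freshly
	 * opened UDP socket with SO_REUSEPORT, while their TCP entries
	 * alias the already listening descriptor of entry i % 2. */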
1471 
1472 	/* open server interface ports for verifiers */
1473 	for(i = 0; i < nsd->verify_ifs; i++) {
1474 		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
1475 		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
1476 		{
1477 			return -1;
1478 		}
1479 	}
1480 
1481 	return 0;
1482 }
1483 
1484 /*
1485  * Prepare the server for takeoff.
1486  *
1487  */
1488 int
1489 server_prepare(struct nsd *nsd)
1490 {
1491 #ifdef RATELIMIT
1492 	/* set secret modifier for hashing (rate limits) */
1493 #ifdef HAVE_GETRANDOM
1494 	uint32_t v;
1495 	if(getrandom(&v, sizeof(v), 0) == -1) {
1496 		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
1497 		exit(1);
1498 	}
1499 	hash_set_raninit(v);
1500 #elif defined(HAVE_ARC4RANDOM)
1501 	hash_set_raninit(arc4random());
1502 #else
1503 	uint32_t v = getpid() ^ time(NULL);
1504 	srandom((unsigned long)v);
1505 #  ifdef HAVE_SSL
1506 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1507 		hash_set_raninit(v);
1508 	else
1509 #  endif
1510 		hash_set_raninit(random());
1511 #endif
1512 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1513 		nsd->options->rrl_ratelimit,
1514 		nsd->options->rrl_whitelist_ratelimit,
1515 		nsd->options->rrl_slip,
1516 		nsd->options->rrl_ipv4_prefix_length,
1517 		nsd->options->rrl_ipv6_prefix_length);
1518 #endif /* RATELIMIT */
1519 
1520 	/* Open the database... */
1521 	if ((nsd->db = namedb_open(nsd->options)) == NULL) {
1522 		log_msg(LOG_ERR, "unable to open the database: %s", strerror(errno));
1523 		unlink(nsd->task[0]->fname);
1524 		unlink(nsd->task[1]->fname);
1525 #ifdef USE_ZONE_STATS
1526 		unlink(nsd->zonestatfname[0]);
1527 		unlink(nsd->zonestatfname[1]);
1528 #endif
1529 #ifdef BIND8_STATS
1530 		server_stat_free(nsd);
1531 #endif
1532 		xfrd_del_tempdir(nsd);
1533 		return -1;
1534 	}
1535 	/* check if zone files can be read */
1536 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1537 	 * for all zones */
1538 	namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1539 	zonestatid_tree_set(nsd);
1540 
1541 	compression_table_capacity = 0;
1542 	initialize_dname_compression_tables(nsd);
1543 
1544 #ifdef	BIND8_STATS
1545 	/* Initialize times... */
1546 	time(&nsd->st->boot);
1547 	set_bind8_alarm(nsd);
1548 #endif /* BIND8_STATS */
1549 
1550 	return 0;
1551 }
1552 
1553 /*
1554  * Fork the required number of servers.
1555  */
1556 static int
1557 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1558 	int* xfrd_sock_p)
1559 {
1560 	size_t i;
1561 
1562 	/* Start all child servers initially.  */
1563 	for (i = 0; i < nsd->child_count; ++i) {
1564 		nsd->children[i].pid = 0;
1565 	}
1566 
1567 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1568 }
1569 
1570 static void
1571 server_close_socket(struct nsd_socket *sock)
1572 {
1573 	if(sock->s != -1) {
1574 		close(sock->s);
1575 		sock->s = -1;
1576 	}
1577 }
1578 
1579 void
1580 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1581 {
1582 	size_t i;
1583 
1584 	/* Close all the sockets... */
1585 	for (i = 0; i < n; ++i) {
1586 		server_close_socket(&sockets[i]);
1587 	}
1588 }
1589 
1590 /*
1591  * Close the sockets, shutdown the server and exit.
1592  * Does not return.
1593  */
1594 void
1595 server_shutdown(struct nsd *nsd)
1596 {
1597 	size_t i;
1598 
1599 	server_close_all_sockets(nsd->udp, nsd->ifs);
1600 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1601 	/* CHILD: close command channel to parent */
1602 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1603 	{
1604 		close(nsd->this_child->parent_fd);
1605 		nsd->this_child->parent_fd = -1;
1606 	}
1607 	/* SERVER: close command channels to children */
1608 	if(!nsd->this_child)
1609 	{
1610 		for(i=0; i < nsd->child_count; ++i)
1611 			if(nsd->children[i].child_fd != -1)
1612 			{
1613 				close(nsd->children[i].child_fd);
1614 				nsd->children[i].child_fd = -1;
1615 			}
1616 	}
1617 
1618 	tsig_finalize();
1619 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1620 #ifdef HAVE_SSL
1621 	if (nsd->tls_ctx)
1622 		SSL_CTX_free(nsd->tls_ctx);
1623 #endif
1624 
1625 #ifdef MEMCLEAN /* OS collects memory pages */
1626 #ifdef RATELIMIT
1627 	rrl_mmap_deinit_keep_mmap();
1628 #endif
1629 #ifdef USE_DNSTAP
1630 	dt_collector_destroy(nsd->dt_collector, nsd);
1631 #endif
1632 	udb_base_free_keep_mmap(nsd->task[0]);
1633 	udb_base_free_keep_mmap(nsd->task[1]);
1634 	namedb_free_ixfr(nsd->db);
1635 	namedb_close(nsd->db);
1636 	nsd_options_destroy(nsd->options);
1637 	region_destroy(nsd->region);
1638 #endif
1639 	log_finalize();
1640 	exit(0);
1641 }
1642 
1643 void
1644 server_prepare_xfrd(struct nsd* nsd)
1645 {
1646 	char tmpfile[256];
1647 	/* create task mmaps */
1648 	nsd->mytask = 0;
1649 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1650 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1651 	nsd->task[0] = task_file_create(tmpfile);
1652 	if(!nsd->task[0]) {
1653 #ifdef USE_ZONE_STATS
1654 		unlink(nsd->zonestatfname[0]);
1655 		unlink(nsd->zonestatfname[1]);
1656 #endif
1657 #ifdef BIND8_STATS
1658 		server_stat_free(nsd);
1659 #endif
1660 		xfrd_del_tempdir(nsd);
1661 		exit(1);
1662 	}
1663 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1664 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1665 	nsd->task[1] = task_file_create(tmpfile);
1666 	if(!nsd->task[1]) {
1667 		unlink(nsd->task[0]->fname);
1668 #ifdef USE_ZONE_STATS
1669 		unlink(nsd->zonestatfname[0]);
1670 		unlink(nsd->zonestatfname[1]);
1671 #endif
1672 #ifdef BIND8_STATS
1673 		server_stat_free(nsd);
1674 #endif
1675 		xfrd_del_tempdir(nsd);
1676 		exit(1);
1677 	}
1678 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1679 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1680 	/* create xfrd listener structure */
1681 	nsd->xfrd_listener = region_alloc(nsd->region,
1682 		sizeof(netio_handler_type));
1683 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1684 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1685 	nsd->xfrd_listener->fd = -1;
1686 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1687 		nsd;
1688 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1689 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1690 }
1691 
1692 
1693 void
1694 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1695 {
1696 	pid_t pid;
1697 	int sockets[2] = {0,0};
1698 	struct ipc_handler_conn_data *data;
1699 
1700 	if(nsd->xfrd_listener->fd != -1)
1701 		close(nsd->xfrd_listener->fd);
1702 	if(del_db) {
1703 		/* recreate taskdb that xfrd was using, it may be corrupt */
1704 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1705 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1706 		nsd->task[1-nsd->mytask]->fname = NULL;
1707 		/* free alloc already, so udb does not shrink itself */
1708 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1709 		nsd->task[1-nsd->mytask]->alloc = NULL;
1710 		udb_base_free(nsd->task[1-nsd->mytask]);
1711 		/* create new file, overwrite the old one */
1712 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1713 		free(tmpfile);
1714 	}
1715 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1716 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1717 		return;
1718 	}
1719 	pid = fork();
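	/* Note the role swap below: the original process (the default
	 * branch) becomes xfrd via xfrd_init(), which does not return,
	 * while the forked child (case 0) continues as the main server
	 * process. */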
1720 	switch (pid) {
1721 	case -1:
1722 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1723 		break;
1724 	default:
1725 		/* PARENT: close first socket, use second one */
1726 		close(sockets[0]);
1727 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1728 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1729 		}
1730 		if(del_db) xfrd_free_namedb(nsd);
1731 		/* use the other task than the one I am using; if xfrd died and is
1732 		 * restarted, the reload is using nsd->mytask */
1733 		nsd->mytask = 1 - nsd->mytask;
1734 
1735 #ifdef HAVE_SETPROCTITLE
1736 		setproctitle("xfrd");
1737 #endif
1738 #ifdef USE_LOG_PROCESS_ROLE
1739 		log_set_process_role("xfrd");
1740 #endif
1741 #ifdef HAVE_CPUSET_T
1742 		if(nsd->use_cpu_affinity) {
1743 			set_cpu_affinity(nsd->xfrd_cpuset);
1744 		}
1745 #endif
1746 
1747 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1748 		/* NOTREACHED */
1749 		break;
1750 	case 0:
1751 		/* CHILD: close second socket, use first one */
1752 		close(sockets[1]);
1753 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1754 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1755 		}
1756 		nsd->xfrd_listener->fd = sockets[0];
1757 #ifdef HAVE_SETPROCTITLE
1758 		setproctitle("main");
1759 #endif
1760 #ifdef USE_LOG_PROCESS_ROLE
1761 		log_set_process_role("main");
1762 #endif
1763 		break;
1764 	}
1765 	/* server-parent only */
1766 	nsd->xfrd_listener->timeout = NULL;
1767 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1768 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1769 	/* clear ongoing ipc reads */
1770 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1771 	data->conn->is_reading = 0;
1772 }
1773 
1774 /** add all soainfo to taskdb */
1775 static void
1776 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1777 {
1778 	struct radnode* n;
1779 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1780 	/* add all SOA INFO to mytask */
1781 	udb_ptr_init(&task_last, taskudb);
1782 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1783 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1784 	}
1785 	udb_ptr_unlink(&task_last, taskudb);
1786 }
1787 
1788 void
1789 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1790 {
1791 	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1792 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1793 	 *   then they exchange and process.
1794 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1795 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1796 	 *   expire notifications can be sent back via a normal reload later
1797 	 *   (xfrd will wait for current running reload to finish if any).
1798 	 */
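	/* Illustrative message flow for the normal (!shortsoa) case, a
	 * sketch derived from the code below, for orientation only:
	 *   this process                          xfrd
	 *   fill taskdb[mytask] with SOAs
	 *   wait for NSD_RELOAD        <--------  NSD_RELOAD
	 *   NSD_RELOAD_DONE + pid      -------->  process SOAs
	 *   swap mytask, process expires <------  taskdb[1-mytask] expires
	 *   NSD_RELOAD_DONE            -------->
	 */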
1799 	sig_atomic_t cmd = 0;
1800 	pid_t mypid;
1801 	int xfrd_sock = nsd->xfrd_listener->fd;
1802 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1803 	udb_ptr t;
1804 	if(!shortsoa) {
1805 		if(nsd->signal_hint_shutdown) {
1806 		shutdown:
1807 			log_msg(LOG_WARNING, "signal received, shutting down...");
1808 			server_close_all_sockets(nsd->udp, nsd->ifs);
1809 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1810 			daemon_remote_close(nsd->rc);
1811 			/* Unlink it if possible... */
1812 			unlinkpid(nsd->pidfile);
1813 			unlink(nsd->task[0]->fname);
1814 			unlink(nsd->task[1]->fname);
1815 #ifdef USE_ZONE_STATS
1816 			unlink(nsd->zonestatfname[0]);
1817 			unlink(nsd->zonestatfname[1]);
1818 #endif
1819 #ifdef BIND8_STATS
1820 			server_stat_free(nsd);
1821 #endif
1822 			server_shutdown(nsd);
1823 			/* ENOTREACH */
1824 			exit(0);
1825 		}
1826 	}
1827 	if(shortsoa) {
1828 		/* put SOA in xfrd task because mytask may be in use */
1829 		taskudb = nsd->task[1-nsd->mytask];
1830 	}
1831 
1832 	add_all_soa_to_task(nsd, taskudb);
1833 	if(!shortsoa) {
1834 		/* wait for xfrd to signal task is ready, RELOAD signal */
1835 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1836 			cmd != NSD_RELOAD) {
1837 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1838 			exit(1);
1839 		}
1840 		if(nsd->signal_hint_shutdown) {
1841 			goto shutdown;
1842 		}
1843 	}
1844 	/* give xfrd our task, signal it with RELOAD_DONE */
1845 	task_process_sync(taskudb);
1846 	cmd = NSD_RELOAD_DONE;
1847 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1848 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1849 			(int)nsd->pid, strerror(errno));
1850 	}
1851 	mypid = getpid();
1852 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1853 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1854 			strerror(errno));
1855 	}
1856 
1857 	if(!shortsoa) {
1858 		/* process the work in the xfrd task (expiry data) */
1859 		nsd->mytask = 1 - nsd->mytask;
1860 		taskudb = nsd->task[nsd->mytask];
1861 		task_remap(taskudb);
1862 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1863 		while(!udb_ptr_is_null(&t)) {
1864 			task_process_expire(nsd->db, TASKLIST(&t));
1865 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1866 		}
1867 		udb_ptr_unlink(&t, taskudb);
1868 		task_clear(taskudb);
1869 
1870 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1871 		cmd = NSD_RELOAD_DONE;
1872 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1873 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1874 				(int)nsd->pid, strerror(errno));
1875 		}
1876 	}
1877 }
1878 
1879 #ifdef HAVE_SSL
1880 static void
1881 log_crypto_from_err(const char* str, unsigned long err)
1882 {
1883 	/* error:[error code]:[library name]:[function name]:[reason string] */
1884 	char buf[128];
1885 	unsigned long e;
1886 	ERR_error_string_n(err, buf, sizeof(buf));
1887 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1888 	while( (e=ERR_get_error()) ) {
1889 		ERR_error_string_n(e, buf, sizeof(buf));
1890 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1891 	}
1892 }
1893 
1894 void
1895 log_crypto_err(const char* str)
1896 {
1897 	log_crypto_from_err(str, ERR_get_error());
1898 }
1899 
1900 /** true if the ssl handshake error has to be squelched from the logs */
1901 static int
1902 squelch_err_ssl_handshake(unsigned long err)
1903 {
1904 	if(verbosity >= 3)
1905 		return 0; /* only squelch on low verbosity */
1906 	/* this is very specific, we could filter on ERR_GET_REASON()
1907 	 * (the third element in ERR_PACK) */
1908 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1909 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1910 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1911 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1912 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1913 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1914 #endif
1915 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1916 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1917 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1918 #  ifdef SSL_R_VERSION_TOO_LOW
1919 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1920 #  endif
1921 #endif
1922 		)
1923 		return 1;
1924 	return 0;
1925 }
1926 
1927 void
1928 perform_openssl_init(void)
1929 {
1930 	/* init SSL library */
1931 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1932 	ERR_load_crypto_strings();
1933 #endif
1934 #if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
1935 	ERR_load_SSL_strings();
1936 #endif
1937 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1938 	OpenSSL_add_all_algorithms();
1939 #else
1940 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1941 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1942 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1943 #endif
1944 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1945 	(void)SSL_library_init();
1946 #else
1947 	OPENSSL_init_ssl(0, NULL);
1948 #endif
1949 
1950 	if(!RAND_status()) {
1951 		/* try to seed it */
1952 		unsigned char buf[256];
1953 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1954 		size_t i;
1955 		v = seed;
1956 		for(i=0; i<256/sizeof(v); i++) {
1957 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1958 			v = v*seed + (unsigned int)i;
1959 		}
1960 		RAND_seed(buf, 256);
1961 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1962 	}
1963 }
1964 
1965 static int
1966 get_ocsp(char *filename, unsigned char **ocsp)
1967 {
1968 	BIO *bio;
1969 	OCSP_RESPONSE *response;
1970 	int len = -1;
1971 	unsigned char *p, *buf;
1972 	assert(filename);
1973 
1974 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1975 		log_crypto_err("get_ocsp: BIO_new_file failed");
1976 		return -1;
1977 	}
1978 
1979 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1980 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1981 		BIO_free(bio);
1982 		return -1;
1983 	}
1984 
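	/* OpenSSL i2d two-pass convention: calling i2d_OCSP_RESPONSE with a
	 * NULL output pointer only computes the DER length; the second call
	 * further below writes the bytes into buf and advances p. */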
1985 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1986 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1987 		OCSP_RESPONSE_free(response);
1988 		BIO_free(bio);
1989 		return -1;
1990 	}
1991 
1992 	if ((buf = malloc((size_t) len)) == NULL) {
1993 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1994 		OCSP_RESPONSE_free(response);
1995 		BIO_free(bio);
1996 		return -1;
1997 	}
1998 
1999 	p = buf;
2000 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
2001 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
2002 		free(buf);
2003 		OCSP_RESPONSE_free(response);
2004 		BIO_free(bio);
2005 		return -1;
2006 	}
2007 
2008 	OCSP_RESPONSE_free(response);
2009 	BIO_free(bio);
2010 
2011 	*ocsp = buf;
2012 	return len;
2013 }
2014 
2015 /* further setup ssl ctx after the keys are loaded */
2016 static void
2017 listen_sslctx_setup_2(void* ctxt)
2018 {
2019 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
2020 	(void)ctx;
2021 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
2022 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
2023 		/* ENOTREACH */
2024 		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
2025 	}
2026 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
2027 	if(1) {
2028 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
2029 		if (!ecdh) {
2030 			log_crypto_err("could not find p256, not enabling ECDHE");
2031 		} else {
2032 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
2033 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
2034 			}
2035 			EC_KEY_free (ecdh);
2036 		}
2037 	}
2038 #endif
2039 }
2040 
2041 static int
2042 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
2043 {
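	/* A fresh copy of the OCSP response is handed out per connection:
	 * SSL_set_tlsext_status_ocsp_resp takes ownership of the buffer, so
	 * passing the shared ocspdata directly would risk a double free. */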
2044 	if(ocspdata) {
2045 		unsigned char *p;
2046 		if ((p=malloc(ocspdata_len)) == NULL) {
2047 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
2048 			return SSL_TLSEXT_ERR_NOACK;
2049 		}
2050 		memcpy(p, ocspdata, ocspdata_len);
2051 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
2052 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
2053 			free(p);
2054 			return SSL_TLSEXT_ERR_NOACK;
2055 		}
2056 		return SSL_TLSEXT_ERR_OK;
2057 	} else {
2058 		return SSL_TLSEXT_ERR_NOACK;
2059 	}
2060 }
2061 
2062 SSL_CTX*
2063 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
2064 {
2065 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
2066 	if(!ctx) {
2067 		log_crypto_err("could not SSL_CTX_new");
2068 		return NULL;
2069 	}
2070 	/* no SSLv2, SSLv3 because has defects */
2071 #if SSL_OP_NO_SSLv2 != 0
2072 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
2073 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
2074 		SSL_CTX_free(ctx);
2075 		return NULL;
2076 	}
2077 #endif
2078 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
2079 		!= SSL_OP_NO_SSLv3){
2080 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
2081 		SSL_CTX_free(ctx);
2082 		return 0;
2083 	}
2084 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
2085 	/* if we have tls 1.1 disable 1.0 */
2086 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
2087 		!= SSL_OP_NO_TLSv1){
2088 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
2089 		SSL_CTX_free(ctx);
2090 		return 0;
2091 	}
2092 #endif
2093 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
2094 	/* if we have tls 1.2 disable 1.1 */
2095 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
2096 		!= SSL_OP_NO_TLSv1_1){
2097 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
2098 		SSL_CTX_free(ctx);
2099 		return 0;
2100 	}
2101 #endif
2102 #if defined(SSL_OP_NO_RENEGOTIATION)
2103 	/* disable client renegotiation */
2104 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
2105 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
2106 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
2107 		SSL_CTX_free(ctx);
2108 		return 0;
2109 	}
2110 #endif
2111 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2112 	/* if we detect system-wide crypto policies, use those */
2113 	if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) {
2114 		/* if we have sha256, set the cipher list to have no known vulns */
2115 		if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2116 			log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2117 	}
2118 #endif
2119 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2120 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2121 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
2122 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2123 		SSL_CTX_free(ctx);
2124 		return 0;
2125 	}
2126 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2127 	SSL_CTX_set_security_level(ctx, 0);
2128 #endif
2129 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2130 		log_msg(LOG_ERR, "error for cert file: %s", pem);
2131 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2132 		SSL_CTX_free(ctx);
2133 		return NULL;
2134 	}
2135 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2136 		log_msg(LOG_ERR, "error for private key file: %s", key);
2137 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2138 		SSL_CTX_free(ctx);
2139 		return NULL;
2140 	}
2141 	if(!SSL_CTX_check_private_key(ctx)) {
2142 		log_msg(LOG_ERR, "error for key file: %s", key);
2143 		log_crypto_err("Error in SSL_CTX check_private_key");
2144 		SSL_CTX_free(ctx);
2145 		return NULL;
2146 	}
2147 	listen_sslctx_setup_2(ctx);
2148 	if(verifypem && verifypem[0]) {
2149 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2150 			log_crypto_err("Error in SSL_CTX verify locations");
2151 			SSL_CTX_free(ctx);
2152 			return NULL;
2153 		}
2154 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2155 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2156 	}
2157 	return ctx;
2158 }
2159 
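/*
 * Example nsd.conf fragment (the file paths are illustrative) that makes
 * nsd call server_tls_ctx_create below to serve DNS-over-TLS:
 *
 *	server:
 *		tls-service-key: "/etc/nsd/server.key"
 *		tls-service-pem: "/etc/nsd/server.pem"
 *		tls-port: 853
 */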
2160 SSL_CTX*
2161 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2162 {
2163 	char *key, *pem;
2164 	SSL_CTX *ctx;
2165 
2166 	key = nsd->options->tls_service_key;
2167 	pem = nsd->options->tls_service_pem;
2168 	if(!key || key[0] == 0) {
2169 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2170 		return NULL;
2171 	}
2172 	if(!pem || pem[0] == 0) {
2173 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2174 		return NULL;
2175 	}
2176 
2177 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL,
2178 	 * but draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
2179 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2180 	if(!ctx) {
2181 		log_msg(LOG_ERR, "could not setup server TLS context");
2182 		return NULL;
2183 	}
2184 	if(ocspfile && ocspfile[0]) {
2185 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2186 			log_crypto_err("Error reading OCSPfile");
2187 			SSL_CTX_free(ctx);
2188 			return NULL;
2189 		} else {
2190 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2191 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2192 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2193 				SSL_CTX_free(ctx);
2194 				return NULL;
2195 			}
2196 		}
2197 	}
2198 	return ctx;
2199 }
2200 
2201 /* check if tcp_handler_accept_data created for TLS dedicated port */
2202 int
2203 using_tls_port(struct sockaddr* addr, const char* tls_port)
2204 {
2205 	in_port_t port = 0;
2206 
2207 	if (addr->sa_family == AF_INET)
2208 		port = ((struct sockaddr_in*)addr)->sin_port;
2209 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2210 	else
2211 		port = ((struct sockaddr_in6*)addr)->sin6_port;
2212 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2213 	if (atoi(tls_port) == ntohs(port))
2214 		return 1;
2215 
2216 	return 0;
2217 }
2218 #endif
2219 
2220 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
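/* Usage sketch (illustrative; the 5 second timeout is arbitrary):
 *	sig_atomic_t cmd;
 *	if(block_read(nsd, fd, &cmd, sizeof(cmd), 5) != sizeof(cmd))
 *		... handle close, error or timeout ...
 */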
2221 ssize_t
2222 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2223 {
2224 	uint8_t* buf = (uint8_t*) p;
2225 	ssize_t total = 0;
2226 	struct pollfd fd;
2227 	memset(&fd, 0, sizeof(fd));
2228 	fd.fd = s;
2229 	fd.events = POLLIN;
2230 
2231 	while( total < sz) {
2232 		ssize_t ret;
2233 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2234 		if(ret == -1) {
2235 			if(errno == EAGAIN)
2236 				/* blocking read */
2237 				continue;
2238 			if(errno == EINTR) {
2239 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2240 					return -1;
2241 				/* other signals can be handled later */
2242 				continue;
2243 			}
2244 			/* some error */
2245 			return -1;
2246 		}
2247 		if(ret == 0) {
2248 			/* operation timed out */
2249 			return -2;
2250 		}
2251 		ret = read(s, buf+total, sz-total);
2252 		if(ret == -1) {
2253 			if(errno == EAGAIN)
2254 				/* blocking read */
2255 				continue;
2256 			if(errno == EINTR) {
2257 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2258 					return -1;
2259 				/* other signals can be handled later */
2260 				continue;
2261 			}
2262 			/* some error */
2263 			return -1;
2264 		}
2265 		if(ret == 0) {
2266 			/* closed connection! */
2267 			return 0;
2268 		}
2269 		total += ret;
2270 	}
2271 	return total;
2272 }
2273 
2274 static void
2275 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2276 {
2277 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2278 	udb_ptr t, next;
2279 	udb_base* u = nsd->task[nsd->mytask];
2280 	udb_ptr_init(&next, u);
2281 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2282 	udb_base_set_userdata(u, 0);
2283 	while(!udb_ptr_is_null(&t)) {
2284 		/* store next in list so this one can be deleted or reused */
2285 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2286 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2287 
2288 		/* process task t */
2289 		/* append results for task t and update last_task */
2290 		task_process_in_reload(nsd, u, last_task, &t);
2291 
2292 		/* go to next */
2293 		udb_ptr_set_ptr(&t, u, &next);
2294 
2295 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2296 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2297 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2298 			if(cmd == NSD_QUIT) {
2299 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2300 				/* unlink files of remainder of tasks */
2301 				while(!udb_ptr_is_null(&t)) {
2302 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2303 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2304 					}
2305 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2306 				}
2307 				udb_ptr_unlink(&t, u);
2308 				udb_ptr_unlink(&next, u);
2309 				exit(0);
2310 			}
2311 		}
2312 
2313 	}
2314 	udb_ptr_unlink(&t, u);
2315 	udb_ptr_unlink(&next, u);
2316 }
2317 
2318 void server_verify(struct nsd *nsd, int cmdsocket);
2319 
2320 struct quit_sync_event_data {
2321 	struct event_base* base;
2322 	size_t read;
2323 	union {
2324 		uint8_t buf[sizeof(sig_atomic_t)];
2325 		sig_atomic_t cmd;
2326 	} to_read;
2327 };
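/* The buf/cmd union above lets the quit-sync handler assemble a
 * sig_atomic_t byte by byte: a read(2) on the cmdsocket may return fewer
 * than sizeof(sig_atomic_t) bytes, and cb_data->read counts what has
 * arrived so far. */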
2328 
2329 static void server_reload_handle_sigchld(int sig, short event,
2330 		void* ATTR_UNUSED(arg))
2331 {
2332 	assert(sig == SIGCHLD);
2333 	assert(event & EV_SIGNAL);
2334 
2335 	/* reap the exited old-serve child processes */
2336 	while(waitpid(-1, NULL, WNOHANG) > 0) {
2337 		/* pass */
2338 	}
2339 }
2340 
2341 static void server_reload_handle_quit_sync_ack(int cmdsocket, short event,
2342 		void* arg)
2343 {
2344 	struct quit_sync_event_data* cb_data =
2345 		(struct quit_sync_event_data*)arg;
2346 	ssize_t r;
2347 
2348 	if(event & EV_TIMEOUT) {
2349 		sig_atomic_t cmd = NSD_QUIT_SYNC;
2350 
2351 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2352 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) {
2353 			log_msg(LOG_ERR, "problems sending command from "
2354 				"reload to old-main: %s", strerror(errno));
2355 		}
2356 		/* Wait for cmdsocket to become readable or for the next timeout
2357 		 * (this works because the event was added with EV_TIMEOUT|EV_PERSIST).
2358 		 */
2359 		return;
2360 	}
2361 	assert(event & EV_READ);
2362 	assert(cb_data->read < sizeof(cb_data->to_read.cmd));
2363 
2364 	r = read(cmdsocket, cb_data->to_read.buf + cb_data->read,
2365 			sizeof(cb_data->to_read.cmd) - cb_data->read);
2366 	if(r == 0) {
2367 		log_msg(LOG_ERR, "reload: old-main quit during quit sync");
2368 		cb_data->to_read.cmd = NSD_RELOAD;
2369 
2370 	} else if(r == -1) {
2371 		if(errno == EAGAIN || errno == EINTR)
2372 			return;
2373 
2374 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: "
2375 			"%s", strerror(errno));
2376 		cb_data->to_read.cmd = NSD_RELOAD;
2377 
2378 	} else if (cb_data->read + r  < sizeof(cb_data->to_read.cmd)) {
2379 		/* More to read */
2380 		cb_data->read += r;
2381 		return;
2382 
2383 	} else {
2384 		assert(cb_data->read + r == sizeof(cb_data->to_read.cmd));
2385 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d",
2386 					(int)cb_data->to_read.cmd));
2387 	}
2388 	/* Done */
2389 	event_base_loopexit(cb_data->base, NULL);
2390 }
2391 
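/*
 * Sketch of the quit synchronisation performed below (derived from the
 * code, for orientation only):
 *   reload (this process)              old-main
 *   process tasks, start new children
 *   NSD_QUIT_SYNC          -------->   ask old-serve children to quit
 *   (resend on RELOAD_SYNC_TIMEOUT)
 *   wait for reply         <--------   NSD_RELOAD ack (or NSD_QUIT)
 *   then NSD_RELOAD_DONE + pid to xfrd, and continue as new server_main.
 */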
2392 /*
2393  * Reload the database, stop parent, re-fork children and continue.
2394  * as server_main.
2395  */
2396 static void
2397 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2398 	int cmdsocket)
2399 {
2400 	pid_t mypid;
2401 	sig_atomic_t cmd;
2402 	udb_ptr last_task;
2403 	struct sigaction old_sigchld, ign_sigchld;
2404 	struct radnode* node;
2405 	zone_type* zone;
2406 	enum soainfo_hint hint;
2407 	struct quit_sync_event_data cb_data;
2408 	struct event signal_event, cmd_event;
2409 	struct timeval reload_sync_timeout;
2410 
2411 	/* ignore SIGCHLD from the previous server_main that used this pid */
2412 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2413 	ign_sigchld.sa_handler = SIG_IGN;
2414 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2415 
2416 #ifdef HAVE_CPUSET_T
2417 	if(nsd->use_cpu_affinity) {
2418 		set_cpu_affinity(nsd->cpuset);
2419 	}
2420 #endif
2421 
2422 	/* see what tasks we got from xfrd */
2423 	task_remap(nsd->task[nsd->mytask]);
2424 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2425 	reload_process_tasks(nsd, &last_task, cmdsocket);
2426 
2427 #ifndef NDEBUG
2428 	if(nsd_debug_level >= 1)
2429 		region_log_stats(nsd->db->region);
2430 #endif /* NDEBUG */
2431 	initialize_dname_compression_tables(nsd);
2432 
2433 #ifdef BIND8_STATS
2434 	/* Restart dumping stats if required.  */
2435 	time(&nsd->st->boot);
2436 	set_bind8_alarm(nsd);
2437 	/* Switch to a different set of stat arrays for new server processes,
2438 	 * because they can briefly coexist with the old processes. They
2439 	 * have their own stat structure. */
2440 	nsd->stat_current = (nsd->stat_current==0?1:0);
2441 #endif
2442 #ifdef USE_ZONE_STATS
2443 	server_zonestat_realloc(nsd); /* realloc for new children */
2444 	server_zonestat_switch(nsd);
2445 #endif
2446 
2447 	if(nsd->options->verify_enable) {
2448 #ifdef RATELIMIT
2449 		/* allocate resources for rate limiting. use a slot that is guaranteed
2450 		   not mapped to a file so no persistent data is overwritten */
2451 		rrl_init(nsd->child_count + 1);
2452 #endif
2453 
2454 		/* spin-up server and execute verifiers for each zone */
2455 		server_verify(nsd, cmdsocket);
2456 #ifdef RATELIMIT
2457 		/* deallocate rate limiting resources */
2458 		rrl_deinit(nsd->child_count + 1);
2459 #endif
2460 	}
2461 
2462 	for(node = radix_first(nsd->db->zonetree);
2463 	    node != NULL;
2464 	    node = radix_next(node))
2465 	{
2466 		zone = (zone_type *)node->elem;
2467 		if(zone->is_updated) {
2468 			if(zone->is_bad) {
2469 				nsd->mode = NSD_RELOAD_FAILED;
2470 				hint = soainfo_bad;
2471 			} else {
2472 				hint = soainfo_ok;
2473 			}
2474 			/* update(s), verified or not, possibly with subsequent
2475 			   skipped update(s). skipped update(s) are picked up
2476 			   by failed update check in xfrd */
2477 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2478 			                 zone, hint);
2479 		} else if(zone->is_skipped) {
2480 			/* corrupt or inconsistent update without preceding
2481 			   update(s), communicate soainfo_gone */
2482 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2483 			                 zone, soainfo_gone);
2484 		}
2485 		zone->is_updated = 0;
2486 		zone->is_skipped = 0;
2487 	}
2488 
2489 	if(nsd->mode == NSD_RELOAD_FAILED) {
2490 		exit(NSD_RELOAD_FAILED);
2491 	}
2492 
2493 	/* listen for the signals of failed children again */
2494 	sigaction(SIGCHLD, &old_sigchld, NULL);
2495 #ifdef USE_DNSTAP
2496 	if (nsd->dt_collector) {
2497 		int *swap_fd_send;
2498 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2499 		/* Swap fd_send with fd_swap so the old serve children and the
2500 		 * new serve children will not write to the same pipe ends simultaneously */
2501 		swap_fd_send = nsd->dt_collector_fd_send;
2502 		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2503 		nsd->dt_collector_fd_swap = swap_fd_send;
2504 
2505 	}
2506 #endif
2507 	/* Start new child processes */
2508 	if (server_start_children(nsd, server_region, netio, &nsd->
2509 		xfrd_listener->fd) != 0) {
2510 		send_children_quit(nsd);
2511 		exit(1);
2512 	}
2513 
2514 	/* if the old-main has quit, we must quit too, poll the fd for cmds */
2515 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2516 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2517 		if(cmd == NSD_QUIT) {
2518 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2519 			send_children_quit(nsd);
2520 			exit(0);
2521 		}
2522 	}
2523 
2524 	/* Send quit command to old-main: blocking, wait for receipt.
2525 	 * The old-main process asks the old-serve processes to quit, however
2526 	 * if a reload succeeded before, this process is the parent of the
2527 	 * old-serve processes, so we need to reap the children for it.
2528 	 */
2529 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2530 	cmd = NSD_QUIT_SYNC;
2531 	if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2532 	{
2533 		log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2534 			strerror(errno));
2535 	}
2536 
2537 	reload_sync_timeout.tv_sec = RELOAD_SYNC_TIMEOUT;
2538 	reload_sync_timeout.tv_usec = 0;
2539 
2540 	cb_data.base = nsd_child_event_base();
2541 	cb_data.to_read.cmd = cmd;
2542 	cb_data.read = 0;
2543 
2544 	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST,
2545 	    server_reload_handle_sigchld, NULL);
2546 	if(event_base_set(cb_data.base, &signal_event) != 0
2547 	|| event_add(&signal_event, NULL) != 0) {
2548 		log_msg(LOG_ERR, "NSD quit sync: could not add signal event");
2549 	}
2550 
2551 	event_set(&cmd_event, cmdsocket, EV_READ|EV_TIMEOUT|EV_PERSIST,
2552 	    server_reload_handle_quit_sync_ack, &cb_data);
2553 	if(event_base_set(cb_data.base, &cmd_event) != 0
2554 	|| event_add(&cmd_event, &reload_sync_timeout) != 0) {
2555 		log_msg(LOG_ERR, "NSD quit sync: could not add command event");
2556 	}
2557 
2558 	/* short-lived main loop */
2559 	event_base_dispatch(cb_data.base);
2560 
2561 	/* remove command and signal event handlers */
2562 	event_del(&cmd_event);
2563 	event_del(&signal_event);
2564 	event_base_free(cb_data.base);
2565 	cmd = cb_data.to_read.cmd;
2566 
2567 	if(cmd == NSD_QUIT) {
2568 		/* small race condition possible here, parent got quit cmd. */
2569 		send_children_quit(nsd);
2570 		exit(1);
2571 	}
2572 	assert(cmd == NSD_RELOAD);
2573 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2574 	task_process_sync(nsd->task[nsd->mytask]);
2575 #ifdef USE_ZONE_STATS
2576 	server_zonestat_realloc(nsd); /* realloc for next children */
2577 #endif
2578 
2579 	/* send soainfo to the xfrd process, signal it that reload is done,
2580 	 * it picks up the taskudb */
2581 	cmd = NSD_RELOAD_DONE;
2582 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2583 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2584 			strerror(errno));
2585 	}
2586 	mypid = getpid();
2587 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2588 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2589 			strerror(errno));
2590 	}
2591 
2592 	/* try to reopen file */
2593 	if (nsd->file_rotation_ok)
2594 		log_reopen(nsd->log_filename, 1);
2595 	/* exit reload, continue as new server_main */
2596 }
2597 
2598 /*
2599  * Get the mode depending on the signal hints that have been received.
2600  * Multiple signal hints can be received and will be handled in turn.
2601  */
2602 static sig_atomic_t
2603 server_signal_mode(struct nsd *nsd)
2604 {
2605 	if(nsd->signal_hint_quit) {
2606 		nsd->signal_hint_quit = 0;
2607 		return NSD_QUIT;
2608 	}
2609 	else if(nsd->signal_hint_shutdown) {
2610 		nsd->signal_hint_shutdown = 0;
2611 		return NSD_SHUTDOWN;
2612 	}
2613 	else if(nsd->signal_hint_child) {
2614 		nsd->signal_hint_child = 0;
2615 		return NSD_REAP_CHILDREN;
2616 	}
2617 	else if(nsd->signal_hint_reload) {
2618 		nsd->signal_hint_reload = 0;
2619 		return NSD_RELOAD;
2620 	}
2621 	else if(nsd->signal_hint_reload_hup) {
2622 		nsd->signal_hint_reload_hup = 0;
2623 		return NSD_RELOAD_REQ;
2624 	}
2625 	else if(nsd->signal_hint_stats) {
2626 		nsd->signal_hint_stats = 0;
2627 #ifdef BIND8_STATS
2628 		set_bind8_alarm(nsd);
2629 #endif
2630 		return NSD_STATS;
2631 	}
2632 	else if(nsd->signal_hint_statsusr) {
2633 		nsd->signal_hint_statsusr = 0;
2634 		return NSD_STATS;
2635 	}
2636 	return NSD_RUN;
2637 }
2638 
2639 /*
2640  * The main server simply waits for signals and child processes to
2641  * terminate.  Child processes are restarted as necessary.
2642  */
2643 void
2644 server_main(struct nsd *nsd)
2645 {
2646 	region_type *server_region = region_create(xalloc, free);
2647 	netio_type *netio = netio_create(server_region);
2648 	netio_handler_type reload_listener;
2649 	int reload_sockets[2] = {-1, -1};
2650 	struct timespec timeout_spec;
2651 	int status;
2652 	pid_t child_pid;
2653 	pid_t reload_pid = -1;
2654 	sig_atomic_t mode;
2655 
2656 	/* Ensure we are the main process */
2657 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2658 
2659 	/* Add listener for the XFRD process */
2660 	netio_add_handler(netio, nsd->xfrd_listener);
2661 
2662 #ifdef BIND8_STATS
2663 	nsd->st = &nsd->stat_map[0];
2664 	nsd->st->db_disk = 0;
2665 	nsd->st->db_mem = region_get_mem(nsd->db->region);
2666 #endif
2667 
2668 	/* Start the child processes that handle incoming queries */
2669 	if (server_start_children(nsd, server_region, netio,
2670 		&nsd->xfrd_listener->fd) != 0) {
2671 		send_children_quit(nsd);
2672 		exit(1);
2673 	}
2674 	reload_listener.fd = -1;
2675 
2676 	/* This_child MUST be 0, because this is the parent process */
2677 	assert(nsd->this_child == 0);
2678 
2679 	/* Run the server until we get a shutdown signal */
2680 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2681 		/* Did we receive a signal that changes our mode? */
2682 		if(mode == NSD_RUN) {
2683 			nsd->mode = mode = server_signal_mode(nsd);
2684 		}
2685 
2686 		switch (mode) {
2687 		case NSD_RUN:
2688 			/* see if any child processes terminated */
2689 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2690 				int is_child = delete_child_pid(nsd, child_pid);
2691 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2692 					if(nsd->children[is_child].child_fd == -1)
2693 						nsd->children[is_child].has_exited = 1;
2694 					parent_check_all_children_exited(nsd);
2695 				} else if(is_child != -1) {
2696 					log_msg(LOG_WARNING,
2697 					       "server %d died unexpectedly with status %d, restarting",
2698 					       (int) child_pid, status);
2699 					restart_child_servers(nsd, server_region, netio,
2700 						&nsd->xfrd_listener->fd);
2701 				} else if (child_pid == reload_pid) {
2702 					sig_atomic_t cmd = NSD_RELOAD_FAILED;
2703 					pid_t mypid;
2704 					log_msg(LOG_WARNING,
2705 					       "Reload process %d failed with status %d, continuing with old database",
2706 					       (int) child_pid, status);
2707 #ifdef HAVE_SETPROCTITLE
2708 					setproctitle("main");
2709 #endif
2710 #ifdef USE_LOG_PROCESS_ROLE
2711 					log_set_process_role("main");
2712 #endif
2713 					reload_pid = -1;
2714 					if(reload_listener.fd != -1) close(reload_listener.fd);
2715 					netio_remove_handler(netio, &reload_listener);
2716 					reload_listener.fd = -1;
2717 					reload_listener.event_types = NETIO_EVENT_NONE;
2718 					task_process_sync(nsd->task[nsd->mytask]);
2719 					/* inform xfrd reload attempt ended */
2720 					if(!write_socket(nsd->xfrd_listener->fd,
2721 						&cmd, sizeof(cmd))) {
2722 						log_msg(LOG_ERR, "problems "
2723 						  "sending SOAEND to xfrd: %s",
2724 						  strerror(errno));
2725 					}
2726 					mypid = getpid();
2727 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2728 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2729 							strerror(errno));
2730 					}
2731 #ifdef USE_DNSTAP
2732 				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2733 					log_msg(LOG_WARNING,
2734 					       "dnstap-collector %d terminated with status %d",
2735 					       (int) child_pid, status);
2736 					if(nsd->dt_collector) {
2737 						dt_collector_close(nsd->dt_collector, nsd);
2738 						dt_collector_destroy(nsd->dt_collector, nsd);
2739 						nsd->dt_collector = NULL;
2740 					}
2741 					/* Only respawn a crashed (or exited)
2742 					 * dnstap-collector when not reloading,
2743 					 * to not induce a reload during a
2744 					 * reload (which would seriously
2745 					 * disrupt nsd procedures and lead to
2746 					 * unpredictable results)!
2747 					 *
2748 					 * This will *leave* a dnstap-collector
2749 					 * process terminated, but because
2750 					 * signalling of the reload process to
2751 					 * the main process to respawn in this
2752 					 * situation will be cumbersome, and
2753 					 * because this situation is so
2754 					 * specific (and therefore hopefully
2755 					 * extremely rare or non-existing at
2756 					 * all), plus the fact that we are left
2757 					 * with a perfectly functioning NSD
2758 					 * (besides not logging dnstap
2759 					 * messages), I consider it acceptable
2760 					 * to leave this unresolved.
2761 					 */
2762 					if(reload_pid == -1 && nsd->options->dnstap_enable) {
2763 						nsd->dt_collector = dt_collector_create(nsd);
2764 						dt_collector_start(nsd->dt_collector, nsd);
2765 						nsd->mode = NSD_RELOAD_REQ;
2766 					}
2767 #endif
2768 				} else if(status != 0) {
2769 					/* check for status, because we get
2770 					 * the old-servermain because reload
2771 					 * is the process-parent of old-main,
2772 					 * and we get older server-processes
2773 					 * that are exiting after a reload */
2774 					log_msg(LOG_WARNING,
2775 					       "process %d terminated with status %d",
2776 					       (int) child_pid, status);
2777 				}
2778 			}
2779 			if (child_pid == -1) {
2780 				if (errno == EINTR) {
2781 					continue;
2782 				}
2783 				if (errno != ECHILD)
2784 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2785 			}
2786 			if (nsd->mode != NSD_RUN)
2787 				break;
2788 
2789 			/* timeout to collect processes, in case no SIGCHLD happens */
2790 			timeout_spec.tv_sec = 1;
2791 			timeout_spec.tv_nsec = 0;
2792 
2793 			/* listen on ports, timeout for collecting terminated children */
2794 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2795 				if (errno != EINTR) {
2796 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2797 				}
2798 			}
2799 			if(nsd->restart_children) {
2800 				restart_child_servers(nsd, server_region, netio,
2801 					&nsd->xfrd_listener->fd);
2802 				nsd->restart_children = 0;
2803 			}
2804 			if(nsd->reload_failed) {
2805 				sig_atomic_t cmd = NSD_RELOAD_FAILED;
2806 				pid_t mypid;
2807 				nsd->reload_failed = 0;
2808 				log_msg(LOG_WARNING,
2809 				       "Reload process %d failed, continuing with old database",
2810 				       (int) reload_pid);
2811 #ifdef HAVE_SETPROCTITLE
2812 				setproctitle("main");
2813 #endif
2814 #ifdef USE_LOG_PROCESS_ROLE
2815 				log_set_process_role("main");
2816 #endif
2817 				reload_pid = -1;
2818 				if(reload_listener.fd != -1) close(reload_listener.fd);
2819 				netio_remove_handler(netio, &reload_listener);
2820 				reload_listener.fd = -1;
2821 				reload_listener.event_types = NETIO_EVENT_NONE;
2822 				task_process_sync(nsd->task[nsd->mytask]);
2823 				/* inform xfrd reload attempt ended */
2824 				if(!write_socket(nsd->xfrd_listener->fd,
2825 					&cmd, sizeof(cmd))) {
2826 					log_msg(LOG_ERR, "problems "
2827 					  "sending SOAEND to xfrd: %s",
2828 					  strerror(errno));
2829 				}
2830 				mypid = getpid();
2831 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2832 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2833 						strerror(errno));
2834 				}
2835 			}
2836 
2837 			break;
2838 		case NSD_RELOAD_REQ: {
2839 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2840 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2841 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2842 				"main: ipc send reload_req to xfrd"));
2843 			if(!write_socket(nsd->xfrd_listener->fd,
2844 				&cmd, sizeof(cmd))) {
2845 				log_msg(LOG_ERR, "server_main: could not send "
2846 				"reload_req to xfrd: %s", strerror(errno));
2847 			}
2848 			nsd->mode = NSD_RUN;
2849 			} break;
2850 		case NSD_RELOAD:
2851 			/* Continue to run nsd after reload */
2852 			nsd->mode = NSD_RUN;
2853 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2854 			if (reload_pid != -1) {
2855 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2856 				       (int) reload_pid);
2857 				break;
2858 			}
2859 
2860 			/* switch mytask to keep track of which side owns the taskdb */
2861 			nsd->mytask = 1 - nsd->mytask;
2862 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2863 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2864 				reload_pid = -1;
2865 				break;
2866 			}
2867 
2868 			/* Do actual reload */
2869 			reload_pid = fork();
2870 			switch (reload_pid) {
2871 			case -1:
2872 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2873 				break;
2874 			default:
2875 				/* PARENT */
2876 				close(reload_sockets[0]);
2877 #ifdef HAVE_SETPROCTITLE
2878 				setproctitle("load");
2879 #endif
2880 #ifdef USE_LOG_PROCESS_ROLE
2881 				log_set_process_role("load");
2882 #endif
2883 				server_reload(nsd, server_region, netio,
2884 					reload_sockets[1]);
2885 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2886 #ifdef HAVE_SETPROCTITLE
2887 				setproctitle("main");
2888 #endif
2889 #ifdef USE_LOG_PROCESS_ROLE
2890 				log_set_process_role("main");
2891 #endif
2892 				close(reload_sockets[1]);
2893 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2894 				/* drop stale xfrd ipc data */
2895 				((struct ipc_handler_conn_data*)nsd->
2896 					xfrd_listener->user_data)
2897 					->conn->is_reading = 0;
2898 				reload_pid = -1;
2899 				reload_listener.fd = -1;
2900 				reload_listener.event_types = NETIO_EVENT_NONE;
2901 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2902 				break;
2903 			case 0:
2904 				/* CHILD */
2905 				/* server_main keeps running until NSD_QUIT_SYNC
2906 				 * is received from reload. */
2907 				close(reload_sockets[1]);
2908 #ifdef HAVE_SETPROCTITLE
2909 				setproctitle("old-main");
2910 #endif
2911 #ifdef USE_LOG_PROCESS_ROLE
2912 				log_set_process_role("old-main");
2913 #endif
2914 				reload_listener.fd = reload_sockets[0];
2915 				reload_listener.timeout = NULL;
2916 				reload_listener.user_data = nsd;
2917 				reload_listener.event_types = NETIO_EVENT_READ;
2918 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2919 				netio_add_handler(netio, &reload_listener);
2920 				reload_pid = getppid();
2921 				break;
2922 			}
2923 			break;
2924 		case NSD_QUIT_SYNC:
2925 			/* synchronisation of xfrd, parent and reload */
2926 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2927 				sig_atomic_t cmd = NSD_RELOAD;
2928 				/* stop xfrd ipc writes in progress */
2929 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2930 					"main: ipc send indication reload"));
2931 				if(!write_socket(nsd->xfrd_listener->fd,
2932 					&cmd, sizeof(cmd))) {
2933 					log_msg(LOG_ERR, "server_main: could not send reload "
2934 					"indication to xfrd: %s", strerror(errno));
2935 				}
2936 				/* wait for ACK from xfrd */
2937 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2938 				nsd->quit_sync_done = 1;
2939 			}
2940 			nsd->mode = NSD_RUN;
2941 			break;
2942 		case NSD_QUIT:
2943 			/* silent shutdown during reload */
2944 			if(reload_listener.fd != -1) {
2945 				/* acknowledge the quit, to sync reload that we will really quit now */
2946 				sig_atomic_t cmd = NSD_RELOAD;
2947 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2948 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2949 					log_msg(LOG_ERR, "server_main: "
2950 						"could not ack quit: %s", strerror(errno));
2951 				}
2952 				close(reload_listener.fd);
2953 			}
2954 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2955 			/* only quit children after xfrd has acked */
2956 			send_children_quit(nsd);
2957 
2958 #ifdef MEMCLEAN /* OS collects memory pages */
2959 			region_destroy(server_region);
2960 #endif
2961 			server_shutdown(nsd);
2962 
2963 			/* ENOTREACH */
2964 			break;
2965 		case NSD_SHUTDOWN:
2966 			break;
2967 		case NSD_REAP_CHILDREN:
2968 			/* continue; wait for child in run loop */
2969 			nsd->mode = NSD_RUN;
2970 			break;
2971 		case NSD_STATS:
2972 #ifdef BIND8_STATS
2973 			set_children_stats(nsd);
2974 #endif
2975 			nsd->mode = NSD_RUN;
2976 			break;
2977 		default:
2978 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2979 			nsd->mode = NSD_RUN;
2980 			break;
2981 		}
2982 	}
2983 	log_msg(LOG_WARNING, "signal received, shutting down...");
2984 
2985 	/* close opened ports to avoid race with restart of nsd */
2986 	server_close_all_sockets(nsd->udp, nsd->ifs);
2987 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2988 	daemon_remote_close(nsd->rc);
2989 	send_children_quit_and_wait(nsd);
2990 
2991 	/* Unlink it if possible... */
2992 	unlinkpid(nsd->pidfile);
2993 	unlink(nsd->task[0]->fname);
2994 	unlink(nsd->task[1]->fname);
2995 #ifdef USE_ZONE_STATS
2996 	unlink(nsd->zonestatfname[0]);
2997 	unlink(nsd->zonestatfname[1]);
2998 #endif
2999 #ifdef BIND8_STATS
3000 	server_stat_free(nsd);
3001 #endif
3002 #ifdef USE_DNSTAP
3003 	dt_collector_close(nsd->dt_collector, nsd);
3004 #endif
3005 
3006 	if(reload_listener.fd != -1) {
3007 		sig_atomic_t cmd = NSD_QUIT;
3008 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
3009 			"main: ipc send quit to reload-process"));
3010 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
3011 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
3012 				strerror(errno));
3013 		}
3014 		fsync(reload_listener.fd);
3015 		close(reload_listener.fd);
3016 		/* wait for reload to finish processing */
3017 		while(1) {
3018 			if(waitpid(reload_pid, NULL, 0) == -1) {
3019 				if(errno == EINTR) continue;
3020 				if(errno == ECHILD) break;
3021 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
3022 					(int)reload_pid, strerror(errno));
3023 			}
3024 			break;
3025 		}
3026 	}
3027 	if(nsd->xfrd_listener->fd != -1) {
3028 		/* complete quit, stop xfrd */
3029 		sig_atomic_t cmd = NSD_QUIT;
3030 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
3031 			"main: ipc send quit to xfrd"));
3032 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
3033 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
3034 				strerror(errno));
3035 		}
3036 		fsync(nsd->xfrd_listener->fd);
3037 		close(nsd->xfrd_listener->fd);
3038 		(void)kill(nsd->pid, SIGTERM);
3039 	}
3040 
3041 #ifdef MEMCLEAN /* OS collects memory pages */
3042 	region_destroy(server_region);
3043 #endif
3044 	server_shutdown(nsd);
3045 }
3046 
3047 static query_state_type
3048 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
3049 {
3050 	return query_process(query, nsd, now_p);
3051 }
3052 
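/* Note on the RATELIMIT path below: queries carrying a valid DNS cookie
 * (RFC 7873) bypass response rate limiting, since a valid cookie shows
 * the client address was not spoofed. */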
3053 static query_state_type
3054 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
3055 {
3056 #ifdef RATELIMIT
3057 	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
3058 		if(query->edns.cookie_status != COOKIE_VALID
3059 		&& query->edns.cookie_status != COOKIE_VALID_REUSE
3060 		&& rrl_process_query(query))
3061 			return rrl_slip(query);
3062 		else	return QUERY_PROCESSED;
3063 	}
3064 	return QUERY_DISCARDED;
3065 #else
3066 	return query_process(query, nsd, now_p);
3067 #endif
3068 }
3069 
3070 const char*
3071 nsd_event_vs(void)
3072 {
3073 #ifdef USE_MINI_EVENT
3074 	return "";
3075 #else
3076 	return event_get_version();
3077 #endif
3078 }
3079 
3080 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
3081 static const char* ub_ev_backend2str(int b)
3082 {
3083 	switch(b) {
3084 	case EVBACKEND_SELECT:	return "select";
3085 	case EVBACKEND_POLL:	return "poll";
3086 	case EVBACKEND_EPOLL:	return "epoll";
3087 	case EVBACKEND_KQUEUE:	return "kqueue";
3088 	case EVBACKEND_DEVPOLL: return "devpoll";
3089 	case EVBACKEND_PORT:	return "evport";
3090 	}
3091 	return "unknown";
3092 }
3093 #endif
3094 
3095 const char*
3096 nsd_event_method(void)
3097 {
3098 #ifdef USE_MINI_EVENT
3099 	return "select";
3100 #else
3101 	struct event_base* b = nsd_child_event_base();
3102 	const char* m;
3103 #  ifdef EV_FEATURE_BACKENDS
3104 	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
3105 #  elif defined(HAVE_EVENT_BASE_GET_METHOD)
3106 	m = event_base_get_method(b);
3107 #  else
3108 	m = "?";
3109 #  endif
3110 #  ifdef MEMCLEAN
3111 	event_base_free(b);
3112 #  endif
3113 	return m;
3114 #endif
3115 }
3116 
3117 struct event_base*
3118 nsd_child_event_base(void)
3119 {
3120 	struct event_base* base;
3121 #ifdef USE_MINI_EVENT
3122 	static time_t secs;
3123 	static struct timeval now;
3124 	base = event_init(&secs, &now);
3125 #else
3126 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
3127 	/* libev */
3128 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
3129 #  else
3130 	/* libevent */
3131 #    ifdef HAVE_EVENT_BASE_NEW
3132 	base = event_base_new();
3133 #    else
3134 	base = event_init();
3135 #    endif
3136 #  endif
3137 #endif
3138 	return base;
3139 }
3140 
3141 static void
3142 add_udp_handler(
3143 	struct nsd *nsd,
3144 	struct nsd_socket *sock,
3145 	struct udp_handler_data *data)
3146 {
3147 	struct event *handler = &data->event;
3148 
3149 	data->nsd = nsd;
3150 	data->socket = sock;
3151 
3152 	if(nsd->options->proxy_protocol_port &&
3153 		sockaddr_uses_proxy_protocol_port(nsd->options,
3154 		(struct sockaddr *)&sock->addr.ai_addr)) {
3155 		data->pp2_enabled = 1;
3156 	}
3157 
3158 	memset(handler, 0, sizeof(*handler));
3159 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
3160 	if(event_base_set(nsd->event_base, handler) != 0)
3161 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
3162 	if(event_add(handler, NULL) != 0)
3163 		log_msg(LOG_ERR, "nsd udp: event_add failed");
3164 }
3165 
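/*
 * Example nsd.conf fragment (the port number is illustrative) that turns
 * on the pp2_enabled PROXYv2 handling in the handlers above and below:
 *
 *	server:
 *		proxy-protocol-port: 1053
 */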
3166 void
3167 add_tcp_handler(
3168 	struct nsd *nsd,
3169 	struct nsd_socket *sock,
3170 	struct tcp_accept_handler_data *data)
3171 {
3172 	struct event *handler = &data->event;
3173 
3174 	data->nsd = nsd;
3175 	data->socket = sock;
3176 
3177 	if(nsd->options->proxy_protocol_port &&
3178 		sockaddr_uses_proxy_protocol_port(nsd->options,
3179 		(struct sockaddr *)&sock->addr.ai_addr)) {
3180 		data->pp2_enabled = 1;
3181 	}
3182 
3183 #ifdef HAVE_SSL
3184 	if (nsd->tls_ctx &&
3185 	    nsd->options->tls_port &&
3186 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
3187 	{
3188 		data->tls_accept = 1;
3189 		if(verbosity >= 2) {
3190 			char buf[48];
3191 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
3192 			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
3193 		}
3194 	} else {
3195 		data->tls_accept = 0;
3196 	}
3197 #endif
3198 
3199 	memset(handler, 0, sizeof(*handler));
3200 	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
3201 	if(event_base_set(nsd->event_base, handler) != 0)
3202 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
3203 	if(event_add(handler, NULL) != 0)
3204 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
3205 	data->event_added = 1;
3206 }
3207 
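/*
 * Example nsd.conf fragment (the verifier command is illustrative) that
 * enables zone verification, and with it the server_verify path below:
 *
 *	verify:
 *		enable: yes
 *		verifier: ldns-verify-zone
 *		verifier-count: 1
 */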
3208 /*
3209  * Serve DNS request to verifiers (short-lived)
3210  */
3211 void server_verify(struct nsd *nsd, int cmdsocket)
3212 {
3213 	size_t size = 0;
3214 	struct event cmd_event, signal_event, exit_event;
3215 	struct zone *zone;
3216 
3217 	assert(nsd != NULL);
3218 
3219 	zone = verify_next_zone(nsd, NULL);
3220 	if(zone == NULL)
3221 		return;
3222 
3223 	nsd->server_region = region_create(xalloc, free);
3224 	nsd->event_base = nsd_child_event_base();
3225 
3226 	nsd->next_zone_to_verify = zone;
3227 	nsd->verifier_count = 0;
3228 	nsd->verifier_limit = nsd->options->verifier_count;
3229 	size = sizeof(struct verifier) * nsd->verifier_limit;
3230 	if(pipe(nsd->verifier_pipe) == -1) {
3231 		log_msg(LOG_ERR, "verify: could not create pipe: %s",
3232 				strerror(errno));
3233 		goto fail_pipe;
3234 	}
3235 	fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC);
3236 	fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC);
3237 	nsd->verifiers = region_alloc_zero(nsd->server_region, size);
3238 
3239 	for(size_t i = 0; i < nsd->verifier_limit; i++) {
3240 		nsd->verifiers[i].nsd = nsd;
3241 		nsd->verifiers[i].zone = NULL;
3242 		nsd->verifiers[i].pid = -1;
3243 		nsd->verifiers[i].output_stream.fd = -1;
3244 		nsd->verifiers[i].output_stream.priority = LOG_INFO;
3245 		nsd->verifiers[i].error_stream.fd = -1;
3246 		nsd->verifiers[i].error_stream.priority = LOG_ERR;
3247 	}
3248 
3249 	event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd);
3250 	if(event_base_set(nsd->event_base, &cmd_event) != 0 ||
3251 	   event_add(&cmd_event, NULL) != 0)
3252 	{
3253 		log_msg(LOG_ERR, "verify: could not add command event");
3254 		goto fail;
3255 	}
3256 
3257 	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd);
3258 	if(event_base_set(nsd->event_base, &signal_event) != 0 ||
3259 	   signal_add(&signal_event, NULL) != 0)
3260 	{
3261 		log_msg(LOG_ERR, "verify: could not add signal event");
3262 		goto fail;
3263 	}
3264 
3265 	event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd);
3266 	if(event_base_set(nsd->event_base, &exit_event) != 0 ||
3267 	   event_add(&exit_event, NULL) != 0)
3268 	{
3269 		log_msg(LOG_ERR, "verify: could not add exit event");
3270 		goto fail;
3271 	}
3272 
3273 	memset(msgs, 0, sizeof(msgs));
3274 	for (int i = 0; i < NUM_RECV_PER_SELECT; i++) {
3275 		queries[i] = query_create(nsd->server_region,
3276 			compressed_dname_offsets,
3277 			compression_table_size, compressed_dnames);
3278 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3279 		iovecs[i].iov_base = buffer_begin(queries[i]->packet);
3280 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3281 		msgs[i].msg_hdr.msg_iov = &iovecs[i];
3282 		msgs[i].msg_hdr.msg_iovlen = 1;
3283 		msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr;
3284 		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3285 	}
3286 
3287 	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3288 		struct udp_handler_data *data;
3289 		data = region_alloc_zero(
3290 			nsd->server_region, sizeof(*data));
3291 		add_udp_handler(nsd, &nsd->verify_udp[i], data);
3292 	}
3293 
3294 	tcp_accept_handler_count = nsd->verify_ifs;
3295 	tcp_accept_handlers = region_alloc_array(nsd->server_region,
3296 		nsd->verify_ifs, sizeof(*tcp_accept_handlers));
3297 
3298 	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3299 		struct tcp_accept_handler_data *data;
3300 		data = &tcp_accept_handlers[i];
3301 		memset(data, 0, sizeof(*data));
3302 		add_tcp_handler(nsd, &nsd->verify_tcp[i], data);
3303 	}
3304 
3305 	while(nsd->next_zone_to_verify != NULL &&
3306 	      nsd->verifier_count < nsd->verifier_limit)
3307 	{
3308 		verify_zone(nsd, nsd->next_zone_to_verify);
3309 		nsd->next_zone_to_verify
3310 			= verify_next_zone(nsd, nsd->next_zone_to_verify);
3311 	}
3312 
3313 	/* short-lived main loop */
3314 	event_base_dispatch(nsd->event_base);
3315 
3316 	/* remove command and exit event handlers */
3317 	event_del(&exit_event);
3318 	event_del(&signal_event);
3319 	event_del(&cmd_event);
3320 
3321 	assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT);
3322 	assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT);
3323 fail:
3324 	close(nsd->verifier_pipe[0]);
3325 	close(nsd->verifier_pipe[1]);
3326 fail_pipe:
3327 	event_base_free(nsd->event_base);
3328 	region_destroy(nsd->server_region);
3329 
3330 	nsd->event_base = NULL;
3331 	nsd->server_region = NULL;
3332 	nsd->verifier_limit = 0;
3333 	nsd->verifier_pipe[0] = -1;
3334 	nsd->verifier_pipe[1] = -1;
3335 	nsd->verifiers = NULL;
3336 }
3337 
3338 /*
3339  * Serve DNS requests.
3340  */
3341 void
3342 server_child(struct nsd *nsd)
3343 {
3344 	size_t i, from, numifs;
3345 	region_type *server_region = region_create(xalloc, free);
3346 	struct event_base* event_base = nsd_child_event_base();
3347 	sig_atomic_t mode;
3348 #ifdef USE_LOG_PROCESS_ROLE
3349 	static char child_name[20];
3350 #endif
3351 
3352 	if(!event_base) {
3353 		log_msg(LOG_ERR, "nsd server could not create event base");
3354 		exit(1);
3355 	}
3356 	nsd->event_base = event_base;
3357 	nsd->server_region = server_region;
3358 
3359 #ifdef RATELIMIT
3360 	rrl_init(nsd->this_child->child_num);
3361 #endif
3362 
3363 	assert(nsd->server_kind != NSD_SERVER_MAIN);
3364 
3365 #ifdef HAVE_SETPROCTITLE
3366 	setproctitle("server %d", nsd->this_child->child_num + 1);
3367 #endif
3368 #ifdef USE_LOG_PROCESS_ROLE
3369 	snprintf(child_name, sizeof(child_name), "srv%d",
3370 		nsd->this_child->child_num + 1);
3371 	log_set_process_role(child_name);
3372 #endif
3373 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
3374 
3375 #ifdef HAVE_CPUSET_T
3376 	if(nsd->use_cpu_affinity) {
3377 		set_cpu_affinity(nsd->this_child->cpuset);
3378 	}
3379 #endif
3380 #ifdef BIND8_STATS
3381 	nsd->st = &nsd->stats_per_child[nsd->stat_current]
3382 		[nsd->this_child->child_num];
3383 	nsd->st->boot = nsd->stat_map[0].boot;
3384 	memcpy(&nsd->stat_proc, nsd->st, sizeof(nsd->stat_proc));
3385 #endif
3386 
3387 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
3388 		server_close_all_sockets(nsd->tcp, nsd->ifs);
3389 	}
3390 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
3391 		server_close_all_sockets(nsd->udp, nsd->ifs);
3392 	}
3393 
3394 	if (nsd->this_child->parent_fd != -1) {
3395 		struct event *handler;
3396 		struct ipc_handler_conn_data* user_data =
3397 			(struct ipc_handler_conn_data*)region_alloc(
3398 			server_region, sizeof(struct ipc_handler_conn_data));
3399 		user_data->nsd = nsd;
3400 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
3401 
3402 		handler = (struct event*) region_alloc(
3403 			server_region, sizeof(*handler));
3404 		memset(handler, 0, sizeof(*handler));
3405 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
3406 			EV_READ, child_handle_parent_command, user_data);
3407 		if(event_base_set(event_base, handler) != 0)
3408 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
3409 		if(event_add(handler, NULL) != 0)
3410 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
3411 	}
3412 
3413 	if(nsd->reuseport) {
3414 		numifs = nsd->ifs / nsd->reuseport;
3415 		from = numifs * nsd->this_child->child_num;
3416 		if(from+numifs > nsd->ifs) { /* should not happen */
3417 			from = 0;
3418 			numifs = nsd->ifs;
3419 		}
3420 	} else {
3421 		from = 0;
3422 		numifs = nsd->ifs;
3423 	}
3424 
3425 	if (nsd->server_kind & NSD_SERVER_UDP) {
3426 		int child = nsd->this_child->child_num;
3427 		memset(msgs, 0, sizeof(msgs));
3428 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3429 			queries[i] = query_create(server_region,
3430 				compressed_dname_offsets,
3431 				compression_table_size, compressed_dnames);
3432 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3433 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
3434 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
3435 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
3436 			msgs[i].msg_hdr.msg_iovlen  = 1;
3437 			msgs[i].msg_hdr.msg_name    = &queries[i]->remote_addr;
3438 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3439 		}
3440 
3441 		for (i = 0; i < nsd->ifs; i++) {
3442 			int listen;
3443 			struct udp_handler_data *data;
3444 
3445 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
3446 
3447 			if(i >= from && i < (from + numifs) && listen) {
3448 				data = region_alloc_zero(
3449 					nsd->server_region, sizeof(*data));
3450 				add_udp_handler(nsd, &nsd->udp[i], data);
3451 			} else {
3452 				/* close sockets intended for other servers */
3453 				server_close_socket(&nsd->udp[i]);
3454 			}
3455 		}
3456 	}
3457 
3458 	/*
3459 	 * Keep track of all the TCP accept handlers so we can enable
3460 	 * and disable them based on the current number of active TCP
3461 	 * connections.
3462 	 */
3463 	if (nsd->server_kind & NSD_SERVER_TCP) {
3464 		int child = nsd->this_child->child_num;
3465 		tcp_accept_handler_count = numifs;
3466 		tcp_accept_handlers = region_alloc_array(server_region,
3467 			numifs, sizeof(*tcp_accept_handlers));
3468 
3469 		for (i = 0; i < nsd->ifs; i++) {
3470 			int listen;
3471 			struct tcp_accept_handler_data *data;
3472 
3473 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
3474 
3475 			if(i >= from && i < (from + numifs) && listen) {
3476 				data = &tcp_accept_handlers[i-from];
3477 				memset(data, 0, sizeof(*data));
3478 				add_tcp_handler(nsd, &nsd->tcp[i], data);
3479 			} else {
3480 				/* close sockets intended for other servers */
3481 				/*
3482 				 * uncomment this once tcp servers are no
3483 				 * longer copied in the tcp fd copy line
3484 				 * in server_init().
3485 				server_close_socket(&nsd->tcp[i]);
3486 				*/
3487 				/* close sockets not meant for this server */
3488 				if(!listen)
3489 					server_close_socket(&nsd->tcp[i]);
3490 			}
3491 		}
3492 	} else {
3493 		tcp_accept_handler_count = 0;
3494 	}
3495 
3496 	/* The main loop... */
3497 	while ((mode = nsd->mode) != NSD_QUIT) {
3498 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
3499 
3500 		/* Do we need to do the statistics... */
3501 		if (mode == NSD_STATS) {
3502 #ifdef BIND8_STATS
3503 			int p = nsd->st_period;
3504 			nsd->st_period = 1; /* force stats printout */
3505 			/* Dump the statistics */
3506 			bind8_stats(nsd);
3507 			nsd->st_period = p;
3508 #else /* !BIND8_STATS */
3509 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3510 #endif /* BIND8_STATS */
3511 
3512 			nsd->mode = NSD_RUN;
3513 		}
3514 		else if (mode == NSD_REAP_CHILDREN) {
3515 			/* got signal, notify parent. parent reaps terminated children. */
3516 			if (nsd->this_child->parent_fd != -1) {
3517 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3518 				if (write(nsd->this_child->parent_fd,
3519 				    &parent_notify,
3520 				    sizeof(parent_notify)) == -1)
3521 				{
3522 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3523 						(int) nsd->this_child->pid, strerror(errno));
3524 				}
3525 			} else /* no parent, so reap 'em */
3526 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3527 			nsd->mode = NSD_RUN;
3528 		}
3529 		else if(mode == NSD_RUN) {
3530 			/* Wait for a query... */
3531 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3532 				if (errno != EINTR) {
3533 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3534 					break;
3535 				}
3536 			}
3537 		} else if(mode == NSD_QUIT) {
3538 			/* ignore here, quit */
3539 		} else {
3540 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
3541 				(int)mode);
3542 			nsd->mode = NSD_RUN;
3543 		}
3544 	}
3545 
3546 	service_remaining_tcp(nsd);
3547 #ifdef	BIND8_STATS
3548 	bind8_stats(nsd);
3549 #endif /* BIND8_STATS */
3550 
3551 #ifdef MEMCLEAN /* OS collects memory pages */
3552 #ifdef RATELIMIT
3553 	rrl_deinit(nsd->this_child->child_num);
3554 #endif
3555 	event_base_free(event_base);
3556 	region_destroy(server_region);
3557 #endif
3558 	server_shutdown(nsd);
3559 }
3560 
remaining_tcp_timeout(int ATTR_UNUSED (fd),short event,void * arg)3561 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3562 {
3563 	int* timed_out = (int*)arg;
3564 	assert(event & EV_TIMEOUT); (void)event;
3565 	/* wake up the service_remaining_tcp() loop; note the event is
3566 	 * no longer registered */
3567 	*timed_out = 1;
3568 }
3569 
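/*
 * Drain the established TCP connections before this (old) child exits:
 * re-register every connection on tcp_active_list with a fresh event
 * base, refuse further queries on them, cap their timeouts at 3
 * seconds, and loop until they all complete or a 3-second tick expires.
 */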
3570 void
service_remaining_tcp(struct nsd * nsd)3571 service_remaining_tcp(struct nsd* nsd)
3572 {
3573 	struct tcp_handler_data* p;
3574 	struct event_base* event_base;
3575 	/* check if it is needed */
3576 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3577 		return;
3578 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3579 #ifdef USE_DNSTAP
3580 	/* remove dnstap collector, we cannot write there because the new
3581 	 * child process is using the file descriptor, or the child
3582 	 * process after that. */
3583 	dt_collector_destroy(nsd->dt_collector, nsd);
3584 	nsd->dt_collector = NULL;
3585 #endif
3586 	/* setup event base */
3587 	event_base = nsd_child_event_base();
3588 	if(!event_base) {
3589 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3590 		return;
3591 	}
3592 	/* register tcp connections */
3593 	for(p = tcp_active_list; p != NULL; p = p->next) {
3594 		struct timeval timeout;
3595 		int fd = p->event.ev_fd;
3596 #ifdef USE_MINI_EVENT
3597 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3598 #else
3599 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3600 #endif
3601 		void (*fn)(int, short, void*);
3602 #ifdef HAVE_SSL
3603 		if(p->tls) {
3604 			if((event&EV_READ))
3605 				fn = handle_tls_reading;
3606 			else	fn = handle_tls_writing;
3607 		} else {
3608 #endif
3609 			if((event&EV_READ))
3610 				fn = handle_tcp_reading;
3611 			else	fn = handle_tcp_writing;
3612 #ifdef HAVE_SSL
3613 		}
3614 #endif
3615 
3616 		p->tcp_no_more_queries = 1;
3617 		/* cap the timeout at 3 seconds (previously 1/10 second) */
3618 		if(p->tcp_timeout > 3000)
3619 			p->tcp_timeout = 3000;
3620 		timeout.tv_sec = p->tcp_timeout / 1000;
3621 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3622 		event_del(&p->event);
3623 		memset(&p->event, 0, sizeof(p->event));
3624 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3625 			fn, p);
3626 		if(event_base_set(event_base, &p->event) != 0)
3627 			log_msg(LOG_ERR, "event base set failed");
3628 		if(event_add(&p->event, &timeout) != 0)
3629 			log_msg(LOG_ERR, "event add failed");
3630 	}
3631 
3632 	/* handle it */
3633 	while(nsd->current_tcp_count > 0) {
3634 		mode_t m = server_signal_mode(nsd);
3635 		struct event timeout;
3636 		struct timeval tv;
3637 		int timed_out = 0;
3638 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3639 			m == NSD_REAP_CHILDREN) {
3640 			/* quit */
3641 			break;
3642 		}
3643 		/* timer */
3644 		/* have to do something every 3 seconds */
3645 		tv.tv_sec = 3;
3646 		tv.tv_usec = 0;
3647 		memset(&timeout, 0, sizeof(timeout));
3648 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3649 			&timed_out);
3650 		if(event_base_set(event_base, &timeout) != 0)
3651 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3652 		if(event_add(&timeout, &tv) != 0)
3653 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3654 
3655 		/* service loop */
3656 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3657 			if (errno != EINTR) {
3658 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3659 				break;
3660 			}
3661 		}
3662 		if(!timed_out) {
3663 			event_del(&timeout);
3664 		} else {
3665 			/* timed out, quit */
3666 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3667 			break;
3668 		}
3669 	}
3670 #ifdef MEMCLEAN
3671 	event_base_free(event_base);
3672 #endif
3673 	/* continue to quit after return */
3674 }
3675 
3676 /* Implement recvmmsg and sendmmsg if the platform does not. These functions
3677  * are always used, even if nonblocking operations are broken, in which case
3678  * NUM_RECV_PER_SELECT is defined to 1 (one).
3679  */
3680 #if defined(HAVE_RECVMMSG)
3681 #define nsd_recvmmsg recvmmsg
3682 #else /* !HAVE_RECVMMSG */
3683 
3684 static int
nsd_recvmmsg(int sockfd,struct mmsghdr * msgvec,unsigned int vlen,int flags,struct timespec * timeout)3685 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3686              int flags, struct timespec *timeout)
3687 {
3688 	unsigned int vpos = 0;
3689 	ssize_t rcvd;
3690 
3691 	/* timeout is ignored, ensure caller does not expect it to work */
3692 	assert(timeout == NULL); (void)timeout;
3693 
3694 	while(vpos < vlen) {
3695 		rcvd = recvfrom(sockfd,
3696 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3697 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3698 		                flags,
3699 		                msgvec[vpos].msg_hdr.msg_name,
3700 		               &msgvec[vpos].msg_hdr.msg_namelen);
3701 		if(rcvd < 0) {
3702 			break;
3703 		} else {
3704 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3705 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3706 			vpos++;
3707 		}
3708 	}
3709 
3710 	if(vpos) {
3711 		/* error will be picked up next time */
3712 		return (int)vpos;
3713 	} else if(errno == 0) {
3714 		return 0;
3715 	} else if(errno == EAGAIN) {
3716 		return 0;
3717 	}
3718 
3719 	return -1;
3720 }
3721 #endif /* HAVE_RECVMMSG */
3722 
3723 #ifdef HAVE_SENDMMSG
3724 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3725 #else /* !HAVE_SENDMMSG */
3726 
3727 static int
nsd_sendmmsg(int sockfd,struct mmsghdr * msgvec,unsigned int vlen,int flags)3728 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3729 {
3730 	unsigned int vpos = 0;
3731 	ssize_t snd;
3732 
3733 	while(vpos < vlen) {
3734 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3735 		snd = sendto(sockfd,
3736 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3737 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3738 		             flags,
3739 		             msgvec[vpos].msg_hdr.msg_name,
3740 		             msgvec[vpos].msg_hdr.msg_namelen);
3741 		if(snd < 0) {
3742 			break;
3743 		} else {
3744 			msgvec[vpos].msg_len = (unsigned int)snd;
3745 			vpos++;
3746 		}
3747 	}
3748 
3749 	if(vpos) {
3750 		return (int)vpos;
3751 	} else if(errno == 0) {
3752 		return 0;
3753 	}
3754 
3755 	return -1;
3756 }
3757 #endif /* HAVE_SENDMMSG */
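/*
 * Illustrative sketch (not part of the build) of how a caller drives
 * the wrappers above with a one-element batch; the buffer size and the
 * echo behaviour are assumptions made up for the example:
 *
 *	char buf[512];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct sockaddr_storage from;
 *	struct mmsghdr msg;
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_hdr.msg_iov = &iov;
 *	msg.msg_hdr.msg_iovlen = 1;
 *	msg.msg_hdr.msg_name = &from;
 *	msg.msg_hdr.msg_namelen = sizeof(from);
 *	if(nsd_recvmmsg(fd, &msg, 1, 0, NULL) == 1) {
 *		iov.iov_len = msg.msg_len;	/* answer with what was read */
 *		(void)nsd_sendmmsg(fd, &msg, 1, 0);
 *	}
 */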
3758 
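/* Return whether the remote port in addr is zero; handle_udp() below
 * uses this to squelch sendmmsg EINVAL log noise for such addresses
 * at low verbosity. */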
3759 static int
port_is_zero(struct sockaddr_storage * addr)3760 port_is_zero(
3761 #ifdef INET6
3762         struct sockaddr_storage *addr
3763 #else
3764         struct sockaddr_in *addr
3765 #endif
3766 	)
3767 {
3768 #ifdef INET6
3769 	if(addr->ss_family == AF_INET6) {
3770 		return (((struct sockaddr_in6 *)addr)->sin6_port) == 0;
3771 	} else if(addr->ss_family == AF_INET) {
3772 		return (((struct sockaddr_in *)addr)->sin_port) == 0;
3773 	}
3774 	return 0;
3775 #else
3776 	if(addr->sin_family == AF_INET) {
3777 		return addr->sin_port == 0;
3778 	}
3779 	return 0;
3780 #endif
3781 }
3782 
3783 /* Parses the PROXYv2 header from buf and updates the query's
3784  * client address. Returns 1 on success, 0 on failure. */
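/* For reference, the PROXYv2 wire layout consumed here (per the
 * HAProxy PROXY protocol specification):
 *
 *	offset  0: 12-byte signature "\r\n\r\n\0\r\nQUIT\n"
 *	offset 12: ver_cmd, upper nibble = version (0x2),
 *	           lower nibble = command (0x0 LOCAL, 0x1 PROXY)
 *	offset 13: fam_prot, upper nibble = family (0x1 INET, 0x2 INET6),
 *	           lower nibble = protocol (0x1 STREAM, 0x2 DGRAM)
 *	offset 14: uint16_t len of what follows, network byte order
 *	offset 16: addresses, IPv4 src+dst (4+4) then ports (2+2),
 *	           or IPv6 src+dst (16+16) then ports (2+2)
 */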
3785 static int
consume_pp2_header(struct buffer * buf,struct query * q,int stream)3786 consume_pp2_header(struct buffer* buf, struct query* q, int stream)
3787 {
3788 	size_t size;
3789 	struct pp2_header* header;
3790 	int err = pp2_read_header(buffer_begin(buf), buffer_remaining(buf));
3791 	if(err) {
3792 		VERBOSITY(4, (LOG_ERR, "proxy-protocol: could not parse "
3793 			"PROXYv2 header: %s", pp_lookup_error(err)));
3794 		return 0;
3795 	}
3796 	header = (struct pp2_header*)buffer_begin(buf);
3797 	size = PP2_HEADER_SIZE + read_uint16(&header->len);
3798 	if(size > buffer_limit(buf)) {
3799 		VERBOSITY(4, (LOG_ERR, "proxy-protocol: not enough buffer "
3800 			"size to read PROXYv2 header"));
3801 		return 0;
3802 	}
3803 	if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) {
3804 		/* A connection from the proxy itself.
3805 		 * No need to do anything with addresses. */
3806 		goto done;
3807 	}
3808 	if(header->fam_prot == PP2_UNSPEC_UNSPEC) {
3809 		/* Unspecified family and protocol. This could be used for
3810 		 * health checks by proxies.
3811 		 * No need to do anything with addresses. */
3812 		goto done;
3813 	}
3814 	/* Read the proxied address */
3815 	switch(header->fam_prot) {
3816 		case PP2_INET_STREAM:
3817 		case PP2_INET_DGRAM:
3818 			{
3819 			struct sockaddr_in* addr =
3820 				(struct sockaddr_in*)&q->client_addr;
3821 			addr->sin_family = AF_INET;
3822 			memmove(&addr->sin_addr.s_addr,
3823 				&header->addr.addr4.src_addr, 4);
3824 			memmove(&addr->sin_port, &header->addr.addr4.src_port,
3825 				2);
3826 			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in);
3827 			}
3828 			/* Ignore the destination address; it should be us. */
3829 			break;
3830 #ifdef INET6
3831 		case PP2_INET6_STREAM:
3832 		case PP2_INET6_DGRAM:
3833 			{
3834 			struct sockaddr_in6* addr =
3835 				(struct sockaddr_in6*)&q->client_addr;
3836 			memset(addr, 0, sizeof(*addr));
3837 			addr->sin6_family = AF_INET6;
3838 			memmove(&addr->sin6_addr,
3839 				header->addr.addr6.src_addr, 16);
3840 			memmove(&addr->sin6_port, &header->addr.addr6.src_port,
3841 				2);
3842 			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6);
3843 			}
3844 			/* Ignore the destination address; it should be us. */
3845 			break;
3846 #endif /* INET6 */
3847 		default:
3848 			VERBOSITY(2, (LOG_ERR, "proxy-protocol: unsupported "
3849 				"family and protocol 0x%x",
3850 				(int)header->fam_prot));
3851 			return 0;
3852 	}
3853 	q->is_proxied = 1;
3854 done:
3855 	if(!stream) {
3856 		/* We are reading a whole packet;
3857 		 * Move the rest of the data to overwrite the PROXYv2 header */
3858 		/* XXX can we do better to avoid memmove? */
3859 		memmove(header, ((char*)header)+size, buffer_limit(buf)-size);
3860 		buffer_set_limit(buf, buffer_limit(buf)-size);
3861 	}
3862 	return 1;
3863 }
3864 
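/*
 * Handle a readable UDP socket: receive up to NUM_RECV_PER_SELECT
 * datagrams in one batch, strip an optional PROXYv2 header, process
 * each query, compact dropped entries to the tail of the batch, and
 * answer the remainder with (possibly repeated) sendmmsg() calls.
 */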
3865 static void
handle_udp(int fd,short event,void * arg)3866 handle_udp(int fd, short event, void* arg)
3867 {
3868 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3869 	int received, sent, recvcount, i;
3870 	struct query *q;
3871 	uint32_t now = 0;
3872 
3873 	if (!(event & EV_READ)) {
3874 		return;
3875 	}
3876 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3877 	/* this printf strangely gave a performance increase on Linux */
3878 	/* printf("recvcount %d \n", recvcount); */
3879 	if (recvcount == -1) {
3880 		if (errno != EAGAIN && errno != EINTR) {
3881 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3882 			STATUP(data->nsd, rxerr);
3883 			/* No zone statup */
3884 		}
3885 		/* Simply no data available */
3886 		return;
3887 	}
3888 	for (i = 0; i < recvcount; i++) {
3889 	loopstart:
3890 		received = msgs[i].msg_len;
3891 		queries[i]->remote_addrlen = msgs[i].msg_hdr.msg_namelen;
3892 		queries[i]->client_addrlen = (socklen_t)sizeof(queries[i]->client_addr);
3893 		queries[i]->is_proxied = 0;
3894 		q = queries[i];
3895 		if (received == -1) {
3896 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3897 #if defined(HAVE_RECVMMSG)
3898 				msgs[i].msg_hdr.msg_flags
3899 #else
3900 				errno
3901 #endif
3902 				));
3903 			STATUP(data->nsd, rxerr);
3904 			/* No zone statup */
3905 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3906 			iovecs[i].iov_len = buffer_remaining(q->packet);
3907 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3908 			goto swap_drop;
3909 		}
3910 
3911 		/* Account... */
3912 #ifdef BIND8_STATS
3913 		if (data->socket->addr.ai_family == AF_INET) {
3914 			STATUP(data->nsd, qudp);
3915 		} else if (data->socket->addr.ai_family == AF_INET6) {
3916 			STATUP(data->nsd, qudp6);
3917 		}
3918 #endif
3919 
3920 		buffer_skip(q->packet, received);
3921 		buffer_flip(q->packet);
3922 		if(data->pp2_enabled && !consume_pp2_header(q->packet, q, 0)) {
3923 			VERBOSITY(2, (LOG_ERR, "proxy-protocol: could not "
3924 				"consume PROXYv2 header"));
3925 			goto swap_drop;
3926 		}
3927 		if(!q->is_proxied) {
3928 			q->client_addrlen = q->remote_addrlen;
3929 			memmove(&q->client_addr, &q->remote_addr,
3930 				q->remote_addrlen);
3931 		}
3932 #ifdef USE_DNSTAP
3933 		/*
3934 		 * send the UDP query, with the server (local) and client addresses, to the dnstap process
3935 		 */
3936 		log_addr("query from client", &q->client_addr);
3937 		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3938 		if(verbosity >= 6 && q->is_proxied)
3939 			log_addr("query via proxy", &q->remote_addr);
3940 		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->client_addr, q->client_addrlen,
3941 			q->tcp, q->packet);
3942 #endif /* USE_DNSTAP */
3943 
3944 		/* Process and answer the query... */
3945 		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
3946 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3947 				STATUP(data->nsd, nona);
3948 				ZTATUP(data->nsd, q->zone, nona);
3949 			}
3950 
3951 #ifdef USE_ZONE_STATS
3952 			if (data->socket->addr.ai_family == AF_INET) {
3953 				ZTATUP(data->nsd, q->zone, qudp);
3954 			} else if (data->socket->addr.ai_family == AF_INET6) {
3955 				ZTATUP(data->nsd, q->zone, qudp6);
3956 			}
3957 #endif
3958 
3959 			/* Add EDNS0 and TSIG info if necessary.  */
3960 			query_add_optional(q, data->nsd, &now);
3961 
3962 			buffer_flip(q->packet);
3963 			iovecs[i].iov_len = buffer_remaining(q->packet);
3964 #ifdef BIND8_STATS
3965 			/* Account the rcode & TC... */
3966 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3967 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3968 			if (TC(q->packet)) {
3969 				STATUP(data->nsd, truncated);
3970 				ZTATUP(data->nsd, q->zone, truncated);
3971 			}
3972 #endif /* BIND8_STATS */
3973 #ifdef USE_DNSTAP
3974 			/*
3975 			 * send the UDP response, with the server (local) and client addresses, to the dnstap process
3976 			 */
3977 			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3978 			log_addr("response to client", &q->client_addr);
3979 			if(verbosity >= 6 && q->is_proxied)
3980 				log_addr("response via proxy", &q->remote_addr);
3981 			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
3982 				&q->client_addr, q->client_addrlen, q->tcp, q->packet,
3983 				q->zone);
3984 #endif /* USE_DNSTAP */
3985 		} else {
3986 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3987 			iovecs[i].iov_len = buffer_remaining(q->packet);
3988 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3989 		swap_drop:
3990 			STATUP(data->nsd, dropped);
3991 			ZTATUP(data->nsd, q->zone, dropped);
3992 			if(i != recvcount-1) {
3993 				/* swap with last and decrease recvcount */
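				/* This keeps msgs[0..recvcount-1] a dense
				 * array of answerable packets for the
				 * sendmmsg() below: the dropped entry moves
				 * to the tail, the tail entry moves into
				 * slot i, and slot i is examined again. */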
3994 				struct mmsghdr mtmp = msgs[i];
3995 				struct iovec iotmp = iovecs[i];
3996 				recvcount--;
3997 				msgs[i] = msgs[recvcount];
3998 				iovecs[i] = iovecs[recvcount];
3999 				queries[i] = queries[recvcount];
4000 				msgs[recvcount] = mtmp;
4001 				iovecs[recvcount] = iotmp;
4002 				queries[recvcount] = q;
4003 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
4004 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
4005 				goto loopstart;
4006 			} else { recvcount --; }
4007 		}
4008 	}
4009 
4010 	/* send until all are sent */
4011 	i = 0;
4012 	while(i<recvcount) {
4013 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
4014 		if(sent == -1) {
4015 			if(errno == ENOBUFS ||
4016 #ifdef EWOULDBLOCK
4017 				errno == EWOULDBLOCK ||
4018 #endif
4019 				errno == EAGAIN) {
4020 				/* block to wait until send buffer avail */
4021 				int flag, errstore;
4022 				if((flag = fcntl(fd, F_GETFL)) == -1) {
4023 					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
4024 					flag = 0;
4025 				}
4026 				flag &= ~O_NONBLOCK;
4027 				if(fcntl(fd, F_SETFL, flag) == -1)
4028 					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
4029 				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
4030 				errstore = errno;
4031 				flag |= O_NONBLOCK;
4032 				if(fcntl(fd, F_SETFL, flag) == -1)
4033 					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
4034 				if(sent != -1) {
4035 					i += sent;
4036 					continue;
4037 				}
4038 				errno = errstore;
4039 			}
4040 			if(errno == EINVAL) {
4041 				/* skip the invalid argument entry,
4042 				 * send the remaining packets in the list */
4043 				if(!(port_is_zero((void*)&queries[i]->remote_addr) &&
4044 					verbosity < 3)) {
4045 					const char* es = strerror(errno);
4046 					char a[64];
4047 					addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
4048 					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
4049 				}
4050 				i += 1;
4051 				continue;
4052 			}
4053 			/* don't log transient network full errors, unless
4054 			 * on higher verbosity */
4055 			if(!(errno == ENOBUFS && verbosity < 1) &&
4056 #ifdef EWOULDBLOCK
4057 			   errno != EWOULDBLOCK &&
4058 #endif
4059 			   errno != EAGAIN) {
4060 				const char* es = strerror(errno);
4061 				char a[64];
4062 				addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
4063 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
4064 			}
4065 #ifdef BIND8_STATS
4066 			data->nsd->st->txerr += recvcount-i;
4067 #endif /* BIND8_STATS */
4068 			break;
4069 		}
4070 		i += sent;
4071 	}
4072 	for(i=0; i<recvcount; i++) {
4073 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
4074 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
4075 		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
4076 	}
4077 }
4078 
4079 #ifdef HAVE_SSL
4080 /*
4081  * Setup an event for the tcp handler.
4082  */
4083 static void
tcp_handler_setup_event(struct tcp_handler_data * data,void (* fn)(int,short,void *),int fd,short event)4084 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
4085        int fd, short event)
4086 {
4087 	struct timeval timeout;
4088 	struct event_base* ev_base;
4089 
4090 	timeout.tv_sec = data->nsd->tcp_timeout;
4091 	timeout.tv_usec = 0L;
4092 
4093 	ev_base = data->event.ev_base;
4094 	event_del(&data->event);
4095 	memset(&data->event, 0, sizeof(data->event));
4096 	event_set(&data->event, fd, event, fn, data);
4097 	if(event_base_set(ev_base, &data->event) != 0)
4098 		log_msg(LOG_ERR, "event base set failed");
4099 	if(event_add(&data->event, &timeout) != 0)
4100 		log_msg(LOG_ERR, "event add failed");
4101 }
4102 #endif /* HAVE_SSL */
4103 
4104 static void
cleanup_tcp_handler(struct tcp_handler_data * data)4105 cleanup_tcp_handler(struct tcp_handler_data* data)
4106 {
4107 	event_del(&data->event);
4108 #ifdef HAVE_SSL
4109 	if(data->tls) {
4110 		SSL_shutdown(data->tls);
4111 		SSL_free(data->tls);
4112 		data->tls = NULL;
4113 	}
4114 #endif
4115 	data->pp2_header_state = pp2_header_none;
4116 	close(data->event.ev_fd);
4117 	if(data->prev)
4118 		data->prev->next = data->next;
4119 	else	tcp_active_list = data->next;
4120 	if(data->next)
4121 		data->next->prev = data->prev;
4122 
4123 	/*
4124 	 * Enable the TCP accept handlers when the current number of
4125 	 * TCP connections is about to drop below the maximum number
4126 	 * of TCP connections.
4127 	 */
4128 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
4129 		configure_handler_event_types(EV_READ|EV_PERSIST);
4130 		if(slowaccept) {
4131 			event_del(&slowaccept_event);
4132 			slowaccept = 0;
4133 		}
4134 	}
4135 	--data->nsd->current_tcp_count;
4136 	assert(data->nsd->current_tcp_count >= 0);
4137 
4138 	region_destroy(data->region);
4139 }
4140 
4141 /* Read more data into the buffer for tcp read. Pass the amount of additional
4142  * data required. Returns false if nothing needs to be done this event, or
4143  * true if the additional data is in the buffer. */
4144 static int
more_read_buf_tcp(int fd,struct tcp_handler_data * data,void * bufpos,size_t add_amount,ssize_t * received)4145 more_read_buf_tcp(int fd, struct tcp_handler_data* data, void* bufpos,
4146 	size_t add_amount, ssize_t* received)
4147 {
4148 	*received = read(fd, bufpos, add_amount);
4149 	if (*received == -1) {
4150 		if (errno == EAGAIN || errno == EINTR) {
4151 			/*
4152 			 * Read would block, wait until more
4153 			 * data is available.
4154 			 */
4155 			return 0;
4156 		} else {
4157 			char buf[48];
4158 			addr2str(&data->query->remote_addr, buf, sizeof(buf));
4159 #ifdef ECONNRESET
4160 			if (verbosity >= 2 || errno != ECONNRESET)
4161 #endif /* ECONNRESET */
4162 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
4163 			cleanup_tcp_handler(data);
4164 			return 0;
4165 		}
4166 	} else if (*received == 0) {
4167 		/* EOF */
4168 		cleanup_tcp_handler(data);
4169 		return 0;
4170 	}
4171 	return 1;
4172 }
4173 
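/* DNS over TCP frames every message with a two-octet, network-order
 * length prefix (RFC 1035, section 4.2.2).  A minimal sketch of the
 * sending side, assuming a connected socket and a complete reply in
 * buf/len (illustrative only; error handling omitted):
 *
 *	uint16_t prefix = htons(len);
 *	write(fd, &prefix, sizeof(prefix));
 *	write(fd, buf, len);
 *
 * The read handler below does the inverse: it first accumulates the
 * two prefix octets into query->tcplen, then reads exactly that many
 * octets of query data, tolerating short reads at every step. */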
4174 static void
handle_tcp_reading(int fd,short event,void * arg)4175 handle_tcp_reading(int fd, short event, void* arg)
4176 {
4177 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4178 	ssize_t received;
4179 	struct event_base* ev_base;
4180 	struct timeval timeout;
4181 	uint32_t now = 0;
4182 
4183 	if ((event & EV_TIMEOUT)) {
4184 		/* Connection timed out.  */
4185 		cleanup_tcp_handler(data);
4186 		return;
4187 	}
4188 
4189 	if ((data->nsd->tcp_query_count > 0 &&
4190 	     data->query_count >= data->nsd->tcp_query_count) ||
4191 	    (data->query_count > 0 && data->tcp_no_more_queries))
4192 	{
4193 		/* No more queries allowed on this tcp connection. */
4194 		cleanup_tcp_handler(data);
4195 		return;
4196 	}
4197 
4198 	assert((event & EV_READ));
4199 
4200 	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
4201 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4202 		data->query_needs_reset = 0;
4203 	}
4204 
4205 	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
4206 		struct pp2_header* header = NULL;
4207 		size_t want_read_size = 0;
4208 		size_t current_read_size = 0;
4209 		if(data->pp2_header_state == pp2_header_none) {
4210 			want_read_size = PP2_HEADER_SIZE;
4211 			if(buffer_remaining(data->query->packet) <
4212 				want_read_size) {
4213 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4214 				cleanup_tcp_handler(data);
4215 				return;
4216 			}
4217 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4218 			current_read_size = want_read_size;
4219 			if(data->bytes_transmitted < current_read_size) {
4220 				if(!more_read_buf_tcp(fd, data,
4221 					(void*)buffer_at(data->query->packet,
4222 						data->bytes_transmitted),
4223 					current_read_size - data->bytes_transmitted,
4224 					&received))
4225 					return;
4226 				data->bytes_transmitted += received;
4227 				buffer_skip(data->query->packet, received);
4228 				if(data->bytes_transmitted != current_read_size)
4229 					return;
4230 				data->pp2_header_state = pp2_header_init;
4231 			}
4232 		}
4233 		if(data->pp2_header_state == pp2_header_init) {
4234 			int err;
4235 			err = pp2_read_header(buffer_begin(data->query->packet),
4236 				buffer_limit(data->query->packet));
4237 			if(err) {
4238 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
4239 				cleanup_tcp_handler(data);
4240 				return;
4241 			}
4242 			header = (struct pp2_header*)buffer_begin(data->query->packet);
4243 			want_read_size = ntohs(header->len);
4244 			if(buffer_limit(data->query->packet) <
4245 				PP2_HEADER_SIZE + want_read_size) {
4246 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4247 				cleanup_tcp_handler(data);
4248 				return;
4249 			}
4250 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4251 			current_read_size = PP2_HEADER_SIZE + want_read_size;
4252 			if(want_read_size == 0) {
4253 				/* nothing more to read; header is complete */
4254 				data->pp2_header_state = pp2_header_done;
4255 			} else if(data->bytes_transmitted < current_read_size) {
4256 				if(!more_read_buf_tcp(fd, data,
4257 					(void*)buffer_at(data->query->packet,
4258 						data->bytes_transmitted),
4259 					current_read_size - data->bytes_transmitted,
4260 					&received))
4261 					return;
4262 				data->bytes_transmitted += received;
4263 				buffer_skip(data->query->packet, received);
4264 				if(data->bytes_transmitted != current_read_size)
4265 					return;
4266 				data->pp2_header_state = pp2_header_done;
4267 			}
4268 		}
4269 		if(data->pp2_header_state != pp2_header_done || !header) {
4270 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
4271 
4272 			cleanup_tcp_handler(data);
4273 			return;
4274 		}
4275 		buffer_flip(data->query->packet);
4276 		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
4277 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
4278 
4279 			cleanup_tcp_handler(data);
4280 			return;
4281 		}
4282 		/* Clear and reset the buffer to read the following
4283 		 * DNS packet(s). */
4284 		buffer_clear(data->query->packet);
4285 		data->bytes_transmitted = 0;
4286 	}
4287 
4288 	/*
4289 	 * Check if we received the leading packet length bytes yet.
4290 	 */
4291 	if (data->bytes_transmitted < sizeof(uint16_t)) {
4292 		if(!more_read_buf_tcp(fd, data,
4293 			(char*) &data->query->tcplen + data->bytes_transmitted,
4294 			sizeof(uint16_t) - data->bytes_transmitted, &received))
4295 			return;
4296 		data->bytes_transmitted += received;
4297 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4298 			/*
4299 			 * Not done with the tcplen yet, wait for more
4300 			 * data to become available.
4301 			 */
4302 			return;
4303 		}
4304 		assert(data->bytes_transmitted == sizeof(uint16_t));
4305 
4306 		data->query->tcplen = ntohs(data->query->tcplen);
4307 
4308 		/*
4309 		 * Minimum query size is:
4310 		 *
4311 		 *     Size of the header (12)
4312 		 *   + Root domain name   (1)
4313 		 *   + Query class        (2)
4314 		 *   + Query type         (2)
4315 		 */
4316 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4317 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4318 			cleanup_tcp_handler(data);
4319 			return;
4320 		}
4321 
4322 		if (data->query->tcplen > data->query->maxlen) {
4323 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4324 			cleanup_tcp_handler(data);
4325 			return;
4326 		}
4327 
4328 		buffer_set_limit(data->query->packet, data->query->tcplen);
4329 	}
4330 
4331 	assert(buffer_remaining(data->query->packet) > 0);
4332 
4333 	/* Read the (remaining) query data.  */
4334 	if(!more_read_buf_tcp(fd, data, buffer_current(data->query->packet),
4335 		buffer_remaining(data->query->packet), &received))
4336 		return;
4337 	data->bytes_transmitted += received;
4338 	buffer_skip(data->query->packet, received);
4339 	if (buffer_remaining(data->query->packet) > 0) {
4340 		/*
4341 		 * Message not yet complete, wait for more data to
4342 		 * become available.
4343 		 */
4344 		return;
4345 	}
4346 
4347 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4348 
4349 	/* Account... */
4350 #ifdef BIND8_STATS
4351 #ifndef INET6
4352 	STATUP(data->nsd, ctcp);
4353 #else
4354 	if (data->query->remote_addr.ss_family == AF_INET) {
4355 		STATUP(data->nsd, ctcp);
4356 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4357 		STATUP(data->nsd, ctcp6);
4358 	}
4359 #endif
4360 #endif /* BIND8_STATS */
4361 
4362 	/* We have a complete query, process it.  */
4363 
4364 	/* tcp-query-count: handle query counter ++ */
4365 	data->query_count++;
4366 
4367 	buffer_flip(data->query->packet);
4368 #ifdef USE_DNSTAP
4369 	/*
4370 	 * send the TCP query, with the server (local) and client addresses, to the dnstap process
4371 	 */
4372 	log_addr("query from client", &data->query->client_addr);
4373 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4374 	if(verbosity >= 6 && data->query->is_proxied)
4375 		log_addr("query via proxy", &data->query->remote_addr);
4376 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4377 		data->query->client_addrlen, data->query->tcp, data->query->packet);
4378 #endif /* USE_DNSTAP */
4379 	data->query_state = server_process_query(data->nsd, data->query, &now);
4380 	if (data->query_state == QUERY_DISCARDED) {
4381 		/* Drop the packet and the entire connection... */
4382 		STATUP(data->nsd, dropped);
4383 		ZTATUP(data->nsd, data->query->zone, dropped);
4384 		cleanup_tcp_handler(data);
4385 		return;
4386 	}
4387 
4388 #ifdef BIND8_STATS
4389 	if (RCODE(data->query->packet) == RCODE_OK
4390 	    && !AA(data->query->packet))
4391 	{
4392 		STATUP(data->nsd, nona);
4393 		ZTATUP(data->nsd, data->query->zone, nona);
4394 	}
4395 #endif /* BIND8_STATS */
4396 
4397 #ifdef USE_ZONE_STATS
4398 #ifndef INET6
4399 	ZTATUP(data->nsd, data->query->zone, ctcp);
4400 #else
4401 	if (data->query->remote_addr.ss_family == AF_INET) {
4402 		ZTATUP(data->nsd, data->query->zone, ctcp);
4403 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4404 		ZTATUP(data->nsd, data->query->zone, ctcp6);
4405 	}
4406 #endif
4407 #endif /* USE_ZONE_STATS */
4408 
4409 	query_add_optional(data->query, data->nsd, &now);
4410 
4411 	/* Switch to the tcp write handler.  */
4412 	buffer_flip(data->query->packet);
4413 	data->query->tcplen = buffer_remaining(data->query->packet);
4414 #ifdef BIND8_STATS
4415 	/* Account the rcode & TC... */
4416 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4417 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4418 	if (TC(data->query->packet)) {
4419 		STATUP(data->nsd, truncated);
4420 		ZTATUP(data->nsd, data->query->zone, truncated);
4421 	}
4422 #endif /* BIND8_STATS */
4423 #ifdef USE_DNSTAP
4424 	/*
4425 	 * send the TCP response, with the server (local, found earlier) and client addresses, to the dnstap process
4426 	 */
4427 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4428 	log_addr("response to client", &data->query->client_addr);
4429 	if(verbosity >= 6 && data->query->is_proxied)
4430 		log_addr("response via proxy", &data->query->remote_addr);
4431 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4432 		data->query->client_addrlen, data->query->tcp, data->query->packet,
4433 		data->query->zone);
4434 #endif /* USE_DNSTAP */
4435 	data->bytes_transmitted = 0;
4436 
4437 	timeout.tv_sec = data->tcp_timeout / 1000;
4438 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4439 
4440 	ev_base = data->event.ev_base;
4441 	event_del(&data->event);
4442 	memset(&data->event, 0, sizeof(data->event));
4443 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4444 		handle_tcp_writing, data);
4445 	if(event_base_set(ev_base, &data->event) != 0)
4446 		log_msg(LOG_ERR, "event base set tcpr failed");
4447 	if(event_add(&data->event, &timeout) != 0)
4448 		log_msg(LOG_ERR, "event add tcpr failed");
4449 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
4450 	handle_tcp_writing(fd, EV_WRITE, data);
4451 }
4452 
4453 static void
handle_tcp_writing(int fd,short event,void * arg)4454 handle_tcp_writing(int fd, short event, void* arg)
4455 {
4456 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4457 	ssize_t sent;
4458 	struct query *q = data->query;
4459 	struct timeval timeout;
4460 	struct event_base* ev_base;
4461 	uint32_t now = 0;
4462 
4463 	if ((event & EV_TIMEOUT)) {
4464 		/* Connection timed out.  */
4465 		cleanup_tcp_handler(data);
4466 		return;
4467 	}
4468 
4469 	assert((event & EV_WRITE));
4470 
4471 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
4472 		/* Writing the response packet length.  */
4473 		uint16_t n_tcplen = htons(q->tcplen);
4474 #ifdef HAVE_WRITEV
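		/* Gather the 2-octet length prefix and the packet body into
		 * one writev() call, so both leave in a single system call
		 * (and typically a single TCP segment) instead of two
		 * separate write()s. */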
4475 		struct iovec iov[2];
4476 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
4477 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
4478 		iov[1].iov_base = buffer_begin(q->packet);
4479 		iov[1].iov_len = buffer_limit(q->packet);
4480 		sent = writev(fd, iov, 2);
4481 #else /* HAVE_WRITEV */
4482 		sent = write(fd,
4483 			     (const char *) &n_tcplen + data->bytes_transmitted,
4484 			     sizeof(n_tcplen) - data->bytes_transmitted);
4485 #endif /* HAVE_WRITEV */
4486 		if (sent == -1) {
4487 			if (errno == EAGAIN || errno == EINTR) {
4488 				/*
4489 				 * Write would block, wait until
4490 				 * socket becomes writable again.
4491 				 */
4492 				return;
4493 			} else {
4494 #ifdef ECONNRESET
4495 				if(verbosity >= 2 || errno != ECONNRESET)
4496 #endif /* ECONNRESET */
4497 #ifdef EPIPE
4498 				  if(verbosity >= 2 || errno != EPIPE)
4499 #endif /* EPIPE 'broken pipe' */
4500 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4501 				cleanup_tcp_handler(data);
4502 				return;
4503 			}
4504 		}
4505 
4506 		data->bytes_transmitted += sent;
4507 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
4508 			/*
4509 			 * Writing not complete, wait until socket
4510 			 * becomes writable again.
4511 			 */
4512 			return;
4513 		}
4514 
4515 #ifdef HAVE_WRITEV
4516 		sent -= sizeof(n_tcplen);
4517 		/* handle potential 'packet done' code */
4518 		goto packet_could_be_done;
4519 #endif
4520 	}
4521 
4522 	sent = write(fd,
4523 		     buffer_current(q->packet),
4524 		     buffer_remaining(q->packet));
4525 	if (sent == -1) {
4526 		if (errno == EAGAIN || errno == EINTR) {
4527 			/*
4528 			 * Write would block, wait until
4529 			 * socket becomes writable again.
4530 			 */
4531 			return;
4532 		} else {
4533 #ifdef ECONNRESET
4534 			if(verbosity >= 2 || errno != ECONNRESET)
4535 #endif /* ECONNRESET */
4536 #ifdef EPIPE
4537 				  if(verbosity >= 2 || errno != EPIPE)
4538 #endif /* EPIPE 'broken pipe' */
4539 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4540 			cleanup_tcp_handler(data);
4541 			return;
4542 		}
4543 	}
4544 
4545 	data->bytes_transmitted += sent;
4546 #ifdef HAVE_WRITEV
4547   packet_could_be_done:
4548 #endif
4549 	buffer_skip(q->packet, sent);
4550 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4551 		/*
4552 		 * Still more data to write when socket becomes
4553 		 * writable again.
4554 		 */
4555 		return;
4556 	}
4557 
4558 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4559 
4560 	if (data->query_state == QUERY_IN_AXFR ||
4561 		data->query_state == QUERY_IN_IXFR) {
4562 		/* Continue processing AXFR and writing back results.  */
4563 		buffer_clear(q->packet);
4564 		if(data->query_state == QUERY_IN_AXFR)
4565 			data->query_state = query_axfr(data->nsd, q, 0);
4566 		else data->query_state = query_ixfr(data->nsd, q);
4567 		if (data->query_state != QUERY_PROCESSED) {
4568 			query_add_optional(data->query, data->nsd, &now);
4569 
4570 			/* Reset data. */
4571 			buffer_flip(q->packet);
4572 			q->tcplen = buffer_remaining(q->packet);
4573 			data->bytes_transmitted = 0;
4574 			/* Reset timeout.  */
4575 			timeout.tv_sec = data->tcp_timeout / 1000;
4576 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4577 			ev_base = data->event.ev_base;
4578 			event_del(&data->event);
4579 			memset(&data->event, 0, sizeof(data->event));
4580 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4581 				handle_tcp_writing, data);
4582 			if(event_base_set(ev_base, &data->event) != 0)
4583 				log_msg(LOG_ERR, "event base set tcpw failed");
4584 			if(event_add(&data->event, &timeout) != 0)
4585 				log_msg(LOG_ERR, "event add tcpw failed");
4586 
4587 			/*
4588 			 * Write data if/when the socket is writable
4589 			 * again.
4590 			 */
4591 			return;
4592 		}
4593 	}
4594 
4595 	/*
4596 	 * Done sending, wait for the next request to arrive on the
4597 	 * TCP socket by installing the TCP read handler.
4598 	 */
4599 	if ((data->nsd->tcp_query_count > 0 &&
4600 		data->query_count >= data->nsd->tcp_query_count) ||
4601 		data->tcp_no_more_queries) {
4602 
4603 		(void) shutdown(fd, SHUT_WR);
4604 	}
4605 
4606 	data->bytes_transmitted = 0;
4607 	data->query_needs_reset = 1;
4608 
4609 	timeout.tv_sec = data->tcp_timeout / 1000;
4610 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4611 	ev_base = data->event.ev_base;
4612 	event_del(&data->event);
4613 	memset(&data->event, 0, sizeof(data->event));
4614 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4615 		handle_tcp_reading, data);
4616 	if(event_base_set(ev_base, &data->event) != 0)
4617 		log_msg(LOG_ERR, "event base set tcpw failed");
4618 	if(event_add(&data->event, &timeout) != 0)
4619 		log_msg(LOG_ERR, "event add tcpw failed");
4620 }
4621 
4622 #ifdef HAVE_SSL
4623 /** create SSL object and associate fd */
4624 static SSL*
incoming_ssl_fd(SSL_CTX * ctx,int fd)4625 incoming_ssl_fd(SSL_CTX* ctx, int fd)
4626 {
4627 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
4628 	if(!ssl) {
4629 		log_crypto_err("could not SSL_new");
4630 		return NULL;
4631 	}
4632 	SSL_set_accept_state(ssl);
4633 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
4634 	if(!SSL_set_fd(ssl, fd)) {
4635 		log_crypto_err("could not SSL_set_fd");
4636 		SSL_free(ssl);
4637 		return NULL;
4638 	}
4639 	return ssl;
4640 }
4641 
4642 /** TLS handshake to upgrade TCP connection */
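/* Handshake/renegotiation states, as inferred from the code below:
 *	tls_hs_none        - no handshake pending
 *	tls_hs_read        - SSL_do_handshake() wants the socket readable
 *	tls_hs_write       - SSL_do_handshake() wants the socket writable
 *	tls_hs_read_event  - an application write got WANT_READ; resume
 *	                     writing once the socket is readable
 *	tls_hs_write_event - an application read got WANT_WRITE; resume
 *	                     reading once the socket is writable */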
4643 static int
tls_handshake(struct tcp_handler_data * data,int fd,int writing)4644 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
4645 {
4646 	int r;
4647 	if(data->shake_state == tls_hs_read_event) {
4648 		/* read condition satisfied back to writing */
4649 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4650 		data->shake_state = tls_hs_none;
4651 		return 1;
4652 	}
4653 	if(data->shake_state == tls_hs_write_event) {
4654 		/* write condition satisfied back to reading */
4655 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4656 		data->shake_state = tls_hs_none;
4657 		return 1;
4658 	}
4659 
4660 	/* (continue to) setup the TLS connection */
4661 	ERR_clear_error();
4662 	r = SSL_do_handshake(data->tls);
4663 
4664 	if(r != 1) {
4665 		int want = SSL_get_error(data->tls, r);
4666 		if(want == SSL_ERROR_WANT_READ) {
4667 			if(data->shake_state == tls_hs_read) {
4668 				/* try again later */
4669 				return 1;
4670 			}
4671 			data->shake_state = tls_hs_read;
4672 			/* switch back to reading mode */
4673 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4674 			return 1;
4675 		} else if(want == SSL_ERROR_WANT_WRITE) {
4676 			if(data->shake_state == tls_hs_write) {
4677 				/* try again later */
4678 				return 1;
4679 			}
4680 			data->shake_state = tls_hs_write;
4681 			/* switch back to writing mode */
4682 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4683 			return 1;
4684 		} else {
4685 			if(r == 0)
4686 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
4687 			else {
4688 				unsigned long err = ERR_get_error();
4689 				if(!squelch_err_ssl_handshake(err)) {
4690 					char a[64], s[256];
4691 					addr2str(&data->query->remote_addr, a, sizeof(a));
4692 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
4693 					log_crypto_from_err(s, err);
4694 				}
4695 			}
4696 			cleanup_tcp_handler(data);
4697 			return 0;
4698 		}
4699 	}
4700 
4701 	/* Log the successful upgrade for testing; could be removed. */
4702 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
4703 	/* set back to the event we need to have when reading (or writing) */
4704 	if(data->shake_state == tls_hs_read && writing) {
4705 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4706 	} else if(data->shake_state == tls_hs_write && !writing) {
4707 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4708 	}
4709 	data->shake_state = tls_hs_none;
4710 	return 1;
4711 }
4712 
4713 /* Read more data into the buffer for tls read. Pass the amount of additional
4714  * data required. Returns false if nothing needs to be done this event, or
4715  * true if the additional data is in the buffer. */
4716 static int
more_read_buf_tls(int fd,struct tcp_handler_data * data,void * bufpos,size_t add_amount,ssize_t * received)4717 more_read_buf_tls(int fd, struct tcp_handler_data* data, void* bufpos,
4718 	size_t add_amount, ssize_t* received)
4719 {
4720 	ERR_clear_error();
4721 	if((*received=SSL_read(data->tls, bufpos, add_amount)) <= 0) {
4722 		int want = SSL_get_error(data->tls, *received);
4723 		if(want == SSL_ERROR_ZERO_RETURN) {
4724 			cleanup_tcp_handler(data);
4725 			return 0; /* shutdown, closed */
4726 		} else if(want == SSL_ERROR_WANT_READ) {
4727 			/* wants to be called again */
4728 			return 0;
4729 		}
4730 		else if(want == SSL_ERROR_WANT_WRITE) {
4731 			/* switch to writing */
4732 			data->shake_state = tls_hs_write_event;
4733 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4734 			return 0;
4735 		}
4736 		cleanup_tcp_handler(data);
4737 		log_crypto_err("could not SSL_read");
4738 		return 0;
4739 	}
4740 	return 1;
4741 }
4742 
4743 /** handle TLS reading of incoming query */
4744 static void
handle_tls_reading(int fd,short event,void * arg)4745 handle_tls_reading(int fd, short event, void* arg)
4746 {
4747 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4748 	ssize_t received;
4749 	uint32_t now = 0;
4750 
4751 	if ((event & EV_TIMEOUT)) {
4752 		/* Connection timed out.  */
4753 		cleanup_tcp_handler(data);
4754 		return;
4755 	}
4756 
4757 	if ((data->nsd->tcp_query_count > 0 &&
4758 	     data->query_count >= data->nsd->tcp_query_count) ||
4759 	    (data->query_count > 0 && data->tcp_no_more_queries))
4760 	{
4761 		/* No more queries allowed on this tcp connection. */
4762 		cleanup_tcp_handler(data);
4763 		return;
4764 	}
4765 
4766 	assert((event & EV_READ));
4767 
4768 	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
4769 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4770 		data->query_needs_reset = 0;
4771 	}
4772 
4773 	if(data->shake_state != tls_hs_none) {
4774 		if(!tls_handshake(data, fd, 0))
4775 			return;
4776 		if(data->shake_state != tls_hs_none)
4777 			return;
4778 	}
4779 
4780 	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
4781 		struct pp2_header* header = NULL;
4782 		size_t want_read_size = 0;
4783 		size_t current_read_size = 0;
4784 		if(data->pp2_header_state == pp2_header_none) {
4785 			want_read_size = PP2_HEADER_SIZE;
4786 			if(buffer_remaining(data->query->packet) <
4787 				want_read_size) {
4788 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4789 				cleanup_tcp_handler(data);
4790 				return;
4791 			}
4792 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4793 			current_read_size = want_read_size;
4794 			if(data->bytes_transmitted < current_read_size) {
4795 				if(!more_read_buf_tls(fd, data,
4796 					buffer_at(data->query->packet,
4797 						data->bytes_transmitted),
4798 					current_read_size - data->bytes_transmitted,
4799 					&received))
4800 					return;
4801 				data->bytes_transmitted += received;
4802 				buffer_skip(data->query->packet, received);
4803 				if(data->bytes_transmitted != current_read_size)
4804 					return;
4805 				data->pp2_header_state = pp2_header_init;
4806 			}
4807 		}
4808 		if(data->pp2_header_state == pp2_header_init) {
4809 			int err;
4810 			err = pp2_read_header(buffer_begin(data->query->packet),
4811 				buffer_limit(data->query->packet));
4812 			if(err) {
4813 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
4814 				cleanup_tcp_handler(data);
4815 				return;
4816 			}
4817 			header = (struct pp2_header*)buffer_begin(data->query->packet);
4818 			want_read_size = ntohs(header->len);
4819 			if(buffer_limit(data->query->packet) <
4820 				PP2_HEADER_SIZE + want_read_size) {
4821 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4822 				cleanup_tcp_handler(data);
4823 				return;
4824 			}
4825 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4826 			current_read_size = PP2_HEADER_SIZE + want_read_size;
4827 			if(want_read_size == 0) {
4828 				/* nothing more to read; header is complete */
4829 				data->pp2_header_state = pp2_header_done;
4830 			} else if(data->bytes_transmitted < current_read_size) {
4831 				if(!more_read_buf_tls(fd, data,
4832 					buffer_at(data->query->packet,
4833 						data->bytes_transmitted),
4834 					current_read_size - data->bytes_transmitted,
4835 					&received))
4836 					return;
4837 				data->bytes_transmitted += received;
4838 				buffer_skip(data->query->packet, received);
4839 				if(data->bytes_transmitted != current_read_size)
4840 					return;
4841 				data->pp2_header_state = pp2_header_done;
4842 			}
4843 		}
4844 		if(data->pp2_header_state != pp2_header_done || !header) {
4845 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
4846 			cleanup_tcp_handler(data);
4847 			return;
4848 		}
4849 		buffer_flip(data->query->packet);
4850 		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
4851 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
4852 			cleanup_tcp_handler(data);
4853 			return;
4854 		}
4855 		/* Clear and reset the buffer to read the following
4856 		 * DNS packet(s). */
4857 		buffer_clear(data->query->packet);
4858 		data->bytes_transmitted = 0;
4859 	}
4860 	/*
4861 	 * Check if we received the leading packet length bytes yet.
4862 	 */
4863 	if(data->bytes_transmitted < sizeof(uint16_t)) {
4864 		if(!more_read_buf_tls(fd, data,
4865 		    (char *) &data->query->tcplen + data->bytes_transmitted,
4866 		    sizeof(uint16_t) - data->bytes_transmitted, &received))
4867 			return;
4868 		data->bytes_transmitted += received;
4869 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4870 			/*
4871 			 * Not done with the tcplen yet, wait for more
4872 			 * data to become available.
4873 			 */
4874 			return;
4875 		}
4876 
4877 		assert(data->bytes_transmitted == sizeof(uint16_t));
4878 
4879 		data->query->tcplen = ntohs(data->query->tcplen);
4880 
4881 		/*
4882 		 * Minimum query size is:
4883 		 *
4884 		 *     Size of the header (12)
4885 		 *   + Root domain name   (1)
4886 		 *   + Query class        (2)
4887 		 *   + Query type         (2)
4888 		 */
4889 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4890 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4891 			cleanup_tcp_handler(data);
4892 			return;
4893 		}
4894 
4895 		if (data->query->tcplen > data->query->maxlen) {
4896 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4897 			cleanup_tcp_handler(data);
4898 			return;
4899 		}
4900 
4901 		buffer_set_limit(data->query->packet, data->query->tcplen);
4902 	}
4903 
4904 	assert(buffer_remaining(data->query->packet) > 0);
4905 
4906 	/* Read the (remaining) query data.  */
4907 	if(!more_read_buf_tls(fd, data, buffer_current(data->query->packet),
4908 		buffer_remaining(data->query->packet), &received))
4909 		return;
4910 	data->bytes_transmitted += received;
4911 	buffer_skip(data->query->packet, received);
4912 	if (buffer_remaining(data->query->packet) > 0) {
4913 		/*
4914 		 * Message not yet complete, wait for more data to
4915 		 * become available.
4916 		 */
4917 		return;
4918 	}
4919 
4920 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4921 
4922 	/* Account... */
4923 #ifndef INET6
4924 	STATUP(data->nsd, ctls);
4925 #else
4926 	if (data->query->remote_addr.ss_family == AF_INET) {
4927 		STATUP(data->nsd, ctls);
4928 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4929 		STATUP(data->nsd, ctls6);
4930 	}
4931 #endif
4932 
4933 	/* We have a complete query, process it.  */
4934 
4935 	/* tcp-query-count: handle query counter ++ */
4936 	data->query_count++;
4937 
4938 	buffer_flip(data->query->packet);
4939 #ifdef USE_DNSTAP
4940 	/*
4941 	 * send the TCP query, with the server (local) and client addresses, to the dnstap process
4942 	 */
4943 	log_addr("query from client", &data->query->client_addr);
4944 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4945 	if(verbosity >= 6 && data->query->is_proxied)
4946 		log_addr("query via proxy", &data->query->remote_addr);
4947 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4948 		data->query->client_addrlen, data->query->tcp, data->query->packet);
4949 #endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
	    && !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctls);
#else
	if (data->query->remote_addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctls);
	} else if (data->query->remote_addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctls6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler.  */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * send the TCP response, using the local (server) address found
	 * earlier and the client address, to the dnstap collector process
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->client_addr);
	if(verbosity >= 6 && data->query->is_proxied)
		log_addr("response via proxy", &data->query->remote_addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
		data->query->client_addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

	/* see if we can write the answer right away (usually we can;
	 * if not, the write handler runs when the socket becomes writable) */
	handle_tls_writing(fd, EV_WRITE, data);
}

/** handle TLS writing of outgoing response */
static void
handle_tls_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	/* static buffer used to reassemble the two-byte TCP length and
	 * the packet into a single write, much like writev would. */
	static buffer_type* global_tls_temp_buffer = NULL;
	buffer_type* write_buffer;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out.  */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 1))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

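	/* Partial-write mode lets SSL_write() return success after
	 * writing only part of the buffer, instead of all-or-nothing;
	 * the byte accounting below depends on this behaviour. */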
	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);

	/* If we are writing the start of a message, we must prepend the
	 * two-byte length; this is done with a copy into write_buffer. */
	write_buffer = NULL;
	if (data->bytes_transmitted == 0) {
		if(!global_tls_temp_buffer) {
			/* allocated in nsd.region, so it is deallocated
			 * when nsd shuts down */
			global_tls_temp_buffer = buffer_create(nsd.region,
				QIOBUFSZ + sizeof(q->tcplen));
			if (!global_tls_temp_buffer) {
				return;
			}
		}
		write_buffer = global_tls_temp_buffer;
		buffer_clear(write_buffer);
		buffer_write_u16(write_buffer, q->tcplen);
		buffer_write(write_buffer, buffer_current(q->packet),
			(int)buffer_remaining(q->packet));
		buffer_flip(write_buffer);
	} else {
		write_buffer = q->packet;
	}

	/* Write the response */
	ERR_clear_error();
	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
	if(sent <= 0) {
		int want = SSL_get_error(data->tls, sent);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			/* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* switch back to reading */
			data->shake_state = tls_hs_read_event;
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
		} else if(want != SSL_ERROR_WANT_WRITE) {
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_write");
		}
		return;
	}

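	/* Advance whichever buffer was written from; bytes_transmitted
	 * counts the two-byte length prefix as well as the payload, so
	 * the response is complete once tcplen + sizeof(q->tcplen)
	 * bytes have gone out. */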
	buffer_skip(write_buffer, sent);
	if(buffer_remaining(write_buffer) != 0) {
		/* Not everything was sent; if the temp buffer was used,
		 * advance the real packet buffer past the payload bytes
		 * (everything after the length prefix) that were
		 * written, so the next pass resumes correctly. */
		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
		}
	}

	data->bytes_transmitted += sent;
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

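	/* AXFR and IXFR responses can span multiple DNS messages; each
	 * pass through here flushes one message, then query_axfr or
	 * query_ixfr refills the packet with the next one, until the
	 * state reaches QUERY_PROCESSED. */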
	if (data->query_state == QUERY_IN_AXFR ||
		data->query_state == QUERY_IN_IXFR) {
		/* Continue processing AXFR and writing back results.  */
		buffer_clear(q->packet);
		if(data->query_state == QUERY_IN_AXFR)
			data->query_state = query_axfr(data->nsd, q, 0);
		else data->query_state = query_ixfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset to writing mode.  */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
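	/* If the per-connection query limit was reached, half-close the
	 * socket: the client reads EOF after this response and closes,
	 * which triggers cleanup in the read handler. */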
	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;
	data->query_needs_reset = 1;

	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
}
#endif

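/* Called when the slow-accept pause expires: re-enable the accept
 * events that were disabled when accept() hit the descriptor limit. */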
static void
handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
	void* ATTR_UNUSED(arg))
{
	if(slowaccept) {
		configure_handler_event_types(EV_PERSIST | EV_READ);
		slowaccept = 0;
	}
}

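/* Accept a connection and make the new socket non-blocking; uses
 * accept4(2) with SOCK_NONBLOCK where available and falls back to
 * accept(2) plus fcntl(2) elsewhere. */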
static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
{
#ifndef HAVE_ACCEPT4
	int s = accept(fd, addr, addrlen);
	if (s != -1) {
		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
			close(s);
			s = -1;
			errno=EINTR; /* set errno to EINTR so that the
				later code that checks the accept result
				treats this as transient and omits a
				second error printout */
		}
	}
	return s;
#else
	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
#endif /* HAVE_ACCEPT4 */
}

/*
 * Handle an incoming TCP connection.  The connection is accepted and
 * a new TCP reader event handler is added.  The TCP handler
 * is responsible for cleanup when the connection is closed.
 */
static void
handle_tcp_accept(int fd, short event, void* arg)
{
	struct tcp_accept_handler_data *data
		= (struct tcp_accept_handler_data *) arg;
	int s;
	int reject = 0;
	struct tcp_handler_data *tcp_data;
	region_type *tcp_region;
#ifdef INET6
	struct sockaddr_storage addr;
#else
	struct sockaddr_in addr;
#endif
	socklen_t addrlen;
	struct timeval timeout;

	if (!(event & EV_READ)) {
		return;
	}

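	/* At the connection limit, accept events are normally disabled
	 * (see the comment near the end of this function), so this is
	 * only reached in a race or when tcp-reject-overflow is set, in
	 * which case the surplus connection is accepted and torn down. */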
	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
		reject = data->nsd->options->tcp_reject_overflow;
		if (!reject) {
			return;
		}
	}

	/* Accept it... */
	addrlen = sizeof(addr);
	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
	if (s == -1) {
		/**
		 * EMFILE and ENFILE signal that the limit of open
		 * file descriptors has been reached. Pause accept().
		 * EINTR means the call was interrupted by a signal.
		 * The others are various OS ways of saying that the
		 * client has closed the connection.
		 */
		if (errno == EMFILE || errno == ENFILE) {
			if (!slowaccept) {
				/* disable accept events */
				struct timeval tv;
				configure_handler_event_types(0);
				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
				tv.tv_usec = 0L;
				memset(&slowaccept_event, 0,
					sizeof(slowaccept_event));
				event_set(&slowaccept_event, -1, EV_TIMEOUT,
					handle_slowaccept_timeout, NULL);
				(void)event_base_set(data->event.ev_base,
					&slowaccept_event);
				(void)event_add(&slowaccept_event, &tv);
				slowaccept = 1;
				/* We don't want to spam the logs here */
			}
		} else if (errno != EINTR
			&& errno != EWOULDBLOCK
#ifdef ECONNABORTED
			&& errno != ECONNABORTED
#endif /* ECONNABORTED */
#ifdef EPROTO
			&& errno != EPROTO
#endif /* EPROTO */
			) {
			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
		}
		return;
	}

	if (reject) {
		shutdown(s, SHUT_RDWR);
		close(s);
		return;
	}

	/*
	 * This region is deallocated when the TCP connection is
	 * closed by the TCP handler.
	 */
	tcp_region = region_create(xalloc, free);
	tcp_data = (struct tcp_handler_data *) region_alloc(
		tcp_region, sizeof(struct tcp_handler_data));
	tcp_data->region = tcp_region;
	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
		compression_table_size, compressed_dnames);
	tcp_data->nsd = data->nsd;
	tcp_data->query_count = 0;
#ifdef HAVE_SSL
	tcp_data->shake_state = tls_hs_none;
	tcp_data->tls = NULL;
#endif
	tcp_data->query_needs_reset = 1;
	tcp_data->pp2_enabled = data->pp2_enabled;
	tcp_data->pp2_header_state = pp2_header_none;
	tcp_data->prev = NULL;
	tcp_data->next = NULL;

	tcp_data->query_state = QUERY_PROCESSED;
	tcp_data->bytes_transmitted = 0;
	memcpy(&tcp_data->query->remote_addr, &addr, addrlen);
	tcp_data->query->remote_addrlen = addrlen;
	/* Copy remote_addr to client_addr; for stream connections this
	 * is the simplest place and time to do it. */
	memcpy(&tcp_data->query->client_addr, &addr, addrlen);
	tcp_data->query->client_addrlen = addrlen;
	tcp_data->query->is_proxied = 0;

	tcp_data->tcp_no_more_queries = 0;
	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
		/* very busy, so use a shorter timeout */
		tcp_data->tcp_timeout = 200;
	}
	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;

#ifdef USE_DNSTAP
	/* save the address of the connection */
	tcp_data->socket = data->socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	if (data->tls_accept) {
		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
		if(!tcp_data->tls) {
			close(s);
			return;
		}
		tcp_data->shake_state = tls_hs_read;
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			  handle_tls_reading, tcp_data);
	} else {
#endif
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			  handle_tcp_reading, tcp_data);
#ifdef HAVE_SSL
	}
#endif
	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
		log_msg(LOG_ERR, "cannot set tcp event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(event_add(&tcp_data->event, &timeout) != 0) {
		log_msg(LOG_ERR, "cannot add tcp to event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(tcp_active_list) {
		tcp_active_list->prev = tcp_data;
		tcp_data->next = tcp_active_list;
	}
	tcp_active_list = tcp_data;

	/*
	 * Keep track of the total number of TCP handlers installed so
	 * we can stop accepting connections when the maximum number
	 * of simultaneous TCP connections is reached.
	 *
	 * If tcp-reject-overflow is enabled, however, then we do not
	 * change the handler event type; we keep it as-is and accept
	 * overflow TCP connections only so that we can forcibly kill
	 * them off.
	 */
	++data->nsd->current_tcp_count;
	if (!data->nsd->options->tcp_reject_overflow &&
	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
	{
		configure_handler_event_types(0);
	}
}

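/* Write the command over each child's IPC socket; when a timeout is
 * given, block_read waits briefly for the child to send a reply back
 * before the descriptor is closed. */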
static void
send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
			if (write(nsd->children[i].child_fd,
				&command,
				sizeof(command)) == -1)
			{
				if(errno != EAGAIN && errno != EINTR)
					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
					(int) command,
					(int) nsd->children[i].pid,
					strerror(errno));
			} else if (timeout > 0) {
				(void)block_read(NULL,
					nsd->children[i].child_fd,
					&command, sizeof(command), timeout);
			}
			fsync(nsd->children[i].child_fd);
			close(nsd->children[i].child_fd);
			nsd->children[i].child_fd = -1;
		}
	}
}

static void
send_children_quit(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
	send_children_command(nsd, NSD_QUIT, 0);
}

static void
send_children_quit_and_wait(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
	send_children_command(nsd, NSD_QUIT_CHILD, 3);
}

#ifdef BIND8_STATS
static void
set_children_stats(struct nsd* nsd)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].need_to_send_STATS = 1;
		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
	}
}
#endif /* BIND8_STATS */

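/* Enable (e.g. EV_PERSIST | EV_READ) or disable (0) all TCP accept
 * handlers.  libevent does not allow changing the event types of an
 * already added event, so each handler is deleted and re-registered. */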
static void
configure_handler_event_types(short event_types)
{
	size_t i;

	for (i = 0; i < tcp_accept_handler_count; ++i) {
		struct event* handler = &tcp_accept_handlers[i].event;
		if(event_types) {
			/* reassign */
			int fd = handler->ev_fd;
			struct event_base* base = handler->ev_base;
			if(tcp_accept_handlers[i].event_added)
				event_del(handler);
			memset(handler, 0, sizeof(*handler));
			event_set(handler, fd, event_types,
				handle_tcp_accept, &tcp_accept_handlers[i]);
			if(event_base_set(base, handler) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_base");
			if(event_add(handler, NULL) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_add");
			tcp_accept_handlers[i].event_added = 1;
		} else {
			/* remove */
			if(tcp_accept_handlers[i].event_added) {
				event_del(handler);
				tcp_accept_handlers[i].event_added = 0;
			}
		}
	}
}
5446