xref: /netbsd/external/bsd/nsd/dist/server.c (revision cb958623)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <limits.h>
15 #include <sys/socket.h>
16 #include <sys/uio.h>
17 #include <sys/wait.h>
18 
19 #include <netinet/in.h>
20 #ifdef USE_TCP_FASTOPEN
21   #include <netinet/tcp.h>
22 #endif
23 #include <arpa/inet.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <signal.h>
36 #include <netdb.h>
37 #include <poll.h>
38 #ifdef HAVE_SYS_RANDOM_H
39 #include <sys/random.h>
40 #endif
41 #ifndef SHUT_WR
42 #define SHUT_WR 1
43 #endif
44 #ifdef HAVE_MMAP
45 #include <sys/mman.h>
46 #endif /* HAVE_MMAP */
47 #ifdef HAVE_OPENSSL_RAND_H
48 #include <openssl/rand.h>
49 #endif
50 #ifdef HAVE_OPENSSL_SSL_H
51 #include <openssl/ssl.h>
52 #endif
53 #ifdef HAVE_OPENSSL_ERR_H
54 #include <openssl/err.h>
55 #endif
56 #ifdef HAVE_OPENSSL_OCSP_H
57 #include <openssl/ocsp.h>
58 #endif
59 #ifndef USE_MINI_EVENT
60 #  ifdef HAVE_EVENT_H
61 #    include <event.h>
62 #  else
63 #    include <event2/event.h>
64 #    include "event2/event_struct.h"
65 #    include "event2/event_compat.h"
66 #  endif
67 #else
68 #  include "mini_event.h"
69 #endif
70 
71 #include "axfr.h"
72 #include "namedb.h"
73 #include "netio.h"
74 #include "xfrd.h"
75 #include "xfrd-tcp.h"
76 #include "xfrd-disk.h"
77 #include "difffile.h"
78 #include "nsec3.h"
79 #include "ipc.h"
80 #include "udb.h"
81 #include "remote.h"
82 #include "lookup3.h"
83 #include "rrl.h"
84 #include "ixfr.h"
85 #ifdef USE_DNSTAP
86 #include "dnstap/dnstap_collector.h"
87 #endif
88 #include "verify.h"
89 
90 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
91 
92 #ifdef USE_DNSTAP
93 /*
94  * log_addr() - print the contents of a sockaddr_in/sockaddr_in6 structure,
95  * just as is done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
96  */
97 static void
98 log_addr(const char* descr,
99 #ifdef INET6
100 	struct sockaddr_storage* addr
101 #else
102 	struct sockaddr_in* addr
103 #endif
104 	)
105 {
106 	char str_buf[64];
107 	if(verbosity < 6)
108 		return;
109 	if(
110 #ifdef INET6
111 		addr->ss_family == AF_INET
112 #else
113 		addr->sin_family == AF_INET
114 #endif
115 		) {
116 		struct sockaddr_in* s = (struct sockaddr_in*)addr;
117 		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
118 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
119 #ifdef INET6
120 	} else {
121 		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
122 		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
123 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
124 #endif
125 	}
126 }
127 #endif /* USE_DNSTAP */
128 
129 #ifdef USE_TCP_FASTOPEN
130   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
131   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
132 #endif
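/*
 * For reference, a worked example of the server bit mask above: the
 * net.ipv4.tcp_fastopen sysctl is a bit field where 0x1 enables client
 * mode and 0x2 enables server mode, so a value of 3 (binary 11) enables
 * both, and (3 & TCP_FASTOPEN_SERVER_BIT_MASK) != 0 passes the check in
 * report_tcp_fastopen_config() below.
 */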
133 
134 /*
135  * Data for the UDP handlers.
136  */
137 struct udp_handler_data
138 {
139 	struct nsd        *nsd;
140 	struct nsd_socket *socket;
141 	struct event       event;
142 };
143 
144 struct tcp_accept_handler_data {
145 	struct nsd        *nsd;
146 	struct nsd_socket *socket;
147 	int                event_added;
148 	struct event       event;
149 #ifdef HAVE_SSL
150 	/* handler accepts TLS connections on the dedicated port */
151 	int                tls_accept;
152 #endif
153 };
154 
155 /*
156  * These globals are used to enable the TCP accept handlers
157  * when the number of TCP connections drops below the maximum
158  * number of TCP connections.
159  */
160 static size_t tcp_accept_handler_count;
161 static struct tcp_accept_handler_data *tcp_accept_handlers;
162 
163 static struct event slowaccept_event;
164 static int slowaccept;
165 
166 #ifdef HAVE_SSL
167 static unsigned char *ocspdata = NULL;
168 static long ocspdata_len = 0;
169 #endif
170 
171 #ifdef NONBLOCKING_IS_BROKEN
172 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
173    read multiple times from a socket when reported ready by select. */
174 # define NUM_RECV_PER_SELECT (1)
175 #else /* !NONBLOCKING_IS_BROKEN */
176 # define NUM_RECV_PER_SELECT (100)
177 #endif /* NONBLOCKING_IS_BROKEN */
178 
179 #ifndef HAVE_MMSGHDR
180 struct mmsghdr {
181 	struct msghdr msg_hdr;
182 	unsigned int  msg_len;
183 };
184 #endif
185 
186 static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
187 static struct iovec iovecs[NUM_RECV_PER_SELECT];
188 static struct query *queries[NUM_RECV_PER_SELECT];
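/*
 * A minimal sketch, assuming Linux recvmmsg(2), of how arrays sized
 * NUM_RECV_PER_SELECT like msgs/iovecs above can drain a ready UDP socket
 * in one call; "fd" and the buffer wiring are hypothetical here:
 *
 *	// beforehand: msgs[i].msg_hdr.msg_iov = &iovecs[i], each iovec
 *	// pointing at the packet buffer of queries[i]
 *	int n = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
 *	for(int i = 0; i < n; i++)
 *		; // msgs[i].msg_len bytes of datagram i are ready to parse
 */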
189 
190 /*
191  * Data for the TCP connection handlers.
192  *
193  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
194  * blocking the entire server on a slow TCP connection, but does make
195  * reading from and writing to the socket more complicated.
196  *
197  * Basically, whenever a read/write would block (indicated by an
198  * EAGAIN errno value) we remember the position we were reading
199  * from/writing to and return from the TCP reading/writing event
200  * handler.  When the socket becomes readable/writable again we
201  * continue from the same position.
202  */
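/*
 * A sketch of the resume-on-EAGAIN pattern described above (illustration
 * only, not NSD code; "pos" stands in for bytes_transmitted):
 *
 *	ssize_t n = read(fd, buf + pos, len - pos);
 *	if(n == -1 && (errno == EAGAIN || errno == EWOULDBLOCK))
 *		return;	// keep pos; wait for the next readable event
 *	pos += n;	// the next invocation continues from here
 */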
203 struct tcp_handler_data
204 {
205 	/*
206 	 * The region used to allocate all TCP connection related
207 	 * data, including this structure.  This region is destroyed
208 	 * when the connection is closed.
209 	 */
210 	region_type*		region;
211 
212 	/*
213 	 * The global nsd structure.
214 	 */
215 	struct nsd*			nsd;
216 
217 	/*
218 	 * The current query data for this TCP connection.
219 	 */
220 	query_type*			query;
221 
222 	/*
223 	 * The query_state is used to remember if we are performing an
224 	 * AXFR, if we're done processing, or if we should discard the
225 	 * query and connection.
226 	 */
227 	query_state_type	query_state;
228 
229 	/*
230 	 * The event for the file descriptor and tcp timeout
231 	 */
232 	struct event event;
233 
234 	/*
235 	 * The bytes_transmitted field is used to remember the number
236 	 * of bytes transmitted when receiving or sending a DNS
237 	 * packet.  The count includes the two additional bytes used
238 	 * to specify the packet length on a TCP connection.
239 	 */
240 	size_t				bytes_transmitted;
241 
242 	/*
243 	 * The number of queries handled by this specific TCP connection.
244 	 */
245 	int					query_count;
246 
247 	/*
248 	 * The timeout in msec for this tcp connection
249 	 */
250 	int	tcp_timeout;
251 
252 	/*
253 	 * If the connection is allowed to have further queries on it.
254 	 */
255 	int tcp_no_more_queries;
256 
257 #ifdef USE_DNSTAP
258 	/* the accept socket, used to find the proper service (local) address this socket is bound to. */
259 	struct nsd_socket *socket;
260 #endif /* USE_DNSTAP */
261 
262 #ifdef HAVE_SSL
263 	/*
264 	 * TLS object.
265 	 */
266 	SSL* tls;
267 
268 	/*
269 	 * TLS handshake state.
270 	 */
271 	enum { tls_hs_none, tls_hs_read, tls_hs_write,
272 		tls_hs_read_event, tls_hs_write_event } shake_state;
273 #endif
274 	/* list of connections, for service of remaining tcp channels */
275 	struct tcp_handler_data *prev, *next;
276 };
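/*
 * For reference, the TCP framing that bytes_transmitted counts: RFC 1035
 * section 4.2.2 prefixes every DNS message on TCP with a two-octet
 * network-order length field.  Illustration only, not NSD code:
 *
 *	uint16_t len = htons(packetlen);
 *	write(fd, &len, sizeof(len));	// the two additional bytes
 *	write(fd, packet, packetlen);	// then the DNS message itself
 */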
277 /* global that is the list of active tcp channels */
278 static struct tcp_handler_data *tcp_active_list = NULL;
279 
280 /*
281  * Handle incoming queries on the UDP server sockets.
282  */
283 static void handle_udp(int fd, short event, void* arg);
284 
285 /*
286  * Handle incoming connections on the TCP sockets.  These handlers
287  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
288  * connection) but are disabled when the number of current TCP
289  * connections is equal to the maximum number of TCP connections.
290  * Disabling is done by changing the handler to wait for the
291  * NETIO_EVENT_NONE type.  This is done using the function
292  * configure_tcp_accept_handlers.
293  */
294 static void handle_tcp_accept(int fd, short event, void* arg);
295 
296 /*
297  * Handle incoming queries on a TCP connection.  The TCP connections
298  * are configured to be non-blocking and the handler may be called
299  * multiple times before a complete query is received.
300  */
301 static void handle_tcp_reading(int fd, short event, void* arg);
302 
303 /*
304  * Handle outgoing responses on a TCP connection.  The TCP connections
305  * are configured to be non-blocking and the handler may be called
306  * multiple times before a complete response is sent.
307  */
308 static void handle_tcp_writing(int fd, short event, void* arg);
309 
310 #ifdef HAVE_SSL
311 /* Create SSL object and associate fd */
312 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
313 /*
314  * Handle TLS handshake. May be called multiple times if incomplete.
315  */
316 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
317 
318 /*
319  * Handle incoming queries on a TLS over TCP connection.  The TLS
320  * connections are configured to be non-blocking and the handler may
321  * be called multiple times before a complete query is received.
322  */
323 static void handle_tls_reading(int fd, short event, void* arg);
324 
325 /*
326  * Handle outgoing responses on a TLS over TCP connection.  The TLS
327  * connections are configured to be non-blocking and the handler may
328  * be called multiple times before a complete response is sent.
329  */
330 static void handle_tls_writing(int fd, short event, void* arg);
331 #endif
332 
333 /*
334  * Send all children the quit command without blocking, then close the pipe.
335  */
336 static void send_children_quit(struct nsd* nsd);
337 /* same, for shutdown time, waits for child to exit to avoid restart issues */
338 static void send_children_quit_and_wait(struct nsd* nsd);
339 
340 /* set children's flags to send NSD_STATS to them */
341 #ifdef BIND8_STATS
342 static void set_children_stats(struct nsd* nsd);
343 #endif /* BIND8_STATS */
344 
345 /*
346  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
347  */
348 static void configure_handler_event_types(short event_types);
349 
350 static uint16_t *compressed_dname_offsets = 0;
351 static uint32_t compression_table_capacity = 0;
352 static uint32_t compression_table_size = 0;
353 static domain_type* compressed_dnames[MAXRRSPP];
354 
355 #ifdef USE_TCP_FASTOPEN
356 /* Checks to see if the kernel value must be manually changed in order for
357    TCP Fast Open to support server mode */
358 static void report_tcp_fastopen_config() {
359 
360 	int tcp_fastopen_fp;
361 	uint8_t tcp_fastopen_value;
362 
363 	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
364 		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
365 		return;
366 	}
367 	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
368 		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
369 		close(tcp_fastopen_fp); return;
370 	}
371 	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
372 		log_msg(LOG_WARNING, "TCP Fast Open support is available and configured in NSD by default.\n");
373 		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
374 		log_msg(LOG_WARNING, "To enable TFO use the command:");
375 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
376 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
377 		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
378 	}
379 	close(tcp_fastopen_fp);
380 }
381 #endif
382 
383 /*
384  * Remove the specified pid from the list of child pids.  Returns -1 if
385  * the pid is not in the list, child_num otherwise.  The field is set to 0.
386  */
387 static int
388 delete_child_pid(struct nsd *nsd, pid_t pid)
389 {
390 	size_t i;
391 	for (i = 0; i < nsd->child_count; ++i) {
392 		if (nsd->children[i].pid == pid) {
393 			nsd->children[i].pid = 0;
394 			if(!nsd->children[i].need_to_exit) {
395 				if(nsd->children[i].child_fd != -1)
396 					close(nsd->children[i].child_fd);
397 				nsd->children[i].child_fd = -1;
398 				if(nsd->children[i].handler)
399 					nsd->children[i].handler->fd = -1;
400 			}
401 			return i;
402 		}
403 	}
404 	return -1;
405 }
406 
407 /*
408  * Restart child servers if necessary.
409  */
410 static int
411 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
412 	int* xfrd_sock_p)
413 {
414 	struct main_ipc_handler_data *ipc_data;
415 	size_t i;
416 	int sv[2];
417 
418 	/* Fork the child processes... */
419 	for (i = 0; i < nsd->child_count; ++i) {
420 		if (nsd->children[i].pid <= 0) {
421 			if (nsd->children[i].child_fd != -1)
422 				close(nsd->children[i].child_fd);
423 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
424 				log_msg(LOG_ERR, "socketpair: %s",
425 					strerror(errno));
426 				return -1;
427 			}
428 			nsd->children[i].child_fd = sv[0];
429 			nsd->children[i].parent_fd = sv[1];
430 			nsd->children[i].pid = fork();
431 			switch (nsd->children[i].pid) {
432 			default: /* SERVER MAIN */
433 				close(nsd->children[i].parent_fd);
434 				nsd->children[i].parent_fd = -1;
435 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
436 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
437 				}
438 				if(!nsd->children[i].handler)
439 				{
440 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
441 						region, sizeof(struct main_ipc_handler_data));
442 					ipc_data->nsd = nsd;
443 					ipc_data->child = &nsd->children[i];
444 					ipc_data->child_num = i;
445 					ipc_data->xfrd_sock = xfrd_sock_p;
446 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
447 					ipc_data->forward_mode = 0;
448 					ipc_data->got_bytes = 0;
449 					ipc_data->total_bytes = 0;
450 					ipc_data->acl_num = 0;
451 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
452 						region, sizeof(struct netio_handler));
453 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
454 					nsd->children[i].handler->timeout = NULL;
455 					nsd->children[i].handler->user_data = ipc_data;
456 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
457 					nsd->children[i].handler->event_handler = parent_handle_child_command;
458 					netio_add_handler(netio, nsd->children[i].handler);
459 				}
460 				/* clear any ongoing ipc */
461 				ipc_data = (struct main_ipc_handler_data*)
462 					nsd->children[i].handler->user_data;
463 				ipc_data->forward_mode = 0;
464 				/* restart - update fd */
465 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
466 				break;
467 			case 0: /* CHILD */
468 				/* the child need not be able to access the
469 				 * nsd.db file */
470 				namedb_close_udb(nsd->db);
471 #ifdef MEMCLEAN /* OS collects memory pages */
472 				region_destroy(region);
473 #endif
474 				nsd->pid = 0;
475 				nsd->child_count = 0;
476 				nsd->server_kind = nsd->children[i].kind;
477 				nsd->this_child = &nsd->children[i];
478 				nsd->this_child->child_num = i;
479 				/* remove signal flags inherited from parent;
480 				   the parent will handle them. */
481 				nsd->signal_hint_reload_hup = 0;
482 				nsd->signal_hint_reload = 0;
483 				nsd->signal_hint_child = 0;
484 				nsd->signal_hint_quit = 0;
485 				nsd->signal_hint_shutdown = 0;
486 				nsd->signal_hint_stats = 0;
487 				nsd->signal_hint_statsusr = 0;
488 				close(*xfrd_sock_p);
489 				close(nsd->this_child->child_fd);
490 				nsd->this_child->child_fd = -1;
491 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
492 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
493 				}
494 				server_child(nsd);
495 				/* NOTREACH */
496 				exit(0);
497 			case -1:
498 				log_msg(LOG_ERR, "fork failed: %s",
499 					strerror(errno));
500 				return -1;
501 			}
502 		}
503 	}
504 	return 0;
505 }
506 
507 #ifdef BIND8_STATS
508 static void set_bind8_alarm(struct nsd* nsd)
509 {
510 	/* resync so that the next alarm is on the next whole period */
511 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
512 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
513 }
514 #endif
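/*
 * A worked example of the resync arithmetic above: if st.period is 3600
 * and time(NULL) % 3600 == 1234, then alarm(3600 - 1234) fires in 2366
 * seconds, exactly on the next whole period boundary.
 */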
515 
516 /* set zone stat ids for zones initially read in */
517 static void
518 zonestatid_tree_set(struct nsd* nsd)
519 {
520 	struct radnode* n;
521 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
522 		zone_type* zone = (zone_type*)n->elem;
523 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
524 	}
525 }
526 
527 #ifdef USE_ZONE_STATS
528 void
529 server_zonestat_alloc(struct nsd* nsd)
530 {
531 	size_t num = (nsd->options->zonestatnames->count==0?1:
532 			nsd->options->zonestatnames->count);
533 	size_t sz = sizeof(struct nsdst)*num;
534 	char tmpfile[256];
535 	uint8_t z = 0;
536 
537 	/* file names */
538 	nsd->zonestatfname[0] = 0;
539 	nsd->zonestatfname[1] = 0;
540 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
541 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
542 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
543 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
544 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
545 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
546 
547 	/* file descriptors */
548 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
549 	if(nsd->zonestatfd[0] == -1) {
550 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
551 			strerror(errno));
552 		exit(1);
553 	}
554 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
555 	if(nsd->zonestatfd[1] == -1) {
556 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
557 			strerror(errno));
558 		close(nsd->zonestatfd[0]);
559 		unlink(nsd->zonestatfname[0]);
560 		exit(1);
561 	}
562 
563 #ifdef HAVE_MMAP
564 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
565 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
566 			strerror(errno));
567 		exit(1);
568 	}
569 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
570 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
571 			nsd->zonestatfname[0], strerror(errno));
572 		exit(1);
573 	}
574 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
575 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
576 			strerror(errno));
577 		exit(1);
578 	}
579 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
580 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
581 			nsd->zonestatfname[1], strerror(errno));
582 		exit(1);
583 	}
584 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
585 		MAP_SHARED, nsd->zonestatfd[0], 0);
586 	if(nsd->zonestat[0] == MAP_FAILED) {
587 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
588 		unlink(nsd->zonestatfname[0]);
589 		unlink(nsd->zonestatfname[1]);
590 		exit(1);
591 	}
592 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
593 		MAP_SHARED, nsd->zonestatfd[1], 0);
594 	if(nsd->zonestat[1] == MAP_FAILED) {
595 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
596 		unlink(nsd->zonestatfname[0]);
597 		unlink(nsd->zonestatfname[1]);
598 		exit(1);
599 	}
600 	memset(nsd->zonestat[0], 0, sz);
601 	memset(nsd->zonestat[1], 0, sz);
602 	nsd->zonestatsize[0] = num;
603 	nsd->zonestatsize[1] = num;
604 	nsd->zonestatdesired = num;
605 	nsd->zonestatsizenow = num;
606 	nsd->zonestatnow = nsd->zonestat[0];
607 #endif /* HAVE_MMAP */
608 }
609 
610 void
611 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
612 {
613 #ifdef HAVE_MMAP
614 #ifdef MREMAP_MAYMOVE
615 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
616 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
617 		MREMAP_MAYMOVE);
618 	if(nsd->zonestat[idx] == MAP_FAILED) {
619 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
620 		exit(1);
621 	}
622 #else /* !MREMAP_MAYMOVE */
623 	if(msync(nsd->zonestat[idx],
624 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
625 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
626 	if(munmap(nsd->zonestat[idx],
627 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
628 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
629 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
630 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
631 	if(nsd->zonestat[idx] == MAP_FAILED) {
632 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
633 		exit(1);
634 	}
635 #endif /* MREMAP_MAYMOVE */
636 #endif /* HAVE_MMAP */
637 }
638 
639 /* realloc the zonestat array for the one that is not currently in use,
640  * to match the desired new size of the array (if applicable) */
641 void
642 server_zonestat_realloc(struct nsd* nsd)
643 {
644 #ifdef HAVE_MMAP
645 	uint8_t z = 0;
646 	size_t sz;
647 	int idx = 0; /* index of the zonestat array that is not in use */
648 	if(nsd->zonestatnow == nsd->zonestat[0])
649 		idx = 1;
650 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
651 		return;
652 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
653 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
654 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
655 			strerror(errno));
656 		exit(1);
657 	}
658 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
659 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
660 			nsd->zonestatfname[idx], strerror(errno));
661 		exit(1);
662 	}
663 	zonestat_remap(nsd, idx, sz);
664 	/* zero the newly allocated region */
665 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
666 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
667 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
668 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
669 	}
670 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
671 #endif /* HAVE_MMAP */
672 }
673 
674 /* switch over to the other array for the new children, which briefly
675  * coexist with the old children; we want to avoid both writing to the
676  * same statistics arrays. */
677 void
678 server_zonestat_switch(struct nsd* nsd)
679 {
680 	if(nsd->zonestatnow == nsd->zonestat[0]) {
681 		nsd->zonestatnow = nsd->zonestat[1];
682 		nsd->zonestatsizenow = nsd->zonestatsize[1];
683 	} else {
684 		nsd->zonestatnow = nsd->zonestat[0];
685 		nsd->zonestatsizenow = nsd->zonestatsize[0];
686 	}
687 }
688 #endif /* USE_ZONE_STATS */
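/*
 * A sketch of how a reload is expected to drive the double buffering
 * above (illustration only; the actual call sites are elsewhere):
 *
 *	server_zonestat_realloc(nsd);	// resize the array NOT in use
 *	server_zonestat_switch(nsd);	// new children write the other array
 */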
689 
690 static void
691 cleanup_dname_compression_tables(void *ptr)
692 {
693 	free(ptr);
694 	compressed_dname_offsets = NULL;
695 	compression_table_capacity = 0;
696 }
697 
698 static void
699 initialize_dname_compression_tables(struct nsd *nsd)
700 {
701 	size_t needed = domain_table_count(nsd->db->domains) + 1;
702 	needed += EXTRA_DOMAIN_NUMBERS;
703 	if(compression_table_capacity < needed) {
704 		if(compressed_dname_offsets) {
705 			region_remove_cleanup(nsd->db->region,
706 				cleanup_dname_compression_tables,
707 				compressed_dname_offsets);
708 			free(compressed_dname_offsets);
709 		}
710 		compressed_dname_offsets = (uint16_t *) xmallocarray(
711 			needed, sizeof(uint16_t));
712 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
713 			compressed_dname_offsets);
714 		compression_table_capacity = needed;
715 		compression_table_size=domain_table_count(nsd->db->domains)+1;
716 	}
717 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
718 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
719 }
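/*
 * For reference (illustration, not NSD code): an offset stored in
 * compressed_dname_offsets becomes a DNS compression pointer, two octets
 * with the top two bits set and the offset in the low 14 bits (RFC 1035
 * section 4.1.4), e.g. for a hypothetical domain number n:
 *
 *	uint16_t ptr = htons(0xC000 | compressed_dname_offsets[n]);
 *	// the two octets of ptr are then written into the response packet
 */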
720 
721 static int
722 set_cloexec(struct nsd_socket *sock)
723 {
724 	assert(sock != NULL);
725 
726 	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
727 		const char *socktype =
728 			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
729 		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
730 			socktype, strerror(errno));
731 		return -1;
732 	}
733 
734 	return 1;
735 }
736 
737 static int
738 set_reuseport(struct nsd_socket *sock)
739 {
740 #ifdef SO_REUSEPORT
741 	int on = 1;
742 #ifdef SO_REUSEPORT_LB
743 	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
744 	 * SO_REUSEPORT on Linux. This is what users want from the config
745 	 * option in nsd.conf; if they actually need local address and port
746 	 * reuse, they will also need SO_REUSEPORT set; assume it is _LB
747 	 * they want.
748 	 */
749 	int opt = SO_REUSEPORT_LB;
750 	static const char optname[] = "SO_REUSEPORT_LB";
751 #else /* !SO_REUSEPORT_LB */
752 	int opt = SO_REUSEPORT;
753 	static const char optname[] = "SO_REUSEPORT";
754 #endif /* SO_REUSEPORT_LB */
755 
756 	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
757 		return 1;
758 	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
759 		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
760 			optname, strerror(errno));
761 	}
762 	return -1;
763 #else
764 	(void)sock;
765 #endif /* SO_REUSEPORT */
766 
767 	return 0;
768 }
769 
770 static int
771 set_reuseaddr(struct nsd_socket *sock)
772 {
773 #ifdef SO_REUSEADDR
774 	int on = 1;
775 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
776 		return 1;
777 	}
778 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
779 		strerror(errno));
780 	return -1;
781 #endif /* SO_REUSEADDR */
782 	return 0;
783 }
784 
785 static int
786 set_rcvbuf(struct nsd_socket *sock, int rcv)
787 {
788 #ifdef SO_RCVBUF
789 #ifdef SO_RCVBUFFORCE
790 	if(0 == setsockopt(
791 		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
792 	{
793 		return 1;
794 	}
795 	if(errno == EPERM || errno == ENOBUFS) {
796 		return 0;
797 	}
798 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
799 		strerror(errno));
800 	return -1;
801 #else /* !SO_RCVBUFFORCE */
802 	if (0 == setsockopt(
803 		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
804 	{
805 		return 1;
806 	}
807 	if(errno == ENOSYS || errno == ENOBUFS) {
808 		return 0;
809 	}
810 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
811 		strerror(errno));
812 	return -1;
813 #endif /* SO_RCVBUFFORCE */
814 #endif /* SO_RCVBUF */
815 
816 	return 0;
817 }
818 
819 static int
820 set_sndbuf(struct nsd_socket *sock, int snd)
821 {
822 #ifdef SO_SNDBUF
823 #ifdef SO_SNDBUFFORCE
824 	if(0 == setsockopt(
825 		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
826 	{
827 		return 1;
828 	}
829 	if(errno == EPERM || errno == ENOBUFS) {
830 		return 0;
831 	}
832 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
833 		strerror(errno));
834 	return -1;
835 #else /* !SO_SNDBUFFORCE */
836 	if(0 == setsockopt(
837 		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
838 	{
839 		return 1;
840 	}
841 	if(errno == ENOSYS || errno == ENOBUFS) {
842 		return 0;
843 	}
844 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
845 		strerror(errno));
846 	return -1;
847 #endif /* SO_SNDBUFFORCE */
848 #endif /* SO_SNDBUF */
849 
850 	return 0;
851 }
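/*
 * A sketch (not NSD code) to observe what the kernel actually granted
 * after the set_rcvbuf()/set_sndbuf() calls above; on Linux the reported
 * value is typically double the requested one, to account for kernel
 * bookkeeping overhead:
 *
 *	int val; socklen_t len = sizeof(val);
 *	if(getsockopt(sock->s, SOL_SOCKET, SO_RCVBUF, &val, &len) == 0)
 *		log_msg(LOG_INFO, "SO_RCVBUF granted: %d", val);
 */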
852 
853 static int
854 set_nonblock(struct nsd_socket *sock)
855 {
856 	const char *socktype =
857 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
858 
859 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
860 		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
861 			socktype, strerror(errno));
862 		return -1;
863 	}
864 
865 	return 1;
866 }
867 
868 #ifdef INET6
869 static int
870 set_ipv6_v6only(struct nsd_socket *sock)
871 {
872 #ifdef IPV6_V6ONLY
873 	int on = 1;
874 	const char *socktype =
875 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
876 
877 	if(0 == setsockopt(
878 		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
879 	{
880 		return 1;
881 	}
882 
883 	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
884 		socktype, strerror(errno));
885 	return -1;
886 #else
887 	(void)sock;
888 #endif /* IPV6_V6ONLY */
889 
890 	return 0;
891 }
892 #endif /* INET6 */
893 
894 #ifdef INET6
895 static int
896 set_ipv6_use_min_mtu(struct nsd_socket *sock)
897 {
898 #if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
899 #if defined(IPV6_USE_MIN_MTU)
900 	/* There is no fragmentation of IPv6 datagrams during forwarding in the
901 	 * network. Therefore we do not send UDP datagrams larger than the
902 	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
903 	 * larger if the network stack supports IPV6_USE_MIN_MTU.
904 	 */
905 	int opt = IPV6_USE_MIN_MTU;
906 	int optval = 1;
907 	static const char optname[] = "IPV6_USE_MIN_MTU";
908 #elif defined(IPV6_MTU)
909 	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
910 	 * to the MIN MTU to get the same.
911 	 */
912 	int opt = IPV6_MTU;
913 	int optval = IPV6_MIN_MTU;
914 	static const char optname[] = "IPV6_MTU";
915 #endif
916 	if(0 == setsockopt(
917 		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
918 	{
919 		return 1;
920 	}
921 
922 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
923 		optname, strerror(errno));
924 	return -1;
925 #else
926 	(void)sock;
927 #endif /* INET6 */
928 
929 	return 0;
930 }
931 #endif /* INET6 */
932 
933 static int
934 set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
935 {
936 	int ret = 0;
937 
938 #if defined(IP_MTU_DISCOVER)
939 	int opt = IP_MTU_DISCOVER;
940 	int optval;
941 # if defined(IP_PMTUDISC_OMIT)
942 	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
943 	 * information and send packets with DF=0. Fragmentation is allowed if
944 	 * and only if the packet size exceeds the outgoing interface MTU or
945 	 * the packet encounters a smaller MTU link in the network. This mitigates
946 	 * DNS fragmentation attacks by preventing forged PMTU information.
947 	 * FreeBSD already has the same semantics without setting the option.
948 	 */
949 	optval = IP_PMTUDISC_OMIT;
950 	if(0 == setsockopt(
951 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
952 	{
953 		return 1;
954 	}
955 
956 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
957 		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
958 # endif /* IP_PMTUDISC_OMIT */
959 # if defined(IP_PMTUDISC_DONT)
960 	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
961 	optval = IP_PMTUDISC_DONT;
962 	if(0 == setsockopt(
963 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
964 	{
965 		return 1;
966 	}
967 
968 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
969 		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
970 # endif
971 	ret = -1;
972 #elif defined(IP_DONTFRAG)
973 	int off = 0;
974 	if (0 == setsockopt(
975 		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
976 	{
977 		return 1;
978 	}
979 
980 	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
981 		strerror(errno));
982 	ret = -1;
983 #else
984 	(void)sock;
985 #endif
986 
987 	return ret;
988 }
989 
990 static int
991 set_ip_freebind(struct nsd_socket *sock)
992 {
993 #ifdef IP_FREEBIND
994 	int on = 1;
995 	const char *socktype =
996 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
997 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
998 	{
999 		return 1;
1000 	}
1001 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
1002 		socktype, strerror(errno));
1003 	return -1;
1004 #else
1005 	(void)sock;
1006 #endif /* IP_FREEBIND */
1007 
1008 	return 0;
1009 }
1010 
1011 static int
1012 set_ip_transparent(struct nsd_socket *sock)
1013 {
1014 	/*
1015 	The scandalous preprocessor blob here calls for some explanation :)
1016 	POSIX does not specify an option to bind non-local IPs, so
1017 	platforms developed several implementation-specific options,
1018 	all set in the same way, but with different names.
1019 	For additional complexity, some platforms manage this setting
1020 	differently for different address families (IPv4 vs IPv6).
1021 	The preprocessor blob below abstracts this variability
1022 	in a way that leaves the C code as lean and clear as possible.
1023 	*/
1024 
1025 #if defined(IP_TRANSPARENT)
1026 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
1027 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1028 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
1029 // as of 2020-01, Linux does not support this on IPv6 programmatically
1030 #elif defined(SO_BINDANY)
1031 #	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
1032 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
1033 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
1034 #elif defined(IP_BINDANY)
1035 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
1036 #	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
1037 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1038 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
1039 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
1040 #endif
1041 
1042 #ifndef NSD_SOCKET_OPTION_TRANSPARENT
1043 	(void)sock;
1044 #else
1045 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
1046 #		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
1047 #	endif
1048 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
1049 #		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
1050 #	endif
1051 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
1052 #		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
1053 #	endif
1054 
1055 	int on = 1;
1056 	const char *socktype =
1057 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1058 	const int is_ip6 = (sock->addr.ai_family == AF_INET6);
1059 
1060 	if(0 == setsockopt(
1061 		sock->s,
1062 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
1063 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
1064 		&on, sizeof(on)))
1065 	{
1066 		return 1;
1067 	}
1068 
1069 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
1070 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
1071 	return -1;
1072 #endif
1073 
1074 	return 0;
1075 }
1076 
1077 static int
1078 set_tcp_maxseg(struct nsd_socket *sock, int mss)
1079 {
1080 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
1081 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
1082 		return 1;
1083 	}
1084 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
1085 		strerror(errno));
1086 	return -1;
1087 #else
1088 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
1089 #endif
1090 	return 0;
1091 }
1092 
1093 #ifdef USE_TCP_FASTOPEN
1094 static int
1095 set_tcp_fastopen(struct nsd_socket *sock)
1096 {
1097 	/* qlen specifies how many outstanding TFO requests to allow. The limit is
1098 	 * a defense against IP spoofing attacks as suggested in RFC7413.
1099 	 */
1100 	int qlen;
1101 
1102 #ifdef __APPLE__
1103 	/* The macOS implementation only supports a qlen of 1 via this call. The
1104 	 * actual value is configured by the net.inet.tcp.fastopen_backlog
1105 	 * kernel parameter.
1106 	 */
1107 	qlen = 1;
1108 #else
1109 	/* 5 is recommended on Linux. */
1110 	qlen = 5;
1111 #endif
1112 	if (0 == setsockopt(
1113 		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
1114 	{
1115 		return 1;
1116 	}
1117 
1118 	if (errno == EPERM) {
1119 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
1120 				 "; this could likely be because sysctl "
1121 				 "net.inet.tcp.fastopen.enabled, "
1122 				 "net.inet.tcp.fastopen.server_enable, or "
1123 				 "net.ipv4.tcp_fastopen is disabled",
1124 			strerror(errno));
1125 	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
1126 	 * disabled, except when verbosity enabled for debugging
1127 	 */
1128 	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
1129 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
1130 			strerror(errno));
1131 	}
1132 
1133 	return (errno == ENOPROTOOPT ? 0 : -1);
1134 }
1135 #endif /* USE_TCP_FASTOPEN */
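/*
 * A minimal client-side sketch to exercise a TFO listener like the one
 * configured above (Linux-specific, illustration only, not NSD code):
 *
 *	// on a fresh, unconnected TCP socket: send SYN together with data;
 *	// without a server cookie the kernel falls back to a normal handshake
 *	sendto(s, query, querylen, MSG_FASTOPEN,
 *		(struct sockaddr*)&addr, addrlen);
 */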
1136 
1137 static int
1138 set_bindtodevice(struct nsd_socket *sock)
1139 {
1140 #if defined(SO_BINDTODEVICE)
1141 	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
1142 		sock->device, strlen(sock->device)) == -1)
1143 	{
1144 		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1145 		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
1146 		return -1;
1147 	}
1148 
1149 	return 1;
1150 #else
1151 	(void)sock;
1152 	return 0;
1153 #endif
1154 }
1155 
1156 static int
1157 set_setfib(struct nsd_socket *sock)
1158 {
1159 #if defined(SO_SETFIB)
1160 	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
1161 	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
1162 	{
1163 		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
1164 		                 "SO_SETFIB", sock->fib, strerror(errno));
1165 		return -1;
1166 	}
1167 
1168 	return 1;
1169 #else
1170 	(void)sock;
1171 	return 0;
1172 #endif
1173 }
1174 
1175 static int
1176 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1177 {
1178 	int rcv = 1*1024*1024, snd = 1*1024*1024;
1179 
1180 	if(-1 == (sock->s = socket(
1181 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1182 	{
1183 #ifdef INET6
1184 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1185 		   (sock->addr.ai_family == AF_INET6) &&
1186 		   (errno == EAFNOSUPPORT))
1187 		{
1188 			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1189 				"not supported");
1190 			return 0;
1191 		}
1192 #endif
1193 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1194 		return -1;
1195 	}
1196 
1197 	set_cloexec(sock);
1198 
1199 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1200 		*reuseport_works = (set_reuseport(sock) == 1);
1201 
1202 	if(nsd->options->receive_buffer_size > 0)
1203 		rcv = nsd->options->receive_buffer_size;
1204 	if(set_rcvbuf(sock, rcv) == -1)
1205 		return -1;
1206 
1207 	if(nsd->options->send_buffer_size > 0)
1208 		snd = nsd->options->send_buffer_size;
1209 	if(set_sndbuf(sock, snd) == -1)
1210 		return -1;
1211 #ifdef INET6
1212 	if(sock->addr.ai_family == AF_INET6) {
1213 		if(set_ipv6_v6only(sock) == -1 ||
1214 		   set_ipv6_use_min_mtu(sock) == -1)
1215 			return -1;
1216 	} else
1217 #endif /* INET6 */
1218 	if(sock->addr.ai_family == AF_INET) {
1219 		if(set_ipv4_no_pmtu_disc(sock) == -1)
1220 			return -1;
1221 	}
1222 
1223 	/* Set socket to non-blocking. Otherwise, on operating systems
1224 	 * with thundering herd problems, the UDP recv could block
1225 	 * after select returns readable.
1226 	 */
1227 	set_nonblock(sock);
1228 
1229 	if(nsd->options->ip_freebind)
1230 		(void)set_ip_freebind(sock);
1231 	if(nsd->options->ip_transparent)
1232 		(void)set_ip_transparent(sock);
1233 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1234 		return -1;
1235 	if(sock->fib != -1 && set_setfib(sock) == -1)
1236 		return -1;
1237 
1238 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1239 		char buf[256];
1240 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1241 		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1242 			buf, strerror(errno));
1243 		return -1;
1244 	}
1245 
1246 	return 1;
1247 }
1248 
1249 static int
1250 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1251 {
1252 #ifdef USE_TCP_FASTOPEN
1253 	report_tcp_fastopen_config();
1254 #endif
1255 
1256 	(void)reuseport_works;
1257 
1258 	if(-1 == (sock->s = socket(
1259 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1260 	{
1261 #ifdef INET6
1262 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1263 		   (sock->addr.ai_family == AF_INET6) &&
1264 		   (errno == EAFNOSUPPORT))
1265 		{
1266 			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1267 			                     "not supported");
1268 			return 0;
1269 		}
1270 #endif /* INET6 */
1271 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1272 		return -1;
1273 	}
1274 
1275 	set_cloexec(sock);
1276 
1277 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1278 		*reuseport_works = (set_reuseport(sock) == 1);
1279 
1280 	(void)set_reuseaddr(sock);
1281 
1282 #ifdef INET6
1283 	if(sock->addr.ai_family == AF_INET6) {
1284 		if (set_ipv6_v6only(sock) == -1 ||
1285 		    set_ipv6_use_min_mtu(sock) == -1)
1286 			return -1;
1287 	}
1288 #endif
1289 
1290 	if(nsd->tcp_mss > 0)
1291 		set_tcp_maxseg(sock, nsd->tcp_mss);
1292 	/* (StevensUNP p463), if the TCP listening socket is blocking, then
1293 	   it may block in accept, even if select() says readable. */
1294 	(void)set_nonblock(sock);
1295 	if(nsd->options->ip_freebind)
1296 		(void)set_ip_freebind(sock);
1297 	if(nsd->options->ip_transparent)
1298 		(void)set_ip_transparent(sock);
1299 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1300 		return -1;
1301 	if(sock->fib != -1 && set_setfib(sock) == -1)
1302 		return -1;
1303 
1304 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1305 		char buf[256];
1306 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1307 		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1308 			buf, strerror(errno));
1309 		return -1;
1310 	}
1311 
1312 #ifdef USE_TCP_FASTOPEN
1313 	(void)set_tcp_fastopen(sock);
1314 #endif
1315 
1316 	if(listen(sock->s, TCP_BACKLOG) == -1) {
1317 		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1318 		return -1;
1319 	}
1320 
1321 	return 1;
1322 }
1323 
1324 /*
1325  * Initialize the server, reuseport, create and bind the sockets.
1326  */
1327 int
1328 server_init(struct nsd *nsd)
1329 {
1330 	size_t i;
1331 	int reuseport = 1; /* Determine if REUSEPORT works. */
1332 
1333 	/* open server interface ports */
1334 	for(i = 0; i < nsd->ifs; i++) {
1335 		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1336 		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1337 		{
1338 			return -1;
1339 		}
1340 	}
1341 
1342 	if(nsd->reuseport && reuseport) {
1343 		size_t ifs = nsd->ifs * nsd->reuseport;
1344 
1345 		/* increase the size of the interface arrays, there are going
1346 		 * to be separate interface file descriptors for every server
1347 		 * instance */
1348 		region_remove_cleanup(nsd->region, free, nsd->udp);
1349 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1350 
1351 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1352 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1353 		region_add_cleanup(nsd->region, free, nsd->udp);
1354 		region_add_cleanup(nsd->region, free, nsd->tcp);
1355 		if(ifs > nsd->ifs) {
1356 			memset(&nsd->udp[nsd->ifs], 0,
1357 				(ifs-nsd->ifs)*sizeof(*nsd->udp));
1358 			memset(&nsd->tcp[nsd->ifs], 0,
1359 				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
1360 		}
1361 
1362 		for(i = nsd->ifs; i < ifs; i++) {
1363 			nsd->udp[i] = nsd->udp[i%nsd->ifs];
1364 			nsd->udp[i].s = -1;
1365 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1366 				return -1;
1367 			}
1368 			/* Turn off REUSEPORT for TCP by copying the socket
1369 			 * file descriptor.
1370 			 * This means we should not close TCP sockets used by
1371 			 * other servers in reuseport enabled mode, in
1372 			 * server_child().
1373 			 */
1374 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1375 		}
1376 
1377 		nsd->ifs = ifs;
1378 	} else {
1379 		nsd->reuseport = 0;
1380 	}
1381 
1382 	/* open server interface ports for verifiers */
1383 	for(i = 0; i < nsd->verify_ifs; i++) {
1384 		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
1385 		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
1386 		{
1387 			return -1;
1388 		}
1389 	}
1390 
1391 	return 0;
1392 }
1393 
1394 /*
1395  * Prepare the server for take off.
1396  *
1397  */
1398 int
1399 server_prepare(struct nsd *nsd)
1400 {
1401 #ifdef RATELIMIT
1402 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
1403 #ifdef HAVE_GETRANDOM
1404 	uint32_t v;
1405 	if(getrandom(&v, sizeof(v), 0) == -1) {
1406 		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
1407 		exit(1);
1408 	}
1409 	hash_set_raninit(v);
1410 #elif defined(HAVE_ARC4RANDOM)
1411 	hash_set_raninit(arc4random());
1412 #else
1413 	uint32_t v = getpid() ^ time(NULL);
1414 	srandom((unsigned long)v);
1415 #  ifdef HAVE_SSL
1416 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1417 		hash_set_raninit(v);
1418 	else
1419 #  endif
1420 		hash_set_raninit(random());
1421 #endif
1422 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1423 		nsd->options->rrl_ratelimit,
1424 		nsd->options->rrl_whitelist_ratelimit,
1425 		nsd->options->rrl_slip,
1426 		nsd->options->rrl_ipv4_prefix_length,
1427 		nsd->options->rrl_ipv6_prefix_length);
1428 #endif /* RATELIMIT */
1429 
1430 	/* Open the database... */
1431 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
1432 		log_msg(LOG_ERR, "unable to open the database %s: %s",
1433 			nsd->dbfile, strerror(errno));
1434 		unlink(nsd->task[0]->fname);
1435 		unlink(nsd->task[1]->fname);
1436 #ifdef USE_ZONE_STATS
1437 		unlink(nsd->zonestatfname[0]);
1438 		unlink(nsd->zonestatfname[1]);
1439 #endif
1440 		xfrd_del_tempdir(nsd);
1441 		return -1;
1442 	}
1443 	/* check if zone files have been modified */
1444 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1445 	 * for all zones */
1446 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
1447 		nsd->options->database[0] == 0))
1448 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1449 	zonestatid_tree_set(nsd);
1450 
1451 	compression_table_capacity = 0;
1452 	initialize_dname_compression_tables(nsd);
1453 
1454 #ifdef	BIND8_STATS
1455 	/* Initialize times... */
1456 	time(&nsd->st.boot);
1457 	set_bind8_alarm(nsd);
1458 #endif /* BIND8_STATS */
1459 
1460 	return 0;
1461 }
1462 
1463 /*
1464  * Fork the required number of servers.
1465  */
1466 static int
1467 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1468 	int* xfrd_sock_p)
1469 {
1470 	size_t i;
1471 
1472 	/* Start all child servers initially.  */
1473 	for (i = 0; i < nsd->child_count; ++i) {
1474 		nsd->children[i].pid = 0;
1475 	}
1476 
1477 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1478 }
1479 
1480 static void
1481 server_close_socket(struct nsd_socket *sock)
1482 {
1483 	if(sock->s != -1) {
1484 		close(sock->s);
1485 		sock->s = -1;
1486 	}
1487 }
1488 
1489 void
1490 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1491 {
1492 	size_t i;
1493 
1494 	/* Close all the sockets... */
1495 	for (i = 0; i < n; ++i) {
1496 		server_close_socket(&sockets[i]);
1497 	}
1498 }
1499 
1500 /*
1501  * Close the sockets, shutdown the server and exit.
1502  * Does not return.
1503  */
1504 void
1505 server_shutdown(struct nsd *nsd)
1506 {
1507 	size_t i;
1508 
1509 	server_close_all_sockets(nsd->udp, nsd->ifs);
1510 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1511 	/* CHILD: close command channel to parent */
1512 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1513 	{
1514 		close(nsd->this_child->parent_fd);
1515 		nsd->this_child->parent_fd = -1;
1516 	}
1517 	/* SERVER: close command channels to children */
1518 	if(!nsd->this_child)
1519 	{
1520 		for(i=0; i < nsd->child_count; ++i)
1521 			if(nsd->children[i].child_fd != -1)
1522 			{
1523 				close(nsd->children[i].child_fd);
1524 				nsd->children[i].child_fd = -1;
1525 			}
1526 	}
1527 
1528 	tsig_finalize();
1529 #ifdef HAVE_SSL
1530 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1531 	if (nsd->tls_ctx)
1532 		SSL_CTX_free(nsd->tls_ctx);
1533 #endif
1534 
1535 #ifdef MEMCLEAN /* OS collects memory pages */
1536 #ifdef RATELIMIT
1537 	rrl_mmap_deinit_keep_mmap();
1538 #endif
1539 #ifdef USE_DNSTAP
1540 	dt_collector_destroy(nsd->dt_collector, nsd);
1541 #endif
1542 	udb_base_free_keep_mmap(nsd->task[0]);
1543 	udb_base_free_keep_mmap(nsd->task[1]);
1544 	namedb_free_ixfr(nsd->db);
1545 	namedb_close_udb(nsd->db); /* keeps mmap */
1546 	namedb_close(nsd->db);
1547 	nsd_options_destroy(nsd->options);
1548 	region_destroy(nsd->region);
1549 #endif
1550 	log_finalize();
1551 	exit(0);
1552 }
1553 
1554 void
1555 server_prepare_xfrd(struct nsd* nsd)
1556 {
1557 	char tmpfile[256];
1558 	/* create task mmaps */
1559 	nsd->mytask = 0;
1560 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1561 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1562 	nsd->task[0] = task_file_create(tmpfile);
1563 	if(!nsd->task[0]) {
1564 #ifdef USE_ZONE_STATS
1565 		unlink(nsd->zonestatfname[0]);
1566 		unlink(nsd->zonestatfname[1]);
1567 #endif
1568 		xfrd_del_tempdir(nsd);
1569 		exit(1);
1570 	}
1571 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1572 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1573 	nsd->task[1] = task_file_create(tmpfile);
1574 	if(!nsd->task[1]) {
1575 		unlink(nsd->task[0]->fname);
1576 #ifdef USE_ZONE_STATS
1577 		unlink(nsd->zonestatfname[0]);
1578 		unlink(nsd->zonestatfname[1]);
1579 #endif
1580 		xfrd_del_tempdir(nsd);
1581 		exit(1);
1582 	}
1583 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1584 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1585 	/* create xfrd listener structure */
1586 	nsd->xfrd_listener = region_alloc(nsd->region,
1587 		sizeof(netio_handler_type));
1588 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1589 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1590 	nsd->xfrd_listener->fd = -1;
1591 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1592 		nsd;
1593 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1594 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1595 }
1596 
1597 
1598 void
1599 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1600 {
1601 	pid_t pid;
1602 	int sockets[2] = {0,0};
1603 	struct ipc_handler_conn_data *data;
1604 
1605 	if(nsd->xfrd_listener->fd != -1)
1606 		close(nsd->xfrd_listener->fd);
1607 	if(del_db) {
1608 		/* recreate taskdb that xfrd was using, it may be corrupt */
1609 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1610 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1611 		nsd->task[1-nsd->mytask]->fname = NULL;
1612 		/* free alloc already, so udb does not shrink itself */
1613 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1614 		nsd->task[1-nsd->mytask]->alloc = NULL;
1615 		udb_base_free(nsd->task[1-nsd->mytask]);
1616 		/* create new file, overwrite the old one */
1617 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1618 		free(tmpfile);
1619 	}
1620 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1621 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1622 		return;
1623 	}
1624 	pid = fork();
1625 	switch (pid) {
1626 	case -1:
1627 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1628 		break;
1629 	default:
1630 		/* PARENT: close first socket, use second one */
1631 		close(sockets[0]);
1632 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1633 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1634 		}
1635 		if(del_db) xfrd_free_namedb(nsd);
1636 		/* use other task than I am using, since if xfrd died and is
1637 		 * restarted, the reload is using nsd->mytask */
1638 		nsd->mytask = 1 - nsd->mytask;
1639 
1640 #ifdef HAVE_SETPROCTITLE
1641 		setproctitle("xfrd");
1642 #endif
1643 #ifdef HAVE_CPUSET_T
1644 		if(nsd->use_cpu_affinity) {
1645 			set_cpu_affinity(nsd->xfrd_cpuset);
1646 		}
1647 #endif
1648 
1649 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1650 		/* ENOTREACH */
1651 		break;
1652 	case 0:
1653 		/* CHILD: close second socket, use first one */
1654 		close(sockets[1]);
1655 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1656 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1657 		}
1658 		nsd->xfrd_listener->fd = sockets[0];
1659 		break;
1660 	}
1661 	/* server-parent only */
1662 	nsd->xfrd_listener->timeout = NULL;
1663 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1664 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1665 	/* clear ongoing ipc reads */
1666 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1667 	data->conn->is_reading = 0;
1668 }
1669 
1670 /** add all soainfo to taskdb */
1671 static void
1672 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1673 {
1674 	struct radnode* n;
1675 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1676 	/* add all SOA INFO to mytask */
1677 	udb_ptr_init(&task_last, taskudb);
1678 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1679 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1680 	}
1681 	udb_ptr_unlink(&task_last, taskudb);
1682 }
1683 
1684 void
1685 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1686 {
1687 	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1688 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1689 	 *   then they exchange and process.
1690 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1691 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1692 	 *   expire notifications can be sent back via a normal reload later
1693 	 *   (xfrd will wait for current running reload to finish if any).
1694 	 */
1695 	sig_atomic_t cmd = 0;
1696 	pid_t mypid;
1697 	int xfrd_sock = nsd->xfrd_listener->fd;
1698 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1699 	udb_ptr t;
1700 	if(!shortsoa) {
1701 		if(nsd->signal_hint_shutdown) {
1702 		shutdown:
1703 			log_msg(LOG_WARNING, "signal received, shutting down...");
1704 			server_close_all_sockets(nsd->udp, nsd->ifs);
1705 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1706 #ifdef HAVE_SSL
1707 			daemon_remote_close(nsd->rc);
1708 #endif
1709 			/* Unlink it if possible... */
1710 			unlinkpid(nsd->pidfile);
1711 			unlink(nsd->task[0]->fname);
1712 			unlink(nsd->task[1]->fname);
1713 #ifdef USE_ZONE_STATS
1714 			unlink(nsd->zonestatfname[0]);
1715 			unlink(nsd->zonestatfname[1]);
1716 #endif
1717 			/* write the nsd.db to disk, wait for it to complete */
1718 			udb_base_sync(nsd->db->udb, 1);
1719 			udb_base_close(nsd->db->udb);
1720 			server_shutdown(nsd);
1721 			/* ENOTREACH */
1722 			exit(0);
1723 		}
1724 	}
1725 	if(shortsoa) {
1726 		/* put SOA in xfrd task because mytask may be in use */
1727 		taskudb = nsd->task[1-nsd->mytask];
1728 	}
1729 
1730 	add_all_soa_to_task(nsd, taskudb);
1731 	if(!shortsoa) {
1732 		/* wait for xfrd to signal task is ready, RELOAD signal */
1733 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1734 			cmd != NSD_RELOAD) {
1735 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1736 			exit(1);
1737 		}
1738 		if(nsd->signal_hint_shutdown) {
1739 			goto shutdown;
1740 		}
1741 	}
1742 	/* give xfrd our task, signal it with RELOAD_DONE */
1743 	task_process_sync(taskudb);
1744 	cmd = NSD_RELOAD_DONE;
1745 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1746 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1747 			(int)nsd->pid, strerror(errno));
1748 	}
1749 	mypid = getpid();
1750 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1751 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1752 			strerror(errno));
1753 	}
1754 
1755 	if(!shortsoa) {
1756 		/* process the xfrd task work (expiry data) */
1757 		nsd->mytask = 1 - nsd->mytask;
1758 		taskudb = nsd->task[nsd->mytask];
1759 		task_remap(taskudb);
1760 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1761 		while(!udb_ptr_is_null(&t)) {
1762 			task_process_expire(nsd->db, TASKLIST(&t));
1763 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1764 		}
1765 		udb_ptr_unlink(&t, taskudb);
1766 		task_clear(taskudb);
1767 
1768 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1769 		cmd = NSD_RELOAD_DONE;
1770 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1771 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1772 				(int)nsd->pid, strerror(errno));
1773 		}
1774 	}
1775 }
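/*
 * Editor's sketch (not part of NSD): the RELOAD_DONE notification used
 * above is always the same two-field sequence, a command followed by the
 * sender's pid. A hypothetical helper capturing the pattern:
 */
#if 0
static void
send_reload_done(int fd)
{
	sig_atomic_t cmd = NSD_RELOAD_DONE;
	pid_t mypid = getpid();
	if(!write_socket(fd, &cmd, sizeof(cmd)))
		log_msg(LOG_ERR, "cannot send reload done: %s", strerror(errno));
	if(!write_socket(fd, &mypid, sizeof(mypid)))
		log_msg(LOG_ERR, "cannot send reload pid: %s", strerror(errno));
}
#endif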
1776 
1777 #ifdef HAVE_SSL
1778 static void
1779 log_crypto_from_err(const char* str, unsigned long err)
1780 {
1781 	/* error:[error code]:[library name]:[function name]:[reason string] */
1782 	char buf[128];
1783 	unsigned long e;
1784 	ERR_error_string_n(err, buf, sizeof(buf));
1785 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1786 	while( (e=ERR_get_error()) ) {
1787 		ERR_error_string_n(e, buf, sizeof(buf));
1788 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1789 	}
1790 }
1791 
1792 void
1793 log_crypto_err(const char* str)
1794 {
1795 	log_crypto_from_err(str, ERR_get_error());
1796 }
1797 
1798 /** true if the ssl handshake error has to be squelched from the logs */
1799 static int
1800 squelch_err_ssl_handshake(unsigned long err)
1801 {
1802 	if(verbosity >= 3)
1803 		return 0; /* only squelch on low verbosity */
1804 	/* this is very specific, we could filter on ERR_GET_REASON()
1805 	 * (the third element in ERR_PACK) */
1806 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1807 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1808 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1809 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1810 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1811 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1812 #endif
1813 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1814 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1815 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1816 #  ifdef SSL_R_VERSION_TOO_LOW
1817 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1818 #  endif
1819 #endif
1820 		)
1821 		return 1;
1822 	return 0;
1823 }
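/*
 * Editor's sketch (not part of NSD): the comment above notes the filter
 * could instead key on ERR_GET_REASON(), squelching the reason codes
 * regardless of which OpenSSL function reported them:
 */
#if 0
static int
squelch_by_reason(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	switch(ERR_GET_REASON(err)) {
	case SSL_R_HTTP_REQUEST:
	case SSL_R_HTTPS_PROXY_REQUEST:
	case SSL_R_WRONG_VERSION_NUMBER:
		return 1;
	}
	return 0;
}
#endif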
1824 
1825 void
1826 perform_openssl_init(void)
1827 {
1828 	/* init SSL library */
1829 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1830 	ERR_load_crypto_strings();
1831 #endif
1832 #if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
1833 	ERR_load_SSL_strings();
1834 #endif
1835 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1836 	OpenSSL_add_all_algorithms();
1837 #else
1838 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1839 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1840 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1841 #endif
1842 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1843 	(void)SSL_library_init();
1844 #else
1845 	OPENSSL_init_ssl(0, NULL);
1846 #endif
1847 
1848 	if(!RAND_status()) {
1849 		/* try to seed it */
1850 		unsigned char buf[256];
1851 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1852 		size_t i;
1853 		v = seed;
1854 		for(i=0; i<256/sizeof(v); i++) {
1855 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1856 			v = v*seed + (unsigned int)i;
1857 		}
1858 		RAND_seed(buf, 256);
1859 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1860 	}
1861 }
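/*
 * Editor's note: the fallback above seeds from time and pid only, which is
 * weak entropy. A sketch of a stronger seed via getrandom(2); the
 * HAVE_GETRANDOM guard is an assumption about the configure script:
 */
#if 0
#ifdef HAVE_GETRANDOM
	unsigned char rnd[32];
	if(getrandom(rnd, sizeof(rnd), 0) == (ssize_t)sizeof(rnd))
		RAND_seed(rnd, sizeof(rnd));
#endif
#endif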
1862 
1863 static int
1864 get_ocsp(char *filename, unsigned char **ocsp)
1865 {
1866 	BIO *bio;
1867 	OCSP_RESPONSE *response;
1868 	int len = -1;
1869 	unsigned char *p, *buf;
1870 	assert(filename);
1871 
1872 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1873 		log_crypto_err("get_ocsp: BIO_new_file failed");
1874 		return -1;
1875 	}
1876 
1877 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1878 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1879 		BIO_free(bio);
1880 		return -1;
1881 	}
1882 
1883 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1884 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1885 		OCSP_RESPONSE_free(response);
1886 		BIO_free(bio);
1887 		return -1;
1888 	}
1889 
1890 	if ((buf = malloc((size_t) len)) == NULL) {
1891 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1892 		OCSP_RESPONSE_free(response);
1893 		BIO_free(bio);
1894 		return -1;
1895 	}
1896 
1897 	p = buf;
1898 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1899 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1900 		free(buf);
1901 		OCSP_RESPONSE_free(response);
1902 		BIO_free(bio);
1903 		return -1;
1904 	}
1905 
1906 	OCSP_RESPONSE_free(response);
1907 	BIO_free(bio);
1908 
1909 	*ocsp = buf;
1910 	return len;
1911 }
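/*
 * Editor's usage sketch (the file path is hypothetical): get_ocsp()
 * returns the DER length and hands the malloc'ed buffer to the caller,
 * who must free it.
 */
#if 0
	unsigned char *der = NULL;
	int derlen = get_ocsp("/etc/nsd/ocsp.der", &der);
	if (derlen > 0) {
		/* use der[0..derlen-1], then release it */
		free(der);
	}
#endif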
1912 
1913 /* further setup ssl ctx after the keys are loaded */
1914 static void
1915 listen_sslctx_setup_2(void* ctxt)
1916 {
1917 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
1918 	(void)ctx;
1919 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1920 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1921 		/* NOTREACHED */
1922 		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
1923 	}
1924 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1925 	if(1) {
1926 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1927 		if (!ecdh) {
1928 			log_crypto_err("could not find p256, not enabling ECDHE");
1929 		} else {
1930 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1931 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1932 			}
1933 			EC_KEY_free (ecdh);
1934 		}
1935 	}
1936 #endif
1937 }
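/*
 * Editor's note: on OpenSSL 1.1.1 and later the curve selection can also
 * be made explicit. A sketch, assuming configure defines
 * HAVE_SSL_CTX_SET1_GROUPS_LIST when the call is available:
 */
#if 0
#ifdef HAVE_SSL_CTX_SET1_GROUPS_LIST
	if(!SSL_CTX_set1_groups_list(ctx, "X25519:P-256"))
		log_crypto_err("could not set groups list");
#endif
#endif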
1938 
1939 static int
1940 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1941 {
1942 	if(ocspdata) {
1943 		unsigned char *p;
1944 		if ((p=malloc(ocspdata_len)) == NULL) {
1945 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1946 			return SSL_TLSEXT_ERR_NOACK;
1947 		}
1948 		memcpy(p, ocspdata, ocspdata_len);
1949 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1950 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1951 			free(p);
1952 			return SSL_TLSEXT_ERR_NOACK;
1953 		}
1954 		return SSL_TLSEXT_ERR_OK;
1955 	} else {
1956 		return SSL_TLSEXT_ERR_NOACK;
1957 	}
1958 }
1959 
1960 SSL_CTX*
1961 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1962 {
1963 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1964 	if(!ctx) {
1965 		log_crypto_err("could not SSL_CTX_new");
1966 		return NULL;
1967 	}
1968 	/* no SSLv2, SSLv3 because they have defects */
1969 #if SSL_OP_NO_SSLv2 != 0
1970 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1971 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1972 		SSL_CTX_free(ctx);
1973 		return NULL;
1974 	}
1975 #endif
1976 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1977 		!= SSL_OP_NO_SSLv3){
1978 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1979 		SSL_CTX_free(ctx);
1980 		return NULL;
1981 	}
1982 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1983 	/* if we have tls 1.1 disable 1.0 */
1984 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1985 		!= SSL_OP_NO_TLSv1){
1986 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1987 		SSL_CTX_free(ctx);
1988 		return NULL;
1989 	}
1990 #endif
1991 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1992 	/* if we have tls 1.2 disable 1.1 */
1993 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1994 		!= SSL_OP_NO_TLSv1_1){
1995 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
1996 		SSL_CTX_free(ctx);
1997 		return NULL;
1998 	}
1999 #endif
2000 #if defined(SSL_OP_NO_RENEGOTIATION)
2001 	/* disable client renegotiation */
2002 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
2003 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
2004 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
2005 		SSL_CTX_free(ctx);
2006 		return NULL;
2007 	}
2008 #endif
2009 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2010 	/* if we detect system-wide crypto policies, use those */
2011 	if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) {
2012 		/* if we have sha256, set the cipher list to have no known vulns */
2013 		if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2014 			log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2015 	}
2016 #endif
2017 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2018 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2019 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
2020 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2021 		SSL_CTX_free(ctx);
2022 		return NULL;
2023 	}
2024 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2025 	SSL_CTX_set_security_level(ctx, 0);
2026 #endif
2027 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2028 		log_msg(LOG_ERR, "error for cert file: %s", pem);
2029 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2030 		SSL_CTX_free(ctx);
2031 		return NULL;
2032 	}
2033 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2034 		log_msg(LOG_ERR, "error for private key file: %s", key);
2035 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2036 		SSL_CTX_free(ctx);
2037 		return NULL;
2038 	}
2039 	if(!SSL_CTX_check_private_key(ctx)) {
2040 		log_msg(LOG_ERR, "error for key file: %s", key);
2041 		log_crypto_err("Error in SSL_CTX check_private_key");
2042 		SSL_CTX_free(ctx);
2043 		return NULL;
2044 	}
2045 	listen_sslctx_setup_2(ctx);
2046 	if(verifypem && verifypem[0]) {
2047 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2048 			log_crypto_err("Error in SSL_CTX verify locations");
2049 			SSL_CTX_free(ctx);
2050 			return NULL;
2051 		}
2052 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2053 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2054 	}
2055 	return ctx;
2056 }
2057 
2058 SSL_CTX*
2059 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2060 {
2061 	char *key, *pem;
2062 	SSL_CTX *ctx;
2063 
2064 	key = nsd->options->tls_service_key;
2065 	pem = nsd->options->tls_service_pem;
2066 	if(!key || key[0] == 0) {
2067 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2068 		return NULL;
2069 	}
2070 	if(!pem || pem[0] == 0) {
2071 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2072 		return NULL;
2073 	}
2074 
2075 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL, but
2076 	 * draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
2077 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2078 	if(!ctx) {
2079 		log_msg(LOG_ERR, "could not setup server TLS context");
2080 		return NULL;
2081 	}
2082 	if(ocspfile && ocspfile[0]) {
2083 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2084 			log_crypto_err("Error reading OCSPfile");
2085 			SSL_CTX_free(ctx);
2086 			return NULL;
2087 		} else {
2088 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2089 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2090 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2091 				SSL_CTX_free(ctx);
2092 				return NULL;
2093 			}
2094 		}
2095 	}
2096 	return ctx;
2097 }
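/*
 * Editor's usage sketch: a caller stores the result in nsd->tls_ctx (the
 * field consulted by add_tcp_handler() below); the variable names here
 * are illustrative.
 */
#if 0
	nsd->tls_ctx = server_tls_ctx_create(nsd, verifypem, ocspfile);
	if(!nsd->tls_ctx)
		log_msg(LOG_ERR, "could not set up TLS context");
#endif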
2098 
2099 /* check if tcp_handler_accept_data created for TLS dedicated port */
2100 int
2101 using_tls_port(struct sockaddr* addr, const char* tls_port)
2102 {
2103 	in_port_t port = 0;
2104 
2105 	if (addr->sa_family == AF_INET)
2106 		port = ((struct sockaddr_in*)addr)->sin_port;
2107 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2108 	else
2109 		port = ((struct sockaddr_in6*)addr)->sin6_port;
2110 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2111 	if (atoi(tls_port) == ntohs(port))
2112 		return 1;
2113 
2114 	return 0;
2115 }
2116 #endif
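/*
 * Editor's note: atoi() above cannot flag a malformed tls_port string.
 * A stricter comparison (sketch, not NSD code) would parse with strtol():
 */
#if 0
	char *end;
	long cfg = strtol(tls_port, &end, 10);
	if(*tls_port && !*end && cfg > 0 && cfg <= 65535
		&& (in_port_t)cfg == ntohs(port))
		return 1;
#endif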
2117 
2118 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
2119 ssize_t
2120 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2121 {
2122 	uint8_t* buf = (uint8_t*) p;
2123 	ssize_t total = 0;
2124 	struct pollfd fd;
2125 	memset(&fd, 0, sizeof(fd));
2126 	fd.fd = s;
2127 	fd.events = POLLIN;
2128 
2129 	while( total < sz) {
2130 		ssize_t ret;
2131 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2132 		if(ret == -1) {
2133 			if(errno == EAGAIN)
2134 				/* blocking read */
2135 				continue;
2136 			if(errno == EINTR) {
2137 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2138 					return -1;
2139 				/* other signals can be handled later */
2140 				continue;
2141 			}
2142 			/* some error */
2143 			return -1;
2144 		}
2145 		if(ret == 0) {
2146 			/* operation timed out */
2147 			return -2;
2148 		}
2149 		ret = read(s, buf+total, sz-total);
2150 		if(ret == -1) {
2151 			if(errno == EAGAIN)
2152 				/* blocking read */
2153 				continue;
2154 			if(errno == EINTR) {
2155 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2156 					return -1;
2157 				/* other signals can be handled later */
2158 				continue;
2159 			}
2160 			/* some error */
2161 			return -1;
2162 		}
2163 		if(ret == 0) {
2164 			/* closed connection! */
2165 			return 0;
2166 		}
2167 		total += ret;
2168 	}
2169 	return total;
2170 }
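/*
 * Editor's usage sketch: reading one IPC command with a 5 second timeout
 * and distinguishing the four documented outcomes (fd and handle_command
 * are hypothetical).
 */
#if 0
	sig_atomic_t cmd;
	ssize_t r = block_read(nsd, fd, &cmd, sizeof(cmd), 5);
	if(r == sizeof(cmd))
		handle_command(cmd);	/* full read */
	else if(r == 0)
		;	/* peer closed the socket */
	else if(r == -2)
		;	/* timed out */
	else
		;	/* error, or quit/shutdown signal hint */
#endif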
2171 
2172 static void
2173 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2174 {
2175 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2176 	udb_ptr t, next;
2177 	udb_base* u = nsd->task[nsd->mytask];
2178 	udb_ptr_init(&next, u);
2179 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2180 	udb_base_set_userdata(u, 0);
2181 	while(!udb_ptr_is_null(&t)) {
2182 		/* store next in list so this one can be deleted or reused */
2183 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2184 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2185 
2186 		/* process task t */
2187 		/* append results for task t and update last_task */
2188 		task_process_in_reload(nsd, u, last_task, &t);
2189 
2190 		/* go to next */
2191 		udb_ptr_set_ptr(&t, u, &next);
2192 
2193 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2194 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2195 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2196 			if(cmd == NSD_QUIT) {
2197 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2198 				/* sync to disk (if needed) */
2199 				udb_base_sync(nsd->db->udb, 0);
2200 				/* unlink files of remainder of tasks */
2201 				while(!udb_ptr_is_null(&t)) {
2202 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2203 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2204 					}
2205 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2206 				}
2207 				udb_ptr_unlink(&t, u);
2208 				udb_ptr_unlink(&next, u);
2209 				exit(0);
2210 			}
2211 		}
2212 
2213 	}
2214 	udb_ptr_unlink(&t, u);
2215 	udb_ptr_unlink(&next, u);
2216 }
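/*
 * Editor's note: the loop above uses the save-next-before-processing
 * idiom, because task_process_in_reload() may delete or reuse the current
 * node. The generic shape of the idiom (names hypothetical):
 */
#if 0
	for(cur = head; cur != NULL; cur = nxt) {
		nxt = cur->next;	/* save before cur may be freed */
		process(cur);		/* may free or reuse cur */
	}
#endif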
2217 
2218 #ifdef BIND8_STATS
2219 static void
2220 parent_send_stats(struct nsd* nsd, int cmdfd)
2221 {
2222 	size_t i;
2223 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2224 		log_msg(LOG_ERR, "could not write stats to reload");
2225 		return;
2226 	}
2227 	for(i=0; i<nsd->child_count; i++)
2228 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2229 			sizeof(stc_type))) {
2230 			log_msg(LOG_ERR, "could not write stats to reload");
2231 			return;
2232 		}
2233 }
2234 
2235 static void
2236 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2237 {
2238 	struct nsdst s;
2239 	stc_type* p;
2240 	size_t i;
2241 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2242 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2243 		log_msg(LOG_ERR, "could not read stats from oldpar");
2244 		return;
2245 	}
2246 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2247 	s.db_mem = region_get_mem(nsd->db->region);
2248 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2249 		nsd->child_count);
2250 	if(!p) return;
2251 	for(i=0; i<nsd->child_count; i++) {
2252 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2253 			sizeof(stc_type))
2254 			return;
2255 	}
2256 }
2257 #endif /* BIND8_STATS */
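/*
 * Editor's note: the stats handover between parent_send_stats() and
 * reload_do_stats() is a fixed wire layout, conceptually (struct name and
 * array bound are hypothetical; there are child_count counters):
 */
#if 0
struct stats_handover {
	struct nsdst totals;	/* written first */
	stc_type per_child[1];	/* then one counter per child */
};
#endif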
2258 
2259 void server_verify(struct nsd *nsd, int cmdsocket);
2260 
2261 /*
2262  * Reload the database, stop the parent, re-fork the children and
2263  * continue as server_main.
2264  */
2265 static void
2266 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2267 	int cmdsocket)
2268 {
2269 	pid_t mypid;
2270 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2271 	int ret;
2272 	udb_ptr last_task;
2273 	struct sigaction old_sigchld, ign_sigchld;
2274 	struct radnode* node;
2275 	zone_type* zone;
2276 	enum soainfo_hint hint;
2277 	/* ignore SIGCHLD from the previous server_main that used this pid */
2278 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2279 	ign_sigchld.sa_handler = SIG_IGN;
2280 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2281 
2282 #ifdef HAVE_SETPROCTITLE
2283 	setproctitle("main");
2284 #endif
2285 #ifdef HAVE_CPUSET_T
2286 	if(nsd->use_cpu_affinity) {
2287 		set_cpu_affinity(nsd->cpuset);
2288 	}
2289 #endif
2290 
2291 	/* see what tasks we got from xfrd */
2292 	task_remap(nsd->task[nsd->mytask]);
2293 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2294 	udb_compact_inhibited(nsd->db->udb, 1);
2295 	reload_process_tasks(nsd, &last_task, cmdsocket);
2296 	udb_compact_inhibited(nsd->db->udb, 0);
2297 	udb_compact(nsd->db->udb);
2298 
2299 #ifndef NDEBUG
2300 	if(nsd_debug_level >= 1)
2301 		region_log_stats(nsd->db->region);
2302 #endif /* NDEBUG */
2303 	/* sync to disk (if needed) */
2304 	udb_base_sync(nsd->db->udb, 0);
2305 
2306 	initialize_dname_compression_tables(nsd);
2307 
2308 #ifdef BIND8_STATS
2309 	/* Restart dumping stats if required.  */
2310 	time(&nsd->st.boot);
2311 	set_bind8_alarm(nsd);
2312 #endif
2313 #ifdef USE_ZONE_STATS
2314 	server_zonestat_realloc(nsd); /* realloc for new children */
2315 	server_zonestat_switch(nsd);
2316 #endif
2317 
2318 	if(nsd->options->verify_enable) {
2319 #ifdef RATELIMIT
2320 		/* allocate resources for rate limiting. use a slot that is guaranteed
2321 		   not mapped to a file so no persistent data is overwritten */
2322 		rrl_init(nsd->child_count + 1);
2323 #endif
2324 
2325 		/* spin up a server and execute verifiers for each zone */
2326 		server_verify(nsd, cmdsocket);
2327 #ifdef RATELIMIT
2328 		/* deallocate rate limiting resources */
2329 		rrl_deinit(nsd->child_count + 1);
2330 #endif
2331 	}
2332 
2333 	for(node = radix_first(nsd->db->zonetree);
2334 	    node != NULL;
2335 	    node = radix_next(node))
2336 	{
2337 		zone = (zone_type *)node->elem;
2338 		if(zone->is_updated) {
2339 			if(zone->is_bad) {
2340 				nsd->mode = NSD_RELOAD_FAILED;
2341 				hint = soainfo_bad;
2342 			} else {
2343 				hint = soainfo_ok;
2344 			}
2345 			/* update(s), verified or not, possibly with subsequent
2346 			   skipped update(s). skipped update(s) are picked up
2347 			   by failed update check in xfrd */
2348 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2349 			                 zone, hint);
2350 		} else if(zone->is_skipped) {
2351 			/* corrupt or inconsistent update without preceding
2352 			   update(s), communicate soainfo_gone */
2353 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2354 			                 zone, soainfo_gone);
2355 		}
2356 		zone->is_updated = 0;
2357 		zone->is_skipped = 0;
2358 	}
2359 
2360 	if(nsd->mode == NSD_RELOAD_FAILED) {
2361 		exit(NSD_RELOAD_FAILED);
2362 	}
2363 
2364 	/* listen for the signals of failed children again */
2365 	sigaction(SIGCHLD, &old_sigchld, NULL);
2366 #ifdef USE_DNSTAP
2367 	if (nsd->dt_collector) {
2368 		int *swap_fd_send;
2369 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2370 		/* Swap fd_send with fd_swap so the old serve children and new
2371 		 * serve children will not write to the same pipe ends simultaneously */
2372 		swap_fd_send = nsd->dt_collector_fd_send;
2373 		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2374 		nsd->dt_collector_fd_swap = swap_fd_send;
2375 
2376 	}
2377 #endif
2378 	/* Start new child processes */
2379 	if (server_start_children(nsd, server_region, netio, &nsd->
2380 		xfrd_listener->fd) != 0) {
2381 		send_children_quit(nsd);
2382 		exit(1);
2383 	}
2384 
2385 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2386 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2387 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2388 		if(cmd == NSD_QUIT) {
2389 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2390 			send_children_quit(nsd);
2391 			exit(0);
2392 		}
2393 	}
2394 
2395 	/* Send quit command to parent: blocking, wait for receipt. */
2396 	do {
2397 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2398 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2399 		{
2400 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2401 				strerror(errno));
2402 		}
2403 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2404 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2405 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2406 			RELOAD_SYNC_TIMEOUT);
2407 		if(ret == -2) {
2408 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2409 		}
2410 	} while (ret == -2);
2411 	if(ret == -1) {
2412 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2413 			strerror(errno));
2414 	}
2415 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2416 	if(cmd == NSD_QUIT) {
2417 		/* small race condition possible here, parent got quit cmd. */
2418 		send_children_quit(nsd);
2419 		exit(1);
2420 	}
2421 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2422 #ifdef BIND8_STATS
2423 	reload_do_stats(cmdsocket, nsd, &last_task);
2424 #endif
2425 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2426 	task_process_sync(nsd->task[nsd->mytask]);
2427 #ifdef USE_ZONE_STATS
2428 	server_zonestat_realloc(nsd); /* realloc for next children */
2429 #endif
2430 
2431 	/* send soainfo to the xfrd process, signal it that reload is done,
2432 	 * it picks up the taskudb */
2433 	cmd = NSD_RELOAD_DONE;
2434 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2435 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2436 			strerror(errno));
2437 	}
2438 	mypid = getpid();
2439 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2440 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2441 			strerror(errno));
2442 	}
2443 
2444 	/* try to reopen file */
2445 	if (nsd->file_rotation_ok)
2446 		log_reopen(nsd->log_filename, 1);
2447 	/* exit reload, continue as new server_main */
2448 }
2449 
2450 /*
2451  * Get the mode depending on the signal hints that have been received.
2452  * Multiple signal hints can be received and will be handled in turn.
2453  */
2454 static sig_atomic_t
2455 server_signal_mode(struct nsd *nsd)
2456 {
2457 	if(nsd->signal_hint_quit) {
2458 		nsd->signal_hint_quit = 0;
2459 		return NSD_QUIT;
2460 	}
2461 	else if(nsd->signal_hint_shutdown) {
2462 		nsd->signal_hint_shutdown = 0;
2463 		return NSD_SHUTDOWN;
2464 	}
2465 	else if(nsd->signal_hint_child) {
2466 		nsd->signal_hint_child = 0;
2467 		return NSD_REAP_CHILDREN;
2468 	}
2469 	else if(nsd->signal_hint_reload) {
2470 		nsd->signal_hint_reload = 0;
2471 		return NSD_RELOAD;
2472 	}
2473 	else if(nsd->signal_hint_reload_hup) {
2474 		nsd->signal_hint_reload_hup = 0;
2475 		return NSD_RELOAD_REQ;
2476 	}
2477 	else if(nsd->signal_hint_stats) {
2478 		nsd->signal_hint_stats = 0;
2479 #ifdef BIND8_STATS
2480 		set_bind8_alarm(nsd);
2481 #endif
2482 		return NSD_STATS;
2483 	}
2484 	else if(nsd->signal_hint_statsusr) {
2485 		nsd->signal_hint_statsusr = 0;
2486 		return NSD_STATS;
2487 	}
2488 	return NSD_RUN;
2489 }
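/*
 * Editor's sketch (not the actual NSD handler): the signal handlers only
 * set the hint flags checked above, deferring real work to the main loop;
 * this assumes the global struct nsd instance defined in nsd.c.
 */
#if 0
static void
hint_handler(int sig)
{
	switch(sig) {
	case SIGHUP:	nsd.signal_hint_reload_hup = 1; break;
	case SIGCHLD:	nsd.signal_hint_child = 1; break;
	case SIGTERM:	nsd.signal_hint_shutdown = 1; break;
	}
}
#endif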
2490 
2491 /*
2492  * The main server simply waits for signals and child processes to
2493  * terminate.  Child processes are restarted as necessary.
2494  */
2495 void
2496 server_main(struct nsd *nsd)
2497 {
2498 	region_type *server_region = region_create(xalloc, free);
2499 	netio_type *netio = netio_create(server_region);
2500 	netio_handler_type reload_listener;
2501 	int reload_sockets[2] = {-1, -1};
2502 	struct timespec timeout_spec;
2503 	int status;
2504 	pid_t child_pid;
2505 	pid_t reload_pid = -1;
2506 	sig_atomic_t mode;
2507 
2508 	/* Ensure we are the main process */
2509 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2510 
2511 	/* Add listener for the XFRD process */
2512 	netio_add_handler(netio, nsd->xfrd_listener);
2513 
2514 	/* Start the child processes that handle incoming queries */
2515 	if (server_start_children(nsd, server_region, netio,
2516 		&nsd->xfrd_listener->fd) != 0) {
2517 		send_children_quit(nsd);
2518 		exit(1);
2519 	}
2520 	reload_listener.fd = -1;
2521 
2522 	/* This_child MUST be 0, because this is the parent process */
2523 	assert(nsd->this_child == 0);
2524 
2525 	/* Run the server until we get a shutdown signal */
2526 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2527 		/* Did we receive a signal that changes our mode? */
2528 		if(mode == NSD_RUN) {
2529 			nsd->mode = mode = server_signal_mode(nsd);
2530 		}
2531 
2532 		switch (mode) {
2533 		case NSD_RUN:
2534 			/* see if any child processes terminated */
2535 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2536 				int is_child = delete_child_pid(nsd, child_pid);
2537 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2538 					if(nsd->children[is_child].child_fd == -1)
2539 						nsd->children[is_child].has_exited = 1;
2540 					parent_check_all_children_exited(nsd);
2541 				} else if(is_child != -1) {
2542 					log_msg(LOG_WARNING,
2543 					       "server %d died unexpectedly with status %d, restarting",
2544 					       (int) child_pid, status);
2545 					restart_child_servers(nsd, server_region, netio,
2546 						&nsd->xfrd_listener->fd);
2547 				} else if (child_pid == reload_pid) {
2548 					sig_atomic_t cmd = NSD_RELOAD_FAILED;
2549 					pid_t mypid;
2550 					log_msg(LOG_WARNING,
2551 					       "Reload process %d failed with status %d, continuing with old database",
2552 					       (int) child_pid, status);
2553 					reload_pid = -1;
2554 					if(reload_listener.fd != -1) close(reload_listener.fd);
2555 					netio_remove_handler(netio, &reload_listener);
2556 					reload_listener.fd = -1;
2557 					reload_listener.event_types = NETIO_EVENT_NONE;
2558 					task_process_sync(nsd->task[nsd->mytask]);
2559 					/* inform xfrd reload attempt ended */
2560 					if(!write_socket(nsd->xfrd_listener->fd,
2561 						&cmd, sizeof(cmd))) {
2562 						log_msg(LOG_ERR, "problems "
2563 						  "sending SOAEND to xfrd: %s",
2564 						  strerror(errno));
2565 					}
2566 					mypid = getpid();
2567 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2568 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2569 							strerror(errno));
2570 					}
2571 #ifdef USE_DNSTAP
2572 				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2573 					log_msg(LOG_WARNING,
2574 					       "dnstap-collector %d terminated with status %d",
2575 					       (int) child_pid, status);
2576 					if(nsd->dt_collector) {
2577 						dt_collector_close(nsd->dt_collector, nsd);
2578 						dt_collector_destroy(nsd->dt_collector, nsd);
2579 						nsd->dt_collector = NULL;
2580 					}
2581 					/* Only respawn a crashed (or exited)
2582 					 * dnstap-collector when not reloading,
2583 					 * to not induce a reload during a
2584 					 * reload (which would seriously
2585 					 * disrupt nsd procedures and lead to
2586 					 * unpredictable results)!
2587 					 *
2588 					 * This will *leave* a dnstap-collector
2589 					 * process terminated, but because
2590 					 * signalling of the reload process to
2591 					 * the main process to respawn in this
2592 					 * situation will be cumbersome, and
2593 					 * because this situation is so
2594 					 * specific (and therefore hopefully
2595 					 * extremely rare or non-existing at
2596 					 * all), plus the fact that we are left
2597 					 * with a perfectly functional NSD
2598 					 * (besides not logging dnstap
2599 					 * messages), I consider it acceptable
2600 					 * to leave this unresolved.
2601 					 */
2602 					if(reload_pid == -1 && nsd->options->dnstap_enable) {
2603 						nsd->dt_collector = dt_collector_create(nsd);
2604 						dt_collector_start(nsd->dt_collector, nsd);
2605 						nsd->mode = NSD_RELOAD_REQ;
2606 					}
2607 #endif
2608 				} else if(status != 0) {
2609 					/* check the status, because we also
2610 					 * reap the old server main (reload
2611 					 * is the process parent of the old
2612 					 * main) and older server processes
2613 					 * that exit after a reload */
2614 					log_msg(LOG_WARNING,
2615 					       "process %d terminated with status %d",
2616 					       (int) child_pid, status);
2617 				}
2618 			}
2619 			if (child_pid == -1) {
2620 				if (errno == EINTR) {
2621 					continue;
2622 				}
2623 				if (errno != ECHILD)
2624 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2625 			}
2626 			if (nsd->mode != NSD_RUN)
2627 				break;
2628 
2629 			/* timeout to collect processes, in case no SIGCHLD happens. */
2630 			timeout_spec.tv_sec = 60;
2631 			timeout_spec.tv_nsec = 0;
2632 
2633 			/* listen on ports, timeout for collecting terminated children */
2634 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2635 				if (errno != EINTR) {
2636 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2637 				}
2638 			}
2639 			if(nsd->restart_children) {
2640 				restart_child_servers(nsd, server_region, netio,
2641 					&nsd->xfrd_listener->fd);
2642 				nsd->restart_children = 0;
2643 			}
2644 			if(nsd->reload_failed) {
2645 				sig_atomic_t cmd = NSD_RELOAD_FAILED;
2646 				pid_t mypid;
2647 				nsd->reload_failed = 0;
2648 				log_msg(LOG_WARNING,
2649 				       "Reload process %d failed, continuing with old database",
2650 				       (int) reload_pid);
2651 				reload_pid = -1;
2652 				if(reload_listener.fd != -1) close(reload_listener.fd);
2653 				netio_remove_handler(netio, &reload_listener);
2654 				reload_listener.fd = -1;
2655 				reload_listener.event_types = NETIO_EVENT_NONE;
2656 				task_process_sync(nsd->task[nsd->mytask]);
2657 				/* inform xfrd reload attempt ended */
2658 				if(!write_socket(nsd->xfrd_listener->fd,
2659 					&cmd, sizeof(cmd))) {
2660 					log_msg(LOG_ERR, "problems "
2661 					  "sending SOAEND to xfrd: %s",
2662 					  strerror(errno));
2663 				}
2664 				mypid = getpid();
2665 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2666 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2667 						strerror(errno));
2668 				}
2669 			}
2670 
2671 			break;
2672 		case NSD_RELOAD_REQ: {
2673 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2674 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2675 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2676 				"main: ipc send reload_req to xfrd"));
2677 			if(!write_socket(nsd->xfrd_listener->fd,
2678 				&cmd, sizeof(cmd))) {
2679 				log_msg(LOG_ERR, "server_main: could not send "
2680 				"reload_req to xfrd: %s", strerror(errno));
2681 			}
2682 			nsd->mode = NSD_RUN;
2683 			} break;
2684 		case NSD_RELOAD:
2685 			/* Continue to run nsd after reload */
2686 			nsd->mode = NSD_RUN;
2687 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2688 			if (reload_pid != -1) {
2689 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2690 				       (int) reload_pid);
2691 				break;
2692 			}
2693 
2694 			/* switch mytask to keep track of who owns the task */
2695 			nsd->mytask = 1 - nsd->mytask;
2696 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2697 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2698 				reload_pid = -1;
2699 				break;
2700 			}
2701 
2702 			/* Do actual reload */
2703 			reload_pid = fork();
2704 			switch (reload_pid) {
2705 			case -1:
2706 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2707 				break;
2708 			default:
2709 				/* PARENT */
2710 				close(reload_sockets[0]);
2711 				server_reload(nsd, server_region, netio,
2712 					reload_sockets[1]);
2713 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2714 				close(reload_sockets[1]);
2715 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2716 				/* drop stale xfrd ipc data */
2717 				((struct ipc_handler_conn_data*)nsd->
2718 					xfrd_listener->user_data)
2719 					->conn->is_reading = 0;
2720 				reload_pid = -1;
2721 				reload_listener.fd = -1;
2722 				reload_listener.event_types = NETIO_EVENT_NONE;
2723 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2724 				break;
2725 			case 0:
2726 				/* CHILD */
2727 				/* server_main keeps running until NSD_QUIT_SYNC
2728 				 * is received from reload. */
2729 				close(reload_sockets[1]);
2730 				reload_listener.fd = reload_sockets[0];
2731 				reload_listener.timeout = NULL;
2732 				reload_listener.user_data = nsd;
2733 				reload_listener.event_types = NETIO_EVENT_READ;
2734 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2735 				netio_add_handler(netio, &reload_listener);
2736 				reload_pid = getppid();
2737 				break;
2738 			}
2739 			break;
2740 		case NSD_QUIT_SYNC:
2741 			/* synchronisation of xfrd, parent and reload */
2742 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2743 				sig_atomic_t cmd = NSD_RELOAD;
2744 				/* stop xfrd ipc writes in progress */
2745 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2746 					"main: ipc send indication reload"));
2747 				if(!write_socket(nsd->xfrd_listener->fd,
2748 					&cmd, sizeof(cmd))) {
2749 					log_msg(LOG_ERR, "server_main: could not send reload "
2750 					"indication to xfrd: %s", strerror(errno));
2751 				}
2752 				/* wait for ACK from xfrd */
2753 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2754 				nsd->quit_sync_done = 1;
2755 			}
2756 			nsd->mode = NSD_RUN;
2757 			break;
2758 		case NSD_QUIT:
2759 			/* silent shutdown during reload */
2760 			if(reload_listener.fd != -1) {
2761 				/* acknowledge the quit, to sync reload that we will really quit now */
2762 				sig_atomic_t cmd = NSD_RELOAD;
2763 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2764 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2765 					log_msg(LOG_ERR, "server_main: "
2766 						"could not ack quit: %s", strerror(errno));
2767 				}
2768 #ifdef BIND8_STATS
2769 				parent_send_stats(nsd, reload_listener.fd);
2770 #endif /* BIND8_STATS */
2771 				close(reload_listener.fd);
2772 			}
2773 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2774 			/* only quit children after xfrd has acked */
2775 			send_children_quit(nsd);
2776 
2777 #ifdef MEMCLEAN /* OS collects memory pages */
2778 			region_destroy(server_region);
2779 #endif
2780 			server_shutdown(nsd);
2781 
2782 			/* NOTREACHED */
2783 			break;
2784 		case NSD_SHUTDOWN:
2785 			break;
2786 		case NSD_REAP_CHILDREN:
2787 			/* continue; wait for child in run loop */
2788 			nsd->mode = NSD_RUN;
2789 			break;
2790 		case NSD_STATS:
2791 #ifdef BIND8_STATS
2792 			set_children_stats(nsd);
2793 #endif
2794 			nsd->mode = NSD_RUN;
2795 			break;
2796 		default:
2797 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2798 			nsd->mode = NSD_RUN;
2799 			break;
2800 		}
2801 	}
2802 	log_msg(LOG_WARNING, "signal received, shutting down...");
2803 
2804 	/* close opened ports to avoid race with restart of nsd */
2805 	server_close_all_sockets(nsd->udp, nsd->ifs);
2806 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2807 #ifdef HAVE_SSL
2808 	daemon_remote_close(nsd->rc);
2809 #endif
2810 	send_children_quit_and_wait(nsd);
2811 
2812 	/* Unlink it if possible... */
2813 	unlinkpid(nsd->pidfile);
2814 	unlink(nsd->task[0]->fname);
2815 	unlink(nsd->task[1]->fname);
2816 #ifdef USE_ZONE_STATS
2817 	unlink(nsd->zonestatfname[0]);
2818 	unlink(nsd->zonestatfname[1]);
2819 #endif
2820 #ifdef USE_DNSTAP
2821 	dt_collector_close(nsd->dt_collector, nsd);
2822 #endif
2823 
2824 	if(reload_listener.fd != -1) {
2825 		sig_atomic_t cmd = NSD_QUIT;
2826 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2827 			"main: ipc send quit to reload-process"));
2828 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2829 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2830 				strerror(errno));
2831 		}
2832 		fsync(reload_listener.fd);
2833 		close(reload_listener.fd);
2834 		/* wait for reload to finish processing */
2835 		while(1) {
2836 			if(waitpid(reload_pid, NULL, 0) == -1) {
2837 				if(errno == EINTR) continue;
2838 				if(errno == ECHILD) break;
2839 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2840 					(int)reload_pid, strerror(errno));
2841 			}
2842 			break;
2843 		}
2844 	}
2845 	if(nsd->xfrd_listener->fd != -1) {
2846 		/* complete quit, stop xfrd */
2847 		sig_atomic_t cmd = NSD_QUIT;
2848 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2849 			"main: ipc send quit to xfrd"));
2850 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2851 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2852 				strerror(errno));
2853 		}
2854 		fsync(nsd->xfrd_listener->fd);
2855 		close(nsd->xfrd_listener->fd);
2856 		(void)kill(nsd->pid, SIGTERM);
2857 	}
2858 
2859 #ifdef MEMCLEAN /* OS collects memory pages */
2860 	region_destroy(server_region);
2861 #endif
2862 	/* write the nsd.db to disk, wait for it to complete */
2863 	udb_base_sync(nsd->db->udb, 1);
2864 	udb_base_close(nsd->db->udb);
2865 	server_shutdown(nsd);
2866 }
2867 
2868 static query_state_type
2869 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
2870 {
2871 	return query_process(query, nsd, now_p);
2872 }
2873 
2874 static query_state_type
2875 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
2876 {
2877 #ifdef RATELIMIT
2878 	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
2879 		if(query->edns.cookie_status != COOKIE_VALID
2880 		&& query->edns.cookie_status != COOKIE_VALID_REUSE
2881 		&& rrl_process_query(query))
2882 			return rrl_slip(query);
2883 		else	return QUERY_PROCESSED;
2884 	}
2885 	return QUERY_DISCARDED;
2886 #else
2887 	return query_process(query, nsd, now_p);
2888 #endif
2889 }
2890 
2891 const char*
2892 nsd_event_vs(void)
2893 {
2894 #ifdef USE_MINI_EVENT
2895 	return "";
2896 #else
2897 	return event_get_version();
2898 #endif
2899 }
2900 
2901 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2902 static const char* ub_ev_backend2str(int b)
2903 {
2904 	switch(b) {
2905 	case EVBACKEND_SELECT:	return "select";
2906 	case EVBACKEND_POLL:	return "poll";
2907 	case EVBACKEND_EPOLL:	return "epoll";
2908 	case EVBACKEND_KQUEUE:	return "kqueue";
2909 	case EVBACKEND_DEVPOLL: return "devpoll";
2910 	case EVBACKEND_PORT:	return "evport";
2911 	}
2912 	return "unknown";
2913 }
2914 #endif
2915 
2916 const char*
2917 nsd_event_method(void)
2918 {
2919 #ifdef USE_MINI_EVENT
2920 	return "select";
2921 #else
2922 	struct event_base* b = nsd_child_event_base();
2923 	const char* m = "?";
2924 #  ifdef EV_FEATURE_BACKENDS
2925 	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
2926 #  elif defined(HAVE_EVENT_BASE_GET_METHOD)
2927 	m = event_base_get_method(b);
2928 #  endif
2929 #  ifdef MEMCLEAN
2930 	event_base_free(b);
2931 #  endif
2932 	return m;
2933 #endif
2934 }
2935 
2936 struct event_base*
2937 nsd_child_event_base(void)
2938 {
2939 	struct event_base* base;
2940 #ifdef USE_MINI_EVENT
2941 	static time_t secs;
2942 	static struct timeval now;
2943 	base = event_init(&secs, &now);
2944 #else
2945 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2946 	/* libev */
2947 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2948 #  else
2949 	/* libevent */
2950 #    ifdef HAVE_EVENT_BASE_NEW
2951 	base = event_base_new();
2952 #    else
2953 	base = event_init();
2954 #    endif
2955 #  endif
2956 #endif
2957 	return base;
2958 }
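/*
 * Editor's usage sketch: each per-process loop creates its base this way
 * and must check for NULL, as server_child() does below.
 */
#if 0
	struct event_base* base = nsd_child_event_base();
	if(!base) {
		log_msg(LOG_ERR, "could not create event base");
		exit(1);
	}
#endif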
2959 
2960 static void
2961 add_udp_handler(
2962 	struct nsd *nsd,
2963 	struct nsd_socket *sock,
2964 	struct udp_handler_data *data)
2965 {
2966 	struct event *handler = &data->event;
2967 
2968 	data->nsd = nsd;
2969 	data->socket = sock;
2970 
2971 	memset(handler, 0, sizeof(*handler));
2972 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2973 	if(event_base_set(nsd->event_base, handler) != 0)
2974 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2975 	if(event_add(handler, NULL) != 0)
2976 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2977 }
2978 
2979 void
2980 add_tcp_handler(
2981 	struct nsd *nsd,
2982 	struct nsd_socket *sock,
2983 	struct tcp_accept_handler_data *data)
2984 {
2985 	struct event *handler = &data->event;
2986 
2987 	data->nsd = nsd;
2988 	data->socket = sock;
2989 
2990 #ifdef HAVE_SSL
2991 	if (nsd->tls_ctx &&
2992 	    nsd->options->tls_port &&
2993 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2994 	{
2995 		data->tls_accept = 1;
2996 		if(verbosity >= 2) {
2997 			char buf[48];
2998 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
2999 			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
3000 		}
3001 	} else {
3002 		data->tls_accept = 0;
3003 	}
3004 #endif
3005 
3006 	memset(handler, 0, sizeof(*handler));
3007 	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
3008 	if(event_base_set(nsd->event_base, handler) != 0)
3009 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
3010 	if(event_add(handler, NULL) != 0)
3011 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
3012 	data->event_added = 1;
3013 }
3014 
3015 /*
3016  * Serve DNS requests to verifiers (short-lived)
3017  */
3018 void server_verify(struct nsd *nsd, int cmdsocket)
3019 {
3020 	size_t size = 0;
3021 	struct event cmd_event, signal_event, exit_event;
3022 	struct zone *zone;
3023 
3024 	assert(nsd != NULL);
3025 
3026 	zone = verify_next_zone(nsd, NULL);
3027 	if(zone == NULL)
3028 		return;
3029 
3030 	nsd->server_region = region_create(xalloc, free);
3031 	nsd->event_base = nsd_child_event_base();
3032 
3033 	nsd->next_zone_to_verify = zone;
3034 	nsd->verifier_count = 0;
3035 	nsd->verifier_limit = nsd->options->verifier_count;
3036 	size = sizeof(struct verifier) * nsd->verifier_limit;
3037 	if(pipe(nsd->verifier_pipe) == -1) log_msg(LOG_ERR, "verify: cannot create pipe: %s", strerror(errno));
3038 	fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC);
3039 	fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC);
3040 	nsd->verifiers = region_alloc_zero(nsd->server_region, size);
3041 
3042 	for(size_t i = 0; i < nsd->verifier_limit; i++) {
3043 		nsd->verifiers[i].nsd = nsd;
3044 		nsd->verifiers[i].zone = NULL;
3045 		nsd->verifiers[i].pid = -1;
3046 		nsd->verifiers[i].output_stream.fd = -1;
3047 		nsd->verifiers[i].output_stream.priority = LOG_INFO;
3048 		nsd->verifiers[i].error_stream.fd = -1;
3049 		nsd->verifiers[i].error_stream.priority = LOG_ERR;
3050 	}
3051 
3052 	event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd);
3053 	if(event_base_set(nsd->event_base, &cmd_event) != 0 ||
3054 	   event_add(&cmd_event, NULL) != 0)
3055 	{
3056 		log_msg(LOG_ERR, "verify: could not add command event");
3057 		goto fail;
3058 	}
3059 
3060 	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd);
3061 	if(event_base_set(nsd->event_base, &signal_event) != 0 ||
3062 	   signal_add(&signal_event, NULL) != 0)
3063 	{
3064 		log_msg(LOG_ERR, "verify: could not add signal event");
3065 		goto fail;
3066 	}
3067 
3068 	event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd);
3069 	if(event_base_set(nsd->event_base, &exit_event) != 0 ||
3070 	   event_add(&exit_event, NULL) != 0)
3071 	{
3072 		log_msg(LOG_ERR, "verify: could not add exit event");
3073 		goto fail;
3074 	}
3075 
3076 	memset(msgs, 0, sizeof(msgs));
3077 	for (int i = 0; i < NUM_RECV_PER_SELECT; i++) {
3078 		queries[i] = query_create(nsd->server_region,
3079 			compressed_dname_offsets,
3080 			compression_table_size, compressed_dnames);
3081 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3082 		iovecs[i].iov_base = buffer_begin(queries[i]->packet);
3083 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3084 		msgs[i].msg_hdr.msg_iov = &iovecs[i];
3085 		msgs[i].msg_hdr.msg_iovlen = 1;
3086 		msgs[i].msg_hdr.msg_name = &queries[i]->addr;
3087 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3088 	}
3089 
3090 	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3091 		struct udp_handler_data *data;
3092 		data = region_alloc_zero(
3093 			nsd->server_region, sizeof(*data));
3094 		add_udp_handler(nsd, &nsd->verify_udp[i], data);
3095 	}
3096 
3097 	tcp_accept_handler_count = nsd->verify_ifs;
3098 	tcp_accept_handlers = region_alloc_array(nsd->server_region,
3099 		nsd->verify_ifs, sizeof(*tcp_accept_handlers));
3100 
3101 	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3102 		struct tcp_accept_handler_data *data;
3103 		data = &tcp_accept_handlers[i];
3104 		memset(data, 0, sizeof(*data));
3105 		add_tcp_handler(nsd, &nsd->verify_tcp[i], data);
3106 	}
3107 
3108 	while(nsd->next_zone_to_verify != NULL &&
3109 	      nsd->verifier_count < nsd->verifier_limit)
3110 	{
3111 		verify_zone(nsd, nsd->next_zone_to_verify);
3112 		nsd->next_zone_to_verify
3113 			= verify_next_zone(nsd, nsd->next_zone_to_verify);
3114 	}
3115 
3116 	/* short-lived main loop */
3117 	event_base_dispatch(nsd->event_base);
3118 
3119 	/* remove command and exit event handlers */
3120 	event_del(&exit_event);
3121 	event_del(&signal_event);
3122 	event_del(&cmd_event);
3123 
3124 	assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT);
3125 	assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT);
3126 fail:
3127 	event_base_free(nsd->event_base);
3128 	close(nsd->verifier_pipe[0]);
3129 	close(nsd->verifier_pipe[1]);
3130 	region_destroy(nsd->server_region);
3131 
3132 	nsd->event_base = NULL;
3133 	nsd->server_region = NULL;
3134 	nsd->verifier_limit = 0;
3135 	nsd->verifier_pipe[0] = -1;
3136 	nsd->verifier_pipe[1] = -1;
3137 	nsd->verifiers = NULL;
3138 }
3139 
3140 /*
3141  * Serve DNS requests.
3142  */
3143 void
3144 server_child(struct nsd *nsd)
3145 {
3146 	size_t i, from, numifs;
3147 	region_type *server_region = region_create(xalloc, free);
3148 	struct event_base* event_base = nsd_child_event_base();
3149 	sig_atomic_t mode;
3150 
3151 	if(!event_base) {
3152 		log_msg(LOG_ERR, "nsd server could not create event base");
3153 		exit(1);
3154 	}
3155 	nsd->event_base = event_base;
3156 	nsd->server_region = server_region;
3157 
3158 #ifdef RATELIMIT
3159 	rrl_init(nsd->this_child->child_num);
3160 #endif
3161 
3162 	assert(nsd->server_kind != NSD_SERVER_MAIN);
3163 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
3164 
3165 #ifdef HAVE_SETPROCTITLE
3166 	setproctitle("server %d", nsd->this_child->child_num + 1);
3167 #endif
3168 #ifdef HAVE_CPUSET_T
3169 	if(nsd->use_cpu_affinity) {
3170 		set_cpu_affinity(nsd->this_child->cpuset);
3171 	}
3172 #endif
3173 
3174 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
3175 		server_close_all_sockets(nsd->tcp, nsd->ifs);
3176 	}
3177 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
3178 		server_close_all_sockets(nsd->udp, nsd->ifs);
3179 	}
3180 
3181 	if (nsd->this_child->parent_fd != -1) {
3182 		struct event *handler;
3183 		struct ipc_handler_conn_data* user_data =
3184 			(struct ipc_handler_conn_data*)region_alloc(
3185 			server_region, sizeof(struct ipc_handler_conn_data));
3186 		user_data->nsd = nsd;
3187 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
3188 
3189 		handler = (struct event*) region_alloc(
3190 			server_region, sizeof(*handler));
3191 		memset(handler, 0, sizeof(*handler));
3192 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
3193 			EV_READ, child_handle_parent_command, user_data);
3194 		if(event_base_set(event_base, handler) != 0)
3195 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
3196 		if(event_add(handler, NULL) != 0)
3197 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
3198 	}
3199 
3200 	if(nsd->reuseport) {
3201 		numifs = nsd->ifs / nsd->reuseport;
3202 		from = numifs * nsd->this_child->child_num;
3203 		if(from+numifs > nsd->ifs) { /* should not happen */
3204 			from = 0;
3205 			numifs = nsd->ifs;
3206 		}
3207 	} else {
3208 		from = 0;
3209 		numifs = nsd->ifs;
3210 	}
3211 
3212 	if (nsd->server_kind & NSD_SERVER_UDP) {
3213 		int child = nsd->this_child->child_num;
3214 		memset(msgs, 0, sizeof(msgs));
3215 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3216 			queries[i] = query_create(server_region,
3217 				compressed_dname_offsets,
3218 				compression_table_size, compressed_dnames);
3219 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3220 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
3221 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
3222 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
3223 			msgs[i].msg_hdr.msg_iovlen  = 1;
3224 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
3225 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3226 		}
3227 
3228 		for (i = 0; i < nsd->ifs; i++) {
3229 			int listen;
3230 			struct udp_handler_data *data;
3231 
3232 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
3233 
3234 			if(i >= from && i < (from + numifs) && listen) {
3235 				data = region_alloc_zero(
3236 					nsd->server_region, sizeof(*data));
3237 				add_udp_handler(nsd, &nsd->udp[i], data);
3238 			} else {
3239 				/* close sockets intended for other servers */
3240 				server_close_socket(&nsd->udp[i]);
3241 			}
3242 		}
3243 	}
3244 
3245 	/*
3246 	 * Keep track of all the TCP accept handlers so we can enable
3247 	 * and disable them based on the current number of active TCP
3248 	 * connections.
3249 	 */
3250 	if (nsd->server_kind & NSD_SERVER_TCP) {
3251 		int child = nsd->this_child->child_num;
3252 		tcp_accept_handler_count = numifs;
3253 		tcp_accept_handlers = region_alloc_array(server_region,
3254 			numifs, sizeof(*tcp_accept_handlers));
3255 
3256 		for (i = 0; i < nsd->ifs; i++) {
3257 			int listen;
3258 			struct tcp_accept_handler_data *data;
3259 
3260 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
3261 
3262 			if(i >= from && i < (from + numifs) && listen) {
3263 				data = &tcp_accept_handlers[i-from];
3264 				memset(data, 0, sizeof(*data));
3265 				add_tcp_handler(nsd, &nsd->tcp[i], data);
3266 			} else {
3267 				/* close sockets intended for other servers */
3268 				/*
3269 				 * uncomment this once tcp servers are no
3270 				 * longer copied in the tcp fd copy line
3271 				 * in server_init().
3272 				server_close_socket(&nsd->tcp[i]);
3273 				*/
3274 				/* close sockets not meant for this server */
3275 				if(!listen)
3276 					server_close_socket(&nsd->tcp[i]);
3277 			}
3278 		}
3279 	} else {
3280 		tcp_accept_handler_count = 0;
3281 	}
3282 
3283 	/* The main loop... */
3284 	while ((mode = nsd->mode) != NSD_QUIT) {
3285 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
3286 
3287 		/* Do we need to do the statistics... */
3288 		if (mode == NSD_STATS) {
3289 #ifdef BIND8_STATS
3290 			int p = nsd->st.period;
3291 			nsd->st.period = 1; /* force stats printout */
3292 			/* Dump the statistics */
3293 			bind8_stats(nsd);
3294 			nsd->st.period = p;
3295 #else /* !BIND8_STATS */
3296 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3297 #endif /* BIND8_STATS */
3298 
3299 			nsd->mode = NSD_RUN;
3300 		}
3301 		else if (mode == NSD_REAP_CHILDREN) {
3302 			/* got signal, notify parent. parent reaps terminated children. */
3303 			if (nsd->this_child->parent_fd != -1) {
3304 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3305 				if (write(nsd->this_child->parent_fd,
3306 				    &parent_notify,
3307 				    sizeof(parent_notify)) == -1)
3308 				{
3309 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3310 						(int) nsd->this_child->pid, strerror(errno));
3311 				}
3312 			} else /* no parent, so reap 'em */
3313 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3314 			nsd->mode = NSD_RUN;
3315 		}
3316 		else if(mode == NSD_RUN) {
3317 			/* Wait for a query... */
3318 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3319 				if (errno != EINTR) {
3320 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3321 					break;
3322 				}
3323 			}
3324 		} else if(mode == NSD_QUIT) {
3325 			/* ignore here, quit */
3326 		} else {
3327 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
3328 				(int)mode);
3329 			nsd->mode = NSD_RUN;
3330 		}
3331 	}
3332 
3333 	service_remaining_tcp(nsd);
3334 #ifdef	BIND8_STATS
3335 	bind8_stats(nsd);
3336 #endif /* BIND8_STATS */
3337 
3338 #ifdef MEMCLEAN /* OS collects memory pages */
3339 #ifdef RATELIMIT
3340 	rrl_deinit(nsd->this_child->child_num);
3341 #endif
3342 	event_base_free(event_base);
3343 	region_destroy(server_region);
3344 #endif
3345 	server_shutdown(nsd);
3346 }
3347 
3348 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3349 {
3350 	int* timed_out = (int*)arg;
3351 	assert(event & EV_TIMEOUT); (void)event;
3352 	/* wake up the service-remaining-tcp loop; note the event is no
3353 	 * longer registered */
3354 	*timed_out = 1;
3355 }
3356 
3357 void
3358 service_remaining_tcp(struct nsd* nsd)
3359 {
3360 	struct tcp_handler_data* p;
3361 	struct event_base* event_base;
3362 	/* check if it is needed */
3363 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3364 		return;
3365 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3366 #ifdef USE_DNSTAP
3367 	/* destroy the dnstap collector; we cannot write to it because
3368 	 * the new child process, or the child process after that, is
3369 	 * using the file descriptor. */
3370 	dt_collector_destroy(nsd->dt_collector, nsd);
3371 	nsd->dt_collector = NULL;
3372 #endif
3373 	/* setup event base */
3374 	event_base = nsd_child_event_base();
3375 	if(!event_base) {
3376 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3377 		return;
3378 	}
3379 	/* register tcp connections */
3380 	for(p = tcp_active_list; p != NULL; p = p->next) {
3381 		struct timeval timeout;
3382 		int fd = p->event.ev_fd;
3383 #ifdef USE_MINI_EVENT
3384 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3385 #else
3386 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3387 #endif
3388 		void (*fn)(int, short, void*);
3389 #ifdef HAVE_SSL
3390 		if(p->tls) {
3391 			if((event&EV_READ))
3392 				fn = handle_tls_reading;
3393 			else	fn = handle_tls_writing;
3394 		} else {
3395 #endif
3396 			if((event&EV_READ))
3397 				fn = handle_tcp_reading;
3398 			else	fn = handle_tcp_writing;
3399 #ifdef HAVE_SSL
3400 		}
3401 #endif
3402 
3403 		p->tcp_no_more_queries = 1;
3404 		/* set timeout to 1/10 second */
3405 		if(p->tcp_timeout > 100)
3406 			p->tcp_timeout = 100;
3407 		timeout.tv_sec = p->tcp_timeout / 1000;
3408 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3409 		event_del(&p->event);
3410 		memset(&p->event, 0, sizeof(p->event));
3411 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3412 			fn, p);
3413 		if(event_base_set(event_base, &p->event) != 0)
3414 			log_msg(LOG_ERR, "event base set failed");
3415 		if(event_add(&p->event, &timeout) != 0)
3416 			log_msg(LOG_ERR, "event add failed");
3417 	}
3418 
3419 	/* handle it */
3420 	while(nsd->current_tcp_count > 0) {
3421 		mode_t m = server_signal_mode(nsd);
3422 		struct event timeout;
3423 		struct timeval tv;
3424 		int timed_out = 0;
3425 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3426 			m == NSD_REAP_CHILDREN) {
3427 			/* quit */
3428 			break;
3429 		}
3430 		/* timer */
3431 		/* have to do something every second */
3432 		tv.tv_sec = 1;
3433 		tv.tv_usec = 0;
3434 		memset(&timeout, 0, sizeof(timeout));
3435 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3436 			&timed_out);
3437 		if(event_base_set(event_base, &timeout) != 0)
3438 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3439 		if(event_add(&timeout, &tv) != 0)
3440 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3441 
3442 		/* service loop */
3443 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3444 			if (errno != EINTR) {
3445 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3446 				break;
3447 			}
3448 		}
3449 		if(!timed_out) {
3450 			event_del(&timeout);
3451 		} else {
3452 			/* timed out, quit */
3453 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3454 			break;
3455 		}
3456 	}
3457 #ifdef MEMCLEAN
3458 	event_base_free(event_base);
3459 #endif
3460 	/* continue to quit after return */
3461 }
3462 
3463 /* Implement recvmmsg and sendmmsg if the platform does not provide them.
3464  * These wrappers are always used, even if nonblocking operations are broken,
3465  * in which case NUM_RECV_PER_SELECT is defined to 1 (one).
3466  */
3467 #if defined(HAVE_RECVMMSG)
3468 #define nsd_recvmmsg recvmmsg
3469 #else /* !HAVE_RECVMMSG */
3470 
3471 static int
3472 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3473              int flags, struct timespec *timeout)
3474 {
3475 	unsigned int vpos = 0;
3476 	ssize_t rcvd;
3477 
3478 	/* timeout is ignored, ensure caller does not expect it to work */
3479 	assert(timeout == NULL); (void)timeout;
3480 
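	/* emulate recvmmsg() with one recvfrom() call per msgvec entry;
	 * stop at the first failure and report the error on a later call */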
3481 	while(vpos < vlen) {
3482 		rcvd = recvfrom(sockfd,
3483 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3484 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3485 		                flags,
3486 		                msgvec[vpos].msg_hdr.msg_name,
3487 		               &msgvec[vpos].msg_hdr.msg_namelen);
3488 		if(rcvd < 0) {
3489 			break;
3490 		} else {
3491 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3492 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3493 			vpos++;
3494 		}
3495 	}
3496 
3497 	if(vpos) {
3498 		/* error will be picked up next time */
3499 		return (int)vpos;
3500 	} else if(errno == 0) {
3501 		return 0;
3502 	} else if(errno == EAGAIN) {
3503 		return 0;
3504 	}
3505 
3506 	return -1;
3507 }
3508 #endif /* HAVE_RECVMMSG */
3509 
3510 #ifdef HAVE_SENDMMSG
3511 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3512 #else /* !HAVE_SENDMMSG */
3513 
3514 static int
3515 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3516 {
3517 	unsigned int vpos = 0;
3518 	ssize_t snd;
3519 
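	/* emulate sendmmsg() with one sendto() call per message; each
	 * message must carry exactly one iovec (see the assert below) */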
3520 	while(vpos < vlen) {
3521 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3522 		snd = sendto(sockfd,
3523 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3524 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3525 		             flags,
3526 		             msgvec[vpos].msg_hdr.msg_name,
3527 		             msgvec[vpos].msg_hdr.msg_namelen);
3528 		if(snd < 0) {
3529 			break;
3530 		} else {
3531 			msgvec[vpos].msg_len = (unsigned int)snd;
3532 			vpos++;
3533 		}
3534 	}
3535 
3536 	if(vpos) {
3537 		return (int)vpos;
3538 	} else if(errno == 0) {
3539 		return 0;
3540 	}
3541 
3542 	return -1;
3543 }
3544 #endif /* HAVE_SENDMMSG */
3545 
3546 static int
3547 port_is_zero(
3548 #ifdef INET6
3549         struct sockaddr_storage *addr
3550 #else
3551         struct sockaddr_in *addr
3552 #endif
3553 	)
3554 {
3555 #ifdef INET6
3556 	if(addr->ss_family == AF_INET6) {
3557 		return (((struct sockaddr_in6 *)addr)->sin6_port) == 0;
3558 	} else if(addr->ss_family == AF_INET) {
3559 		return (((struct sockaddr_in *)addr)->sin_port) == 0;
3560 	}
3561 	return 0;
3562 #else
3563 	if(addr->sin_family == AF_INET) {
3564 		return addr->sin_port == 0;
3565 	}
3566 	return 0;
3567 #endif
3568 }
3569 
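/*
 * Handle an incoming UDP event: receive a batch of packets with
 * (nsd_)recvmmsg, process and answer each query, then transmit the
 * answers with (nsd_)sendmmsg; dropped queries are swapped to the
 * tail of the batch so that only answered packets are sent.
 */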
3570 static void
3571 handle_udp(int fd, short event, void* arg)
3572 {
3573 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3574 	int received, sent, recvcount, i;
3575 	struct query *q;
3576 	uint32_t now = 0;
3577 
3578 	if (!(event & EV_READ)) {
3579 		return;
3580 	}
3581 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3582 	/* this printf strangely gave a performance increase on Linux */
3583 	/* printf("recvcount %d \n", recvcount); */
3584 	if (recvcount == -1) {
3585 		if (errno != EAGAIN && errno != EINTR) {
3586 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3587 			STATUP(data->nsd, rxerr);
3588 			/* No zone statup */
3589 		}
3590 		/* Simply no data available */
3591 		return;
3592 	}
3593 	for (i = 0; i < recvcount; i++) {
3594 	loopstart:
3595 		received = msgs[i].msg_len;
3596 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
3597 		q = queries[i];
3598 		if (received == -1) {
3599 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3600 #if defined(HAVE_RECVMMSG)
3601 				msgs[i].msg_hdr.msg_flags
3602 #else
3603 				errno
3604 #endif
3605 				));
3606 			STATUP(data->nsd, rxerr);
3607 			/* No zone statup */
3608 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3609 			iovecs[i].iov_len = buffer_remaining(q->packet);
3610 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3611 			goto swap_drop;
3612 		}
3613 
3614 		/* Account... */
3615 #ifdef BIND8_STATS
3616 		if (data->socket->addr.ai_family == AF_INET) {
3617 			STATUP(data->nsd, qudp);
3618 		} else if (data->socket->addr.ai_family == AF_INET6) {
3619 			STATUP(data->nsd, qudp6);
3620 		}
3621 #endif
3622 
3623 		buffer_skip(q->packet, received);
3624 		buffer_flip(q->packet);
3625 #ifdef USE_DNSTAP
3626 		/*
3627 		 * sending UDP-query with server address (local) and client address to dnstap process
3628 		 */
3629 		log_addr("query from client", &q->addr);
3630 		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3631 		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->addr, q->addrlen,
3632 			q->tcp, q->packet);
3633 #endif /* USE_DNSTAP */
3634 
3635 		/* Process and answer the query... */
3636 		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
3637 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3638 				STATUP(data->nsd, nona);
3639 				ZTATUP(data->nsd, q->zone, nona);
3640 			}
3641 
3642 #ifdef USE_ZONE_STATS
3643 			if (data->socket->addr.ai_family == AF_INET) {
3644 				ZTATUP(data->nsd, q->zone, qudp);
3645 			} else if (data->socket->addr.ai_family == AF_INET6) {
3646 				ZTATUP(data->nsd, q->zone, qudp6);
3647 			}
3648 #endif
3649 
3650 			/* Add EDNS0 and TSIG info if necessary.  */
3651 			query_add_optional(q, data->nsd, &now);
3652 
3653 			buffer_flip(q->packet);
3654 			iovecs[i].iov_len = buffer_remaining(q->packet);
3655 #ifdef BIND8_STATS
3656 			/* Account the rcode & TC... */
3657 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3658 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3659 			if (TC(q->packet)) {
3660 				STATUP(data->nsd, truncated);
3661 				ZTATUP(data->nsd, q->zone, truncated);
3662 			}
3663 #endif /* BIND8_STATS */
3664 #ifdef USE_DNSTAP
3665 			/*
3666 			 * sending UDP-response with server address (local) and client address to dnstap process
3667 			 */
3668 			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3669 			log_addr("response to client", &q->addr);
3670 			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
3671 				&q->addr, q->addrlen, q->tcp, q->packet,
3672 				q->zone);
3673 #endif /* USE_DNSTAP */
3674 		} else {
3675 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3676 			iovecs[i].iov_len = buffer_remaining(q->packet);
3677 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3678 		swap_drop:
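			/* move the dropped query to the tail of the
			 * msgs/iovecs/queries arrays so that the send loop
			 * below only transmits answered packets */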
3679 			STATUP(data->nsd, dropped);
3680 			ZTATUP(data->nsd, q->zone, dropped);
3681 			if(i != recvcount-1) {
3682 				/* swap with last and decrease recvcount */
3683 				struct mmsghdr mtmp = msgs[i];
3684 				struct iovec iotmp = iovecs[i];
3685 				recvcount--;
3686 				msgs[i] = msgs[recvcount];
3687 				iovecs[i] = iovecs[recvcount];
3688 				queries[i] = queries[recvcount];
3689 				msgs[recvcount] = mtmp;
3690 				iovecs[recvcount] = iotmp;
3691 				queries[recvcount] = q;
3692 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3693 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3694 				goto loopstart;
3695 			} else { recvcount --; }
3696 		}
3697 	}
3698 
3699 	/* send until all are sent */
3700 	i = 0;
3701 	while(i<recvcount) {
3702 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3703 		if(sent == -1) {
3704 			if(errno == ENOBUFS ||
3705 #ifdef EWOULDBLOCK
3706 				errno == EWOULDBLOCK ||
3707 #endif
3708 				errno == EAGAIN) {
3709 				/* block to wait until send buffer avail */
3710 				int flag, errstore;
3711 				if((flag = fcntl(fd, F_GETFL)) == -1) {
3712 					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
3713 					flag = 0;
3714 				}
3715 				flag &= ~O_NONBLOCK;
3716 				if(fcntl(fd, F_SETFL, flag) == -1)
3717 					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
3718 				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3719 				errstore = errno;
3720 				flag |= O_NONBLOCK;
3721 				if(fcntl(fd, F_SETFL, flag) == -1)
3722 					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
3723 				if(sent != -1) {
3724 					i += sent;
3725 					continue;
3726 				}
3727 				errno = errstore;
3728 			}
3729 			if(errno == EINVAL) {
3730 				/* skip the invalid argument entry,
3731 				 * send the remaining packets in the list */
3732 				if(!(port_is_zero((void*)&queries[i]->addr) &&
3733 					verbosity < 3)) {
3734 					const char* es = strerror(errno);
3735 					char a[64];
3736 					addrport2str((void*)&queries[i]->addr, a, sizeof(a));
3737 					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3738 				}
3739 				i += 1;
3740 				continue;
3741 			}
3742 			/* don't log transient network full errors, unless
3743 			 * on higher verbosity */
3744 			if(!(errno == ENOBUFS && verbosity < 1) &&
3745 #ifdef EWOULDBLOCK
3746 			   errno != EWOULDBLOCK &&
3747 #endif
3748 			   errno != EAGAIN) {
3749 				const char* es = strerror(errno);
3750 				char a[64];
3751 				addrport2str((void*)&queries[i]->addr, a, sizeof(a));
3752 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3753 			}
3754 #ifdef BIND8_STATS
3755 			data->nsd->st.txerr += recvcount-i;
3756 #endif /* BIND8_STATS */
3757 			break;
3758 		}
3759 		i += sent;
3760 	}
3761 	for(i=0; i<recvcount; i++) {
3762 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3763 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3764 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3765 	}
3766 }
3767 
3768 #ifdef HAVE_SSL
3769 /*
3770  * Setup an event for the tcp handler.
3771  */
3772 static void
3773 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3774        int fd, short event)
3775 {
3776 	struct timeval timeout;
3777 	struct event_base* ev_base;
3778 
3779 	timeout.tv_sec = data->nsd->tcp_timeout;
3780 	timeout.tv_usec = 0L;
3781 
3782 	ev_base = data->event.ev_base;
3783 	event_del(&data->event);
3784 	memset(&data->event, 0, sizeof(data->event));
3785 	event_set(&data->event, fd, event, fn, data);
3786 	if(event_base_set(ev_base, &data->event) != 0)
3787 		log_msg(LOG_ERR, "event base set failed");
3788 	if(event_add(&data->event, &timeout) != 0)
3789 		log_msg(LOG_ERR, "event add failed");
3790 }
3791 #endif /* HAVE_SSL */
3792 
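/*
 * Tear down a TCP (or TLS) connection: remove its event, shut down
 * TLS if present, close the socket, unlink the handler from the
 * active list, re-enable the accept handlers when the connection
 * count drops below the maximum, and free the per-connection region.
 */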
3793 static void
3794 cleanup_tcp_handler(struct tcp_handler_data* data)
3795 {
3796 	event_del(&data->event);
3797 #ifdef HAVE_SSL
3798 	if(data->tls) {
3799 		SSL_shutdown(data->tls);
3800 		SSL_free(data->tls);
3801 		data->tls = NULL;
3802 	}
3803 #endif
3804 	close(data->event.ev_fd);
3805 	if(data->prev)
3806 		data->prev->next = data->next;
3807 	else	tcp_active_list = data->next;
3808 	if(data->next)
3809 		data->next->prev = data->prev;
3810 
3811 	/*
3812 	 * Enable the TCP accept handlers when the current number of
3813 	 * TCP connections is about to drop below the maximum number
3814 	 * of TCP connections.
3815 	 */
3816 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3817 		configure_handler_event_types(EV_READ|EV_PERSIST);
3818 		if(slowaccept) {
3819 			event_del(&slowaccept_event);
3820 			slowaccept = 0;
3821 		}
3822 	}
3823 	--data->nsd->current_tcp_count;
3824 	assert(data->nsd->current_tcp_count >= 0);
3825 
3826 	region_destroy(data->region);
3827 }
3828 
3829 static void
3830 handle_tcp_reading(int fd, short event, void* arg)
3831 {
3832 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3833 	ssize_t received;
3834 	struct event_base* ev_base;
3835 	struct timeval timeout;
3836 	uint32_t now = 0;
3837 
3838 	if ((event & EV_TIMEOUT)) {
3839 		/* Connection timed out.  */
3840 		cleanup_tcp_handler(data);
3841 		return;
3842 	}
3843 
3844 	if ((data->nsd->tcp_query_count > 0 &&
3845 		data->query_count >= data->nsd->tcp_query_count) ||
3846 		data->tcp_no_more_queries) {
3847 		/* No more queries allowed on this tcp connection. */
3848 		cleanup_tcp_handler(data);
3849 		return;
3850 	}
3851 
3852 	assert((event & EV_READ));
3853 
3854 	if (data->bytes_transmitted == 0) {
3855 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3856 	}
3857 
3858 	/*
3859 	 * Check if we received the leading packet length bytes yet.
3860 	 */
3861 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3862 		received = read(fd,
3863 				(char *) &data->query->tcplen
3864 				+ data->bytes_transmitted,
3865 				sizeof(uint16_t) - data->bytes_transmitted);
3866 		if (received == -1) {
3867 			if (errno == EAGAIN || errno == EINTR) {
3868 				/*
3869 				 * Read would block, wait until more
3870 				 * data is available.
3871 				 */
3872 				return;
3873 			} else {
3874 				char buf[48];
3875 				addr2str(&data->query->addr, buf, sizeof(buf));
3876 #ifdef ECONNRESET
3877 				if (verbosity >= 2 || errno != ECONNRESET)
3878 #endif /* ECONNRESET */
3879 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3880 				cleanup_tcp_handler(data);
3881 				return;
3882 			}
3883 		} else if (received == 0) {
3884 			/* EOF */
3885 			cleanup_tcp_handler(data);
3886 			return;
3887 		}
3888 
3889 		data->bytes_transmitted += received;
3890 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3891 			/*
3892 			 * Not done with the tcplen yet, wait for more
3893 			 * data to become available.
3894 			 */
3895 			return;
3896 		}
3897 
3898 		assert(data->bytes_transmitted == sizeof(uint16_t));
3899 
3900 		data->query->tcplen = ntohs(data->query->tcplen);
3901 
3902 		/*
3903 		 * Minimum query size is:
3904 		 *
3905 		 *     Size of the header (12)
3906 		 *   + Root domain name   (1)
3907 		 *   + Query class        (2)
3908 		 *   + Query type         (2)
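		 *   = 17 octets in total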
3909 		 */
3910 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3911 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3912 			cleanup_tcp_handler(data);
3913 			return;
3914 		}
3915 
3916 		if (data->query->tcplen > data->query->maxlen) {
3917 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3918 			cleanup_tcp_handler(data);
3919 			return;
3920 		}
3921 
3922 		buffer_set_limit(data->query->packet, data->query->tcplen);
3923 	}
3924 
3925 	assert(buffer_remaining(data->query->packet) > 0);
3926 
3927 	/* Read the (remaining) query data.  */
3928 	received = read(fd,
3929 			buffer_current(data->query->packet),
3930 			buffer_remaining(data->query->packet));
3931 	if (received == -1) {
3932 		if (errno == EAGAIN || errno == EINTR) {
3933 			/*
3934 			 * Read would block, wait until more data is
3935 			 * available.
3936 			 */
3937 			return;
3938 		} else {
3939 			char buf[48];
3940 			addr2str(&data->query->addr, buf, sizeof(buf));
3941 #ifdef ECONNRESET
3942 			if (verbosity >= 2 || errno != ECONNRESET)
3943 #endif /* ECONNRESET */
3944 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3945 			cleanup_tcp_handler(data);
3946 			return;
3947 		}
3948 	} else if (received == 0) {
3949 		/* EOF */
3950 		cleanup_tcp_handler(data);
3951 		return;
3952 	}
3953 
3954 	data->bytes_transmitted += received;
3955 	buffer_skip(data->query->packet, received);
3956 	if (buffer_remaining(data->query->packet) > 0) {
3957 		/*
3958 		 * Message not yet complete, wait for more data to
3959 		 * become available.
3960 		 */
3961 		return;
3962 	}
3963 
3964 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3965 
3966 	/* Account... */
3967 #ifdef BIND8_STATS
3968 #ifndef INET6
3969 	STATUP(data->nsd, ctcp);
3970 #else
3971 	if (data->query->addr.ss_family == AF_INET) {
3972 		STATUP(data->nsd, ctcp);
3973 	} else if (data->query->addr.ss_family == AF_INET6) {
3974 		STATUP(data->nsd, ctcp6);
3975 	}
3976 #endif
3977 #endif /* BIND8_STATS */
3978 
3979 	/* We have a complete query, process it.  */
3980 
3981 	/* tcp-query-count: handle query counter ++ */
3982 	data->query_count++;
3983 
3984 	buffer_flip(data->query->packet);
3985 #ifdef USE_DNSTAP
3986 	/*
3987 	 * and send TCP-query with found address (local) and client address to dnstap process
3988 	 */
3989 	log_addr("query from client", &data->query->addr);
3990 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3991 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
3992 		data->query->addrlen, data->query->tcp, data->query->packet);
3993 #endif /* USE_DNSTAP */
3994 	data->query_state = server_process_query(data->nsd, data->query, &now);
3995 	if (data->query_state == QUERY_DISCARDED) {
3996 		/* Drop the packet and the entire connection... */
3997 		STATUP(data->nsd, dropped);
3998 		ZTATUP(data->nsd, data->query->zone, dropped);
3999 		cleanup_tcp_handler(data);
4000 		return;
4001 	}
4002 
4003 #ifdef BIND8_STATS
4004 	if (RCODE(data->query->packet) == RCODE_OK
4005 	    && !AA(data->query->packet))
4006 	{
4007 		STATUP(data->nsd, nona);
4008 		ZTATUP(data->nsd, data->query->zone, nona);
4009 	}
4010 #endif /* BIND8_STATS */
4011 
4012 #ifdef USE_ZONE_STATS
4013 #ifndef INET6
4014 	ZTATUP(data->nsd, data->query->zone, ctcp);
4015 #else
4016 	if (data->query->addr.ss_family == AF_INET) {
4017 		ZTATUP(data->nsd, data->query->zone, ctcp);
4018 	} else if (data->query->addr.ss_family == AF_INET6) {
4019 		ZTATUP(data->nsd, data->query->zone, ctcp6);
4020 	}
4021 #endif
4022 #endif /* USE_ZONE_STATS */
4023 
4024 	query_add_optional(data->query, data->nsd, &now);
4025 
4026 	/* Switch to the tcp write handler.  */
4027 	buffer_flip(data->query->packet);
4028 	data->query->tcplen = buffer_remaining(data->query->packet);
4029 #ifdef BIND8_STATS
4030 	/* Account the rcode & TC... */
4031 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4032 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4033 	if (TC(data->query->packet)) {
4034 		STATUP(data->nsd, truncated);
4035 		ZTATUP(data->nsd, data->query->zone, truncated);
4036 	}
4037 #endif /* BIND8_STATS */
4038 #ifdef USE_DNSTAP
4039 	/*
4040 	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
4041 	 */
4042 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4043 	log_addr("response to client", &data->query->addr);
4044 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4045 		data->query->addrlen, data->query->tcp, data->query->packet,
4046 		data->query->zone);
4047 #endif /* USE_DNSTAP */
4048 	data->bytes_transmitted = 0;
4049 
4050 	timeout.tv_sec = data->tcp_timeout / 1000;
4051 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4052 
4053 	ev_base = data->event.ev_base;
4054 	event_del(&data->event);
4055 	memset(&data->event, 0, sizeof(data->event));
4056 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4057 		handle_tcp_reading, data);
4058 	if(event_base_set(ev_base, &data->event) != 0)
4059 		log_msg(LOG_ERR, "event base set tcpr failed");
4060 	if(event_add(&data->event, &timeout) != 0)
4061 		log_msg(LOG_ERR, "event add tcpr failed");
4062 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
4063 	handle_tcp_writing(fd, EV_WRITE, data);
4064 }
4065 
4066 static void
4067 handle_tcp_writing(int fd, short event, void* arg)
4068 {
4069 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4070 	ssize_t sent;
4071 	struct query *q = data->query;
4072 	struct timeval timeout;
4073 	struct event_base* ev_base;
4074 	uint32_t now = 0;
4075 
4076 	if ((event & EV_TIMEOUT)) {
4077 		/* Connection timed out.  */
4078 		cleanup_tcp_handler(data);
4079 		return;
4080 	}
4081 
4082 	assert((event & EV_WRITE));
4083 
4084 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
4085 		/* Writing the response packet length.  */
4086 		uint16_t n_tcplen = htons(q->tcplen);
4087 #ifdef HAVE_WRITEV
4088 		struct iovec iov[2];
4089 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
4090 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
4091 		iov[1].iov_base = buffer_begin(q->packet);
4092 		iov[1].iov_len = buffer_limit(q->packet);
4093 		sent = writev(fd, iov, 2);
4094 #else /* HAVE_WRITEV */
4095 		sent = write(fd,
4096 			     (const char *) &n_tcplen + data->bytes_transmitted,
4097 			     sizeof(n_tcplen) - data->bytes_transmitted);
4098 #endif /* HAVE_WRITEV */
4099 		if (sent == -1) {
4100 			if (errno == EAGAIN || errno == EINTR) {
4101 				/*
4102 				 * Write would block, wait until
4103 				 * socket becomes writable again.
4104 				 */
4105 				return;
4106 			} else {
4107 #ifdef ECONNRESET
4108 				if(verbosity >= 2 || errno != ECONNRESET)
4109 #endif /* ECONNRESET */
4110 #ifdef EPIPE
4111 				  if(verbosity >= 2 || errno != EPIPE)
4112 #endif /* EPIPE 'broken pipe' */
4113 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4114 				cleanup_tcp_handler(data);
4115 				return;
4116 			}
4117 		}
4118 
4119 		data->bytes_transmitted += sent;
4120 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
4121 			/*
4122 			 * Writing not complete, wait until socket
4123 			 * becomes writable again.
4124 			 */
4125 			return;
4126 		}
4127 
4128 #ifdef HAVE_WRITEV
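		/* with writev the length prefix went out together with the
		 * packet data; make sent count only the payload octets */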
4129 		sent -= sizeof(n_tcplen);
4130 		/* handle potential 'packet done' code */
4131 		goto packet_could_be_done;
4132 #endif
4133  	}
4134 
4135 	sent = write(fd,
4136 		     buffer_current(q->packet),
4137 		     buffer_remaining(q->packet));
4138 	if (sent == -1) {
4139 		if (errno == EAGAIN || errno == EINTR) {
4140 			/*
4141 			 * Write would block, wait until
4142 			 * socket becomes writable again.
4143 			 */
4144 			return;
4145 		} else {
4146 #ifdef ECONNRESET
4147 			if(verbosity >= 2 || errno != ECONNRESET)
4148 #endif /* ECONNRESET */
4149 #ifdef EPIPE
4150 				  if(verbosity >= 2 || errno != EPIPE)
4151 #endif /* EPIPE 'broken pipe' */
4152 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4153 			cleanup_tcp_handler(data);
4154 			return;
4155 		}
4156 	}
4157 
4158 	data->bytes_transmitted += sent;
4159 #ifdef HAVE_WRITEV
4160   packet_could_be_done:
4161 #endif
4162 	buffer_skip(q->packet, sent);
4163 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4164 		/*
4165 		 * Still more data to write when socket becomes
4166 		 * writable again.
4167 		 */
4168 		return;
4169 	}
4170 
4171 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4172 
4173 	if (data->query_state == QUERY_IN_AXFR ||
4174 		data->query_state == QUERY_IN_IXFR) {
4175 		/* Continue processing AXFR and writing back results.  */
4176 		buffer_clear(q->packet);
4177 		if(data->query_state == QUERY_IN_AXFR)
4178 			data->query_state = query_axfr(data->nsd, q, 0);
4179 		else data->query_state = query_ixfr(data->nsd, q);
4180 		if (data->query_state != QUERY_PROCESSED) {
4181 			query_add_optional(data->query, data->nsd, &now);
4182 
4183 			/* Reset data. */
4184 			buffer_flip(q->packet);
4185 			q->tcplen = buffer_remaining(q->packet);
4186 			data->bytes_transmitted = 0;
4187 			/* Reset timeout.  */
4188 			timeout.tv_sec = data->tcp_timeout / 1000;
4189 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4190 			ev_base = data->event.ev_base;
4191 			event_del(&data->event);
4192 			memset(&data->event, 0, sizeof(data->event));
4193 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4194 				handle_tcp_writing, data);
4195 			if(event_base_set(ev_base, &data->event) != 0)
4196 				log_msg(LOG_ERR, "event base set tcpw failed");
4197 			if(event_add(&data->event, &timeout) != 0)
4198 				log_msg(LOG_ERR, "event add tcpw failed");
4199 
4200 			/*
4201 			 * Write data if/when the socket is writable
4202 			 * again.
4203 			 */
4204 			return;
4205 		}
4206 	}
4207 
4208 	/*
4209 	 * Done sending, wait for the next request to arrive on the
4210 	 * TCP socket by installing the TCP read handler.
4211 	 */
4212 	if ((data->nsd->tcp_query_count > 0 &&
4213 		data->query_count >= data->nsd->tcp_query_count) ||
4214 		data->tcp_no_more_queries) {
4215 
4216 		(void) shutdown(fd, SHUT_WR);
4217 	}
4218 
4219 	data->bytes_transmitted = 0;
4220 
4221 	timeout.tv_sec = data->tcp_timeout / 1000;
4222 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4223 	ev_base = data->event.ev_base;
4224 	event_del(&data->event);
4225 	memset(&data->event, 0, sizeof(data->event));
4226 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4227 		handle_tcp_reading, data);
4228 	if(event_base_set(ev_base, &data->event) != 0)
4229 		log_msg(LOG_ERR, "event base set tcpw failed");
4230 	if(event_add(&data->event, &timeout) != 0)
4231 		log_msg(LOG_ERR, "event add tcpw failed");
4232 }
4233 
4234 #ifdef HAVE_SSL
4235 /** create SSL object and associate fd */
4236 static SSL*
4237 incoming_ssl_fd(SSL_CTX* ctx, int fd)
4238 {
4239 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
4240 	if(!ssl) {
4241 		log_crypto_err("could not SSL_new");
4242 		return NULL;
4243 	}
4244 	SSL_set_accept_state(ssl);
4245 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
4246 	if(!SSL_set_fd(ssl, fd)) {
4247 		log_crypto_err("could not SSL_set_fd");
4248 		SSL_free(ssl);
4249 		return NULL;
4250 	}
4251 	return ssl;
4252 }
4253 
4254 /** TLS handshake to upgrade TCP connection */
4255 static int
4256 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
4257 {
4258 	int r;
4259 	if(data->shake_state == tls_hs_read_event) {
4260 		/* read condition satisfied back to writing */
4261 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4262 		data->shake_state = tls_hs_none;
4263 		return 1;
4264 	}
4265 	if(data->shake_state == tls_hs_write_event) {
4266 		/* write condition satisfied back to reading */
4267 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4268 		data->shake_state = tls_hs_none;
4269 		return 1;
4270 	}
4271 
4272 	/* (continue to) setup the TLS connection */
4273 	ERR_clear_error();
4274 	r = SSL_do_handshake(data->tls);
4275 
4276 	if(r != 1) {
4277 		int want = SSL_get_error(data->tls, r);
4278 		if(want == SSL_ERROR_WANT_READ) {
4279 			if(data->shake_state == tls_hs_read) {
4280 				/* try again later */
4281 				return 1;
4282 			}
4283 			data->shake_state = tls_hs_read;
4284 			/* switch back to reading mode */
4285 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4286 			return 1;
4287 		} else if(want == SSL_ERROR_WANT_WRITE) {
4288 			if(data->shake_state == tls_hs_write) {
4289 				/* try again later */
4290 				return 1;
4291 			}
4292 			data->shake_state = tls_hs_write;
4293 			/* switch back to writing mode */
4294 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4295 			return 1;
4296 		} else {
4297 			if(r == 0)
4298 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
4299 			else {
4300 				unsigned long err = ERR_get_error();
4301 				if(!squelch_err_ssl_handshake(err)) {
4302 					char a[64], s[256];
4303 					addr2str(&data->query->addr, a, sizeof(a));
4304 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
4305 					log_crypto_from_err(s, err);
4306 				}
4307 			}
4308 			cleanup_tcp_handler(data);
4309 			return 0;
4310 		}
4311 	}
4312 
4313 	/* Log the successful upgrade, useful for testing; could be removed. */
4314 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
4315 	/* set back to the event we need to have when reading (or writing) */
4316 	if(data->shake_state == tls_hs_read && writing) {
4317 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4318 	} else if(data->shake_state == tls_hs_write && !writing) {
4319 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4320 	}
4321 	data->shake_state = tls_hs_none;
4322 	return 1;
4323 }
4324 
4325 /** handle TLS reading of incoming query */
4326 static void
4327 handle_tls_reading(int fd, short event, void* arg)
4328 {
4329 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4330 	ssize_t received;
4331 	uint32_t now = 0;
4332 
4333 	if ((event & EV_TIMEOUT)) {
4334 		/* Connection timed out.  */
4335 		cleanup_tcp_handler(data);
4336 		return;
4337 	}
4338 
4339 	if ((data->nsd->tcp_query_count > 0 &&
4340 	    data->query_count >= data->nsd->tcp_query_count) ||
4341 	    data->tcp_no_more_queries) {
4342 		/* No more queries allowed on this tcp connection. */
4343 		cleanup_tcp_handler(data);
4344 		return;
4345 	}
4346 
4347 	assert((event & EV_READ));
4348 
4349 	if (data->bytes_transmitted == 0) {
4350 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4351 	}
4352 
4353 	if(data->shake_state != tls_hs_none) {
4354 		if(!tls_handshake(data, fd, 0))
4355 			return;
4356 		if(data->shake_state != tls_hs_none)
4357 			return;
4358 	}
4359 
4360 	/*
4361 	 * Check if we received the leading packet length bytes yet.
4362 	 */
4363 	if(data->bytes_transmitted < sizeof(uint16_t)) {
4364 		ERR_clear_error();
4365 		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
4366 		    + data->bytes_transmitted,
4367 		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
4368 			int want = SSL_get_error(data->tls, received);
4369 			if(want == SSL_ERROR_ZERO_RETURN) {
4370 				cleanup_tcp_handler(data);
4371 				return; /* shutdown, closed */
4372 			} else if(want == SSL_ERROR_WANT_READ) {
4373 				/* wants to be called again */
4374 				return;
4375 			}
4376 			else if(want == SSL_ERROR_WANT_WRITE) {
4377 				/* switch to writing */
4378 				data->shake_state = tls_hs_write_event;
4379 				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4380 				return;
4381 			}
4382 			cleanup_tcp_handler(data);
4383 			log_crypto_err("could not SSL_read");
4384 			return;
4385 		}
4386 
4387 		data->bytes_transmitted += received;
4388 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4389 			/*
4390 			 * Not done with the tcplen yet, wait for more
4391 			 * data to become available.
4392 			 */
4393 			return;
4394 		}
4395 
4396 		assert(data->bytes_transmitted == sizeof(uint16_t));
4397 
4398 		data->query->tcplen = ntohs(data->query->tcplen);
4399 
4400 		/*
4401 		 * Minimum query size is:
4402 		 *
4403 		 *     Size of the header (12)
4404 		 *   + Root domain name   (1)
4405 		 *   + Query class        (2)
4406 		 *   + Query type         (2)
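		 *   = 17 octets in total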
4407 		 */
4408 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4409 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4410 			cleanup_tcp_handler(data);
4411 			return;
4412 		}
4413 
4414 		if (data->query->tcplen > data->query->maxlen) {
4415 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4416 			cleanup_tcp_handler(data);
4417 			return;
4418 		}
4419 
4420 		buffer_set_limit(data->query->packet, data->query->tcplen);
4421 	}
4422 
4423 	assert(buffer_remaining(data->query->packet) > 0);
4424 
4425 	/* Read the (remaining) query data.  */
4426 	ERR_clear_error();
4427 	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
4428 			    (int)buffer_remaining(data->query->packet));
4429 	if(received <= 0) {
4430 		int want = SSL_get_error(data->tls, received);
4431 		if(want == SSL_ERROR_ZERO_RETURN) {
4432 			cleanup_tcp_handler(data);
4433 			return; /* shutdown, closed */
4434 		} else if(want == SSL_ERROR_WANT_READ) {
4435 			/* wants to be called again */
4436 			return;
4437 		}
4438 		else if(want == SSL_ERROR_WANT_WRITE) {
4439 			/* switch back writing */
4440 			data->shake_state = tls_hs_write_event;
4441 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4442 			return;
4443 		}
4444 		cleanup_tcp_handler(data);
4445 		log_crypto_err("could not SSL_read");
4446 		return;
4447 	}
4448 
4449 	data->bytes_transmitted += received;
4450 	buffer_skip(data->query->packet, received);
4451 	if (buffer_remaining(data->query->packet) > 0) {
4452 		/*
4453 		 * Message not yet complete, wait for more data to
4454 		 * become available.
4455 		 */
4456 		return;
4457 	}
4458 
4459 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4460 
4461 	/* Account... */
4462 #ifndef INET6
4463 	STATUP(data->nsd, ctls);
4464 #else
4465 	if (data->query->addr.ss_family == AF_INET) {
4466 		STATUP(data->nsd, ctls);
4467 	} else if (data->query->addr.ss_family == AF_INET6) {
4468 		STATUP(data->nsd, ctls6);
4469 	}
4470 #endif
4471 
4472 	/* We have a complete query, process it.  */
4473 
4474 	/* tcp-query-count: handle query counter ++ */
4475 	data->query_count++;
4476 
4477 	buffer_flip(data->query->packet);
4478 #ifdef USE_DNSTAP
4479 	/*
4480 	 * and send TCP-query with found address (local) and client address to dnstap process
4481 	 */
4482 	log_addr("query from client", &data->query->addr);
4483 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4484 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4485 		data->query->addrlen, data->query->tcp, data->query->packet);
4486 #endif /* USE_DNSTAP */
4487 	data->query_state = server_process_query(data->nsd, data->query, &now);
4488 	if (data->query_state == QUERY_DISCARDED) {
4489 		/* Drop the packet and the entire connection... */
4490 		STATUP(data->nsd, dropped);
4491 		ZTATUP(data->nsd, data->query->zone, dropped);
4492 		cleanup_tcp_handler(data);
4493 		return;
4494 	}
4495 
4496 #ifdef BIND8_STATS
4497 	if (RCODE(data->query->packet) == RCODE_OK
4498 	    && !AA(data->query->packet))
4499 	{
4500 		STATUP(data->nsd, nona);
4501 		ZTATUP(data->nsd, data->query->zone, nona);
4502 	}
4503 #endif /* BIND8_STATS */
4504 
4505 #ifdef USE_ZONE_STATS
4506 #ifndef INET6
4507 	ZTATUP(data->nsd, data->query->zone, ctls);
4508 #else
4509 	if (data->query->addr.ss_family == AF_INET) {
4510 		ZTATUP(data->nsd, data->query->zone, ctls);
4511 	} else if (data->query->addr.ss_family == AF_INET6) {
4512 		ZTATUP(data->nsd, data->query->zone, ctls6);
4513 	}
4514 #endif
4515 #endif /* USE_ZONE_STATS */
4516 
4517 	query_add_optional(data->query, data->nsd, &now);
4518 
4519 	/* Switch to the tcp write handler.  */
4520 	buffer_flip(data->query->packet);
4521 	data->query->tcplen = buffer_remaining(data->query->packet);
4522 #ifdef BIND8_STATS
4523 	/* Account the rcode & TC... */
4524 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4525 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4526 	if (TC(data->query->packet)) {
4527 		STATUP(data->nsd, truncated);
4528 		ZTATUP(data->nsd, data->query->zone, truncated);
4529 	}
4530 #endif /* BIND8_STATS */
4531 #ifdef USE_DNSTAP
4532 	/*
4533 	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
4534 	 */
4535 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4536 	log_addr("response to client", &data->query->addr);
4537 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4538 		data->query->addrlen, data->query->tcp, data->query->packet,
4539 		data->query->zone);
4540 #endif /* USE_DNSTAP */
4541 	data->bytes_transmitted = 0;
4542 
4543 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4544 
4545 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
4546 	handle_tls_writing(fd, EV_WRITE, data);
4547 }
4548 
4549 /** handle TLS writing of outgoing response */
4550 static void
4551 handle_tls_writing(int fd, short event, void* arg)
4552 {
4553 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4554 	ssize_t sent;
4555 	struct query *q = data->query;
4556 	/* static variable that holds the reassembly buffer used to put
4557 	 * the TCP length in front of the packet, like writev. */
4558 	static buffer_type* global_tls_temp_buffer = NULL;
4559 	buffer_type* write_buffer;
4560 	uint32_t now = 0;
4561 
4562 	if ((event & EV_TIMEOUT)) {
4563 		/* Connection timed out.  */
4564 		cleanup_tcp_handler(data);
4565 		return;
4566 	}
4567 
4568 	assert((event & EV_WRITE));
4569 
4570 	if(data->shake_state != tls_hs_none) {
4571 		if(!tls_handshake(data, fd, 1))
4572 			return;
4573 		if(data->shake_state != tls_hs_none)
4574 			return;
4575 	}
4576 
4577 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
4578 
4579 	/* If we are writing the start of a message, we must include the
4580 	 * length; this is done with a copy into write_buffer. */
4581 	write_buffer = NULL;
4582 	if (data->bytes_transmitted == 0) {
4583 		if(!global_tls_temp_buffer) {
4584 			/* gets deallocated when nsd shuts down from
4585 			 * nsd.region */
4586 			global_tls_temp_buffer = buffer_create(nsd.region,
4587 				QIOBUFSZ + sizeof(q->tcplen));
4588 			if (!global_tls_temp_buffer) {
4589 				return;
4590 			}
4591 		}
4592 		write_buffer = global_tls_temp_buffer;
4593 		buffer_clear(write_buffer);
4594 		buffer_write_u16(write_buffer, q->tcplen);
4595 		buffer_write(write_buffer, buffer_current(q->packet),
4596 			(int)buffer_remaining(q->packet));
4597 		buffer_flip(write_buffer);
4598 	} else {
4599 		write_buffer = q->packet;
4600 	}
4601 
4602 	/* Write the response */
4603 	ERR_clear_error();
4604 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4605 	if(sent <= 0) {
4606 		int want = SSL_get_error(data->tls, sent);
4607 		if(want == SSL_ERROR_ZERO_RETURN) {
4608 			cleanup_tcp_handler(data);
4609 			/* closed */
4610 		} else if(want == SSL_ERROR_WANT_READ) {
4611 			/* switch back to reading */
4612 			data->shake_state = tls_hs_read_event;
4613 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4614 		} else if(want != SSL_ERROR_WANT_WRITE) {
4615 			cleanup_tcp_handler(data);
4616 			log_crypto_err("could not SSL_write");
4617 		}
4618 		return;
4619 	}
4620 
4621 	buffer_skip(write_buffer, sent);
4622 	if(buffer_remaining(write_buffer) != 0) {
4623 		/* If not all sent, sync up the real buffer if it wasn't used.*/
4624 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4625 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4626 		}
4627 	}
4628 
4629 	data->bytes_transmitted += sent;
4630 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4631 		/*
4632 		 * Still more data to write when socket becomes
4633 		 * writable again.
4634 		 */
4635 		return;
4636 	}
4637 
4638 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4639 
4640 	if (data->query_state == QUERY_IN_AXFR ||
4641 		data->query_state == QUERY_IN_IXFR) {
4642 		/* Continue processing AXFR and writing back results.  */
4643 		buffer_clear(q->packet);
4644 		if(data->query_state == QUERY_IN_AXFR)
4645 			data->query_state = query_axfr(data->nsd, q, 0);
4646 		else data->query_state = query_ixfr(data->nsd, q);
4647 		if (data->query_state != QUERY_PROCESSED) {
4648 			query_add_optional(data->query, data->nsd, &now);
4649 
4650 			/* Reset data. */
4651 			buffer_flip(q->packet);
4652 			q->tcplen = buffer_remaining(q->packet);
4653 			data->bytes_transmitted = 0;
4654 			/* Reset to writing mode.  */
4655 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4656 
4657 			/*
4658 			 * Write data if/when the socket is writable
4659 			 * again.
4660 			 */
4661 			return;
4662 		}
4663 	}
4664 
4665 	/*
4666 	 * Done sending, wait for the next request to arrive on the
4667 	 * TCP socket by installing the TCP read handler.
4668 	 */
4669 	if ((data->nsd->tcp_query_count > 0 &&
4670 		data->query_count >= data->nsd->tcp_query_count) ||
4671 		data->tcp_no_more_queries) {
4672 
4673 		(void) shutdown(fd, SHUT_WR);
4674 	}
4675 
4676 	data->bytes_transmitted = 0;
4677 
4678 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4679 }
4680 #endif
4681 
4682 static void
4683 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4684 	void* ATTR_UNUSED(arg))
4685 {
4686 	if(slowaccept) {
4687 		configure_handler_event_types(EV_PERSIST | EV_READ);
4688 		slowaccept = 0;
4689 	}
4690 }
4691 
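/*
 * Accept a new connection and make the resulting socket nonblocking,
 * using accept4(2) where available and accept(2) plus fcntl(2)
 * otherwise.
 */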
4692 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4693 {
4694 #ifndef HAVE_ACCEPT4
4695 	int s = accept(fd, addr, addrlen);
4696 	if (s != -1) {
4697 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4698 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4699 			close(s);
4700 			s = -1;
4701 			errno=EINTR; /* setting this errno suppresses the
4702 				error printout in the later code that checks
4703 				the accept result, as for an interrupted accept */
4704 		}
4705 	}
4706 	return s;
4707 #else
4708 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4709 #endif /* HAVE_ACCEPT4 */
4710 }
4711 
4712 /*
4713  * Handle an incoming TCP connection.  The connection is accepted and
4714  * a new TCP reader event handler is added.  The TCP handler
4715  * is responsible for cleanup when the connection is closed.
4716  */
4717 static void
4718 handle_tcp_accept(int fd, short event, void* arg)
4719 {
4720 	struct tcp_accept_handler_data *data
4721 		= (struct tcp_accept_handler_data *) arg;
4722 	int s;
4723 	int reject = 0;
4724 	struct tcp_handler_data *tcp_data;
4725 	region_type *tcp_region;
4726 #ifdef INET6
4727 	struct sockaddr_storage addr;
4728 #else
4729 	struct sockaddr_in addr;
4730 #endif
4731 	socklen_t addrlen;
4732 	struct timeval timeout;
4733 
4734 	if (!(event & EV_READ)) {
4735 		return;
4736 	}
4737 
4738 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4739 		reject = data->nsd->options->tcp_reject_overflow;
4740 		if (!reject) {
4741 			return;
4742 		}
4743 	}
4744 
4745 	/* Accept it... */
4746 	addrlen = sizeof(addr);
4747 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4748 	if (s == -1) {
4749 		/**
4750 		 * EMFILE and ENFILE signal that the limit of open
4751 		 * file descriptors has been reached. Pause accept().
4752 		 * EINTR is a signal interrupt. The others are various OS ways
4753 		 * of saying that the client has closed the connection.
4754 		 */
4755 		if (errno == EMFILE || errno == ENFILE) {
4756 			if (!slowaccept) {
4757 				/* disable accept events */
4758 				struct timeval tv;
4759 				configure_handler_event_types(0);
4760 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4761 				tv.tv_usec = 0L;
4762 				memset(&slowaccept_event, 0,
4763 					sizeof(slowaccept_event));
4764 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4765 					handle_slowaccept_timeout, NULL);
4766 				(void)event_base_set(data->event.ev_base,
4767 					&slowaccept_event);
4768 				(void)event_add(&slowaccept_event, &tv);
4769 				slowaccept = 1;
4770 				/* We don't want to spam the logs here */
4771 			}
4772 		} else if (errno != EINTR
4773 			&& errno != EWOULDBLOCK
4774 #ifdef ECONNABORTED
4775 			&& errno != ECONNABORTED
4776 #endif /* ECONNABORTED */
4777 #ifdef EPROTO
4778 			&& errno != EPROTO
4779 #endif /* EPROTO */
4780 			) {
4781 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4782 		}
4783 		return;
4784 	}
4785 
4786 	if (reject) {
4787 		shutdown(s, SHUT_RDWR);
4788 		close(s);
4789 		return;
4790 	}
4791 
4792 	/*
4793 	 * This region is deallocated when the TCP connection is
4794 	 * closed by the TCP handler.
4795 	 */
4796 	tcp_region = region_create(xalloc, free);
4797 	tcp_data = (struct tcp_handler_data *) region_alloc(
4798 		tcp_region, sizeof(struct tcp_handler_data));
4799 	tcp_data->region = tcp_region;
4800 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4801 		compression_table_size, compressed_dnames);
4802 	tcp_data->nsd = data->nsd;
4803 	tcp_data->query_count = 0;
4804 #ifdef HAVE_SSL
4805 	tcp_data->shake_state = tls_hs_none;
4806 	tcp_data->tls = NULL;
4807 #endif
4808 	tcp_data->prev = NULL;
4809 	tcp_data->next = NULL;
4810 
4811 	tcp_data->query_state = QUERY_PROCESSED;
4812 	tcp_data->bytes_transmitted = 0;
4813 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4814 	tcp_data->query->addrlen = addrlen;
4815 
4816 	tcp_data->tcp_no_more_queries = 0;
4817 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4818 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4819 		/* very busy, give smaller timeout */
4820 		tcp_data->tcp_timeout = 200;
4821 	}
4822 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4823 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4824 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
4825 
4826 #ifdef USE_DNSTAP
4827 	/* save the address of the connection */
4828 	tcp_data->socket = data->socket;
4829 #endif /* USE_DNSTAP */
4830 
4831 #ifdef HAVE_SSL
4832 	if (data->tls_accept) {
4833 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4834 		if(!tcp_data->tls) {
4835 			close(s);
4836 			return;
4837 		}
4838 		tcp_data->shake_state = tls_hs_read;
4839 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4840 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4841 			  handle_tls_reading, tcp_data);
4842 	} else {
4843 #endif
4844 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4845 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4846 			  handle_tcp_reading, tcp_data);
4847 #ifdef HAVE_SSL
4848 	}
4849 #endif
4850 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4851 		log_msg(LOG_ERR, "cannot set tcp event base");
4852 		close(s);
4853 		region_destroy(tcp_region);
4854 		return;
4855 	}
4856 	if(event_add(&tcp_data->event, &timeout) != 0) {
4857 		log_msg(LOG_ERR, "cannot add tcp to event base");
4858 		close(s);
4859 		region_destroy(tcp_region);
4860 		return;
4861 	}
4862 	if(tcp_active_list) {
4863 		tcp_active_list->prev = tcp_data;
4864 		tcp_data->next = tcp_active_list;
4865 	}
4866 	tcp_active_list = tcp_data;
4867 
4868 	/*
4869 	 * Keep track of the total number of TCP handlers installed so
4870 	 * we can stop accepting connections when the maximum number
4871 	 * of simultaneous TCP connections is reached.
4872 	 *
4873 	 * If tcp-reject-overflow is enabled, however, then we do not
4874 	 * change the handler event type; we keep it as-is and accept
4875 	 * overflow TCP connections only so that we can forcibly kill
4876 	 * them off.
4877 	 */
4878 	++data->nsd->current_tcp_count;
4879 	if (!data->nsd->options->tcp_reject_overflow &&
4880 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4881 	{
4882 		configure_handler_event_types(0);
4883 	}
4884 }
4885 
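/*
 * Write a command to the IPC pipe of every child server; when timeout
 * is positive, wait (up to that timeout) for each child to echo the
 * command back before the pipe is closed.
 */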
4886 static void
4887 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4888 {
4889 	size_t i;
4890 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4891 	for (i = 0; i < nsd->child_count; ++i) {
4892 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4893 			if (write(nsd->children[i].child_fd,
4894 				&command,
4895 				sizeof(command)) == -1)
4896 			{
4897 				if(errno != EAGAIN && errno != EINTR)
4898 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4899 					(int) command,
4900 					(int) nsd->children[i].pid,
4901 					strerror(errno));
4902 			} else if (timeout > 0) {
4903 				(void)block_read(NULL,
4904 					nsd->children[i].child_fd,
4905 					&command, sizeof(command), timeout);
4906 			}
4907 			fsync(nsd->children[i].child_fd);
4908 			close(nsd->children[i].child_fd);
4909 			nsd->children[i].child_fd = -1;
4910 		}
4911 	}
4912 }
4913 
4914 static void
4915 send_children_quit(struct nsd* nsd)
4916 {
4917 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4918 	send_children_command(nsd, NSD_QUIT, 0);
4919 }
4920 
4921 static void
4922 send_children_quit_and_wait(struct nsd* nsd)
4923 {
4924 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4925 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4926 }
4927 
4928 #ifdef BIND8_STATS
4929 static void
4930 set_children_stats(struct nsd* nsd)
4931 {
4932 	size_t i;
4933 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4934 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
4935 	for (i = 0; i < nsd->child_count; ++i) {
4936 		nsd->children[i].need_to_send_STATS = 1;
4937 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
4938 	}
4939 }
4940 #endif /* BIND8_STATS */
4941 
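/*
 * Install or remove the TCP accept handlers for all interfaces:
 * nonzero event_types (re)registers the accept events with those
 * types, zero removes them (used when the connection limit is
 * reached).
 */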
4942 static void
4943 configure_handler_event_types(short event_types)
4944 {
4945 	size_t i;
4946 
4947 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4948 		struct event* handler = &tcp_accept_handlers[i].event;
4949 		if(event_types) {
4950 			/* reassign */
4951 			int fd = handler->ev_fd;
4952 			struct event_base* base = handler->ev_base;
4953 			if(tcp_accept_handlers[i].event_added)
4954 				event_del(handler);
4955 			memset(handler, 0, sizeof(*handler));
4956 			event_set(handler, fd, event_types,
4957 				handle_tcp_accept, &tcp_accept_handlers[i]);
4958 			if(event_base_set(base, handler) != 0)
4959 				log_msg(LOG_ERR, "conhand: cannot event_base");
4960 			if(event_add(handler, NULL) != 0)
4961 				log_msg(LOG_ERR, "conhand: cannot event_add");
4962 			tcp_accept_handlers[i].event_added = 1;
4963 		} else {
4964 			/* remove */
4965 			if(tcp_accept_handlers[i].event_added) {
4966 				event_del(handler);
4967 				tcp_accept_handlers[i].event_added = 0;
4968 			}
4969 		}
4970 	}
4971 }
4972