/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
#include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
# ifdef HAVE_EVENT_H
#  include <event.h>
# else
#  include <event2/event.h>
#  include "event2/event_struct.h"
#  include "event2/event_compat.h"
# endif
#else
# include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#include "ixfr.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif
#include "verify.h"

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_DNSTAP
/*
 * log_addr() - print the contents of a sockaddr_in/sockaddr_in6 structure,
 * just like it is done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
 */
static void
log_addr(const char* descr,
#ifdef INET6
	struct sockaddr_storage* addr
#else
	struct sockaddr_in* addr
#endif
	)
{
	char str_buf[64];
	if(verbosity < 6)
		return;
	if(
#ifdef INET6
		addr->ss_family == AF_INET
#else
		addr->sin_family == AF_INET
#endif
		) {
		struct sockaddr_in* s = (struct sockaddr_in*)addr;
		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
#ifdef INET6
	} else {
		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
#endif
	}
}
#endif /* USE_DNSTAP */

#ifdef USE_TCP_FASTOPEN
#define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
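/* In the net.ipv4.tcp_fastopen bitmask, 0x1 enables client support and
 * 0x2 enables server support; only the server bit is checked here. */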
#define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
#endif

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd *nsd;
	struct nsd_socket *socket;
	struct event event;
};

struct tcp_accept_handler_data {
	struct nsd *nsd;
	struct nsd_socket *socket;
	int event_added;
	struct event event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int tls_accept;
#endif
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

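/* If the C library lacks struct mmsghdr, define a compatible fallback so
 * the batch arrays below can be used even when recvmmsg/sendmmsg have to
 * be emulated one message at a time. */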
#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int msg_len;
};
#endif

static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
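/*
 * These arrays are reused for each batch of datagrams handled per UDP
 * event. A minimal illustrative sketch of the receive path, assuming
 * recvmmsg(2) is available (process_and_reply is an illustrative name,
 * not a function in this file):
 *
 *	int i, n = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
 *	for(i = 0; i < n; i++)
 *		process_and_reply(queries[i], msgs[i].msg_len);
 */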

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O. This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler. When the socket becomes readable/writable again we
 * continue from the same position.
 */
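/*
 * A minimal illustrative sketch of that resume pattern (not an actual
 * function in this file); bytes_transmitted is the field in struct
 * tcp_handler_data below:
 *
 *	received = read(fd, buf + data->bytes_transmitted,
 *		total_size - data->bytes_transmitted);
 *	if(received == -1 && errno == EAGAIN)
 *		return;
 *	data->bytes_transmitted += received;
 *
 * When the event loop reports the socket ready again, the handler is
 * called once more and continues at bytes_transmitted.
 */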
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure. This region is destroyed
	 * when the connection is closed.
	 */
	region_type* region;

	/*
	 * The global nsd structure.
	 */
	struct nsd* nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type* query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet. The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef USE_DNSTAP
	/* the accept socket, used to find the proper (local) service
	 * address the connection is bound to. */
	struct nsd_socket *socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets. These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type. This is done using the function
 * configure_handler_event_types.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit command (nonblocking), then close the pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time; waits for the children to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set the children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void) {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	close(tcp_fastopen_fp);
}
#endif

/*
 * Remove the specified pid from the list of child pids. Returns -1 if
 * the pid is not in the list, child_num otherwise. The field is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif
				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACH */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
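	/* e.g. with period 60 and time(NULL) % 60 == 20, this arms
	 * alarm(40), so the next alarm lands on the minute boundary */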
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
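	/* extend each file to sz bytes by seeking to sz-1 and writing one
	 * byte, so the mmap below is fully backed by the file */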
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* Switch over to the other array for the new children, which briefly
 * coexist with the old children; we want to avoid both sets writing to
 * the same statistics array. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

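/* The set_* socket option helpers below generally return 1 on success,
 * 0 when the option is unsupported on this platform (a soft failure),
 * and -1 on a hard error. */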
static int
set_cloexec(struct nsd_socket *sock)
{
	assert(sock != NULL);

	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
		const char *socktype =
			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
	int on = 1;
#ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT on Linux. This is what users want with the config
	 * option in nsd.conf; if local address and port reuse is actually
	 * needed, SO_REUSEPORT would have to be set as well, so assume it
	 * is the _LB behavior they want.
	 */
	int opt = SO_REUSEPORT_LB;
	static const char optname[] = "SO_REUSEPORT_LB";
#else /* !SO_REUSEPORT_LB */
	int opt = SO_REUSEPORT;
	static const char optname[] = "SO_REUSEPORT";
#endif /* SO_REUSEPORT_LB */

	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
		return 1;
	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
			optname, strerror(errno));
	}
	return -1;
#else
	(void)sock;
#endif /* SO_REUSEPORT */

	return 0;
}

static int
set_reuseaddr(struct nsd_socket *sock)
{
#ifdef SO_REUSEADDR
	int on = 1;
	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_REUSEADDR */
	return 0;
}

static int
set_rcvbuf(struct nsd_socket *sock, int rcv)
{
#ifdef SO_RCVBUF
#ifdef SO_RCVBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_RCVBUFFORCE */
	if (0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_RCVBUFFORCE */
#endif /* SO_RCVBUF */

	return 0;
}

static int
set_sndbuf(struct nsd_socket *sock, int snd)
{
#ifdef SO_SNDBUF
#ifdef SO_SNDBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_SNDBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_SNDBUFFORCE */
#endif /* SO_SNDBUF */

	return 0;
}

static int
set_nonblock(struct nsd_socket *sock)
{
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
		log_msg(LOG_ERR, "fcntl(..., O_NONBLOCK) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

#ifdef INET6
static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef IPV6_V6ONLY
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_V6ONLY */

	return 0;
}
#endif /* INET6 */

#ifdef INET6
static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network. Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
	 * to the MIN MTU to get the same.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_USE_MIN_MTU || IPV6_MTU */

	return 0;
}
#endif /* INET6 */

static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed if
	 * and only if the packet size exceeds the outgoing interface MTU or
	 * the packet encounters a smaller MTU link in the network. This
	 * mitigates DNS fragmentation attacks by preventing forged PMTU
	 * information. FreeBSD already has the same semantics without
	 * setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platforms manage this setting
	differently for different address families (IPv4 vs IPv6).
	The preprocessor blob below abstracts away that variability
	in a way that leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
# define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT"
// as of 2020-01, Linux does not support this on IPv6 programmatically
#elif defined(SO_BINDANY)
# define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY"
#elif defined(IP_BINDANY)
# define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
# ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#  define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
# endif
# ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#  define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
# endif
# ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#  define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
# endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
	return -1;
#endif

	return 0;
}

static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. The
	 * limit is a defense against IP spoofing attacks as suggested in
	 * RFC 7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* The macOS implementation only supports a qlen of 1 via this call.
	 * The actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
				 "; this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity is enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
			"SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
		(const void *)&sock->fib, sizeof(sock->fib)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
			"SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
	int rcv = 1*1024*1024, snd = 1*1024*1024;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(nsd->options->receive_buffer_size > 0)
		rcv = nsd->options->receive_buffer_size;
	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(nsd->options->send_buffer_size > 0)
		snd = nsd->options->send_buffer_size;
	if(set_sndbuf(sock, snd) == -1)
		return -1;
#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable.
	 */
	set_nonblock(sock);

	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}

static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;
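		/* e.g. 2 interfaces with reuseport 4 yields 8 socket
		 * slots: one full set of interfaces per server instance */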

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			memset(&nsd->udp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
			 * This means we should not close TCP used by
			 * other servers in reuseport enabled mode, in
			 * server_child().
			 */
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	/* open server interface ports for verifiers */
	for(i = 0; i < nsd->verify_ifs; i++) {
		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
		{
			return -1;
		}
	}

	return 0;
}

/*
 * Prepare the server for take off.
 *
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
# ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
# endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially. */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

static void
server_close_socket(struct nsd_socket *sock)
{
	if(sock->s != -1) {
		close(sock->s);
		sock->s = -1;
	}
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		server_close_socket(&sockets[i]);
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_free_ixfr(nsd->db);
	namedb_close_udb(nsd->db); /* keeps mmap */
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}


void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* ENOTREACH */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 * parent fills one taskdb with soas, xfrd fills other with expires.
	 * then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 * may be in use by reload. Fill SOA in taskdb and give to xfrd.
	 * expire notifications can be sent back via a normal reload later
	 * (xfrd will wait for current running reload to finish if any).
	 */
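	/* Illustrative summary of the exchange below: fill the taskdb with
	 * SOA info; wait for NSD_RELOAD from xfrd; hand the taskdb over
	 * with NSD_RELOAD_DONE plus our pid; then, in the !shortsoa case,
	 * drain the expiry tasks xfrd wrote into the other taskdb and
	 * acknowledge with a second NSD_RELOAD_DONE. */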
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			/* ENOTREACH */
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the work items xfrd wrote (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}
1776
1777 #ifdef HAVE_SSL
1778 static void
log_crypto_from_err(const char * str,unsigned long err)1779 log_crypto_from_err(const char* str, unsigned long err)
1780 {
1781 /* error:[error code]:[library name]:[function name]:[reason string] */
1782 char buf[128];
1783 unsigned long e;
1784 ERR_error_string_n(err, buf, sizeof(buf));
1785 log_msg(LOG_ERR, "%s crypto %s", str, buf);
1786 while( (e=ERR_get_error()) ) {
1787 ERR_error_string_n(e, buf, sizeof(buf));
1788 log_msg(LOG_ERR, "and additionally crypto %s", buf);
1789 }
1790 }
1791
1792 void
log_crypto_err(const char * str)1793 log_crypto_err(const char* str)
1794 {
1795 log_crypto_from_err(str, ERR_get_error());
1796 }
1797
1798 /** true if the ssl handshake error has to be squelched from the logs */
1799 static int
1800 squelch_err_ssl_handshake(unsigned long err)
1801 {
1802 if(verbosity >= 3)
1803 return 0; /* only squelch on low verbosity */
1804 /* this is very specific, we could filter on ERR_GET_REASON()
1805 * (the third element in ERR_PACK) */
1806 if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1807 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1808 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1809 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1810 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1811 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1812 #endif
1813 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1814 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1815 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1816 # ifdef SSL_R_VERSION_TOO_LOW
1817 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1818 # endif
1819 #endif
1820 )
1821 return 1;
1822 return 0;
1823 }
1824
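/*
 * One-time OpenSSL library setup; intended to run once at startup, before
 * any SSL_CTX is created (for instance by server_tls_ctx_create below).
 * The RAND fallback that seeds from time and pid is deliberately minimal
 * and only used when the library reports no entropy at all; the warning
 * logged below makes that visible.
 */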
1825 void
1826 perform_openssl_init(void)
1827 {
1828 /* init SSL library */
1829 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1830 ERR_load_crypto_strings();
1831 #endif
1832 #if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
1833 ERR_load_SSL_strings();
1834 #endif
1835 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1836 OpenSSL_add_all_algorithms();
1837 #else
1838 OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1839 | OPENSSL_INIT_ADD_ALL_DIGESTS
1840 | OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1841 #endif
1842 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1843 (void)SSL_library_init();
1844 #else
1845 OPENSSL_init_ssl(0, NULL);
1846 #endif
1847
1848 if(!RAND_status()) {
1849 /* try to seed it */
1850 unsigned char buf[256];
1851 unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1852 size_t i;
1853 v = seed;
1854 for(i=0; i<256/sizeof(v); i++) {
1855 memmove(buf+i*sizeof(v), &v, sizeof(v));
1856 v = v*seed + (unsigned int)i;
1857 }
1858 RAND_seed(buf, 256);
1859 log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1860 }
1861 }
1862
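/*
 * Read an OCSP response from file and return it as malloc'ed DER in *ocsp.
 * This uses the usual two-pass i2d idiom: the first i2d_OCSP_RESPONSE call
 * with a NULL output pointer only measures the encoding, the second call
 * writes it out (advancing the temporary pointer p, which is why buf is
 * kept separately). Returns the DER length, or -1 on error.
 */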
1863 static int
1864 get_ocsp(char *filename, unsigned char **ocsp)
1865 {
1866 BIO *bio;
1867 OCSP_RESPONSE *response;
1868 int len = -1;
1869 unsigned char *p, *buf;
1870 assert(filename);
1871
1872 if ((bio = BIO_new_file(filename, "r")) == NULL) {
1873 log_crypto_err("get_ocsp: BIO_new_file failed");
1874 return -1;
1875 }
1876
1877 if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1878 log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1879 BIO_free(bio);
1880 return -1;
1881 }
1882
1883 if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1884 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1885 OCSP_RESPONSE_free(response);
1886 BIO_free(bio);
1887 return -1;
1888 }
1889
1890 if ((buf = malloc((size_t) len)) == NULL) {
1891 log_msg(LOG_ERR, "get_ocsp: malloc failed");
1892 OCSP_RESPONSE_free(response);
1893 BIO_free(bio);
1894 return -1;
1895 }
1896
1897 p = buf;
1898 if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1899 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1900 free(buf);
1901 OCSP_RESPONSE_free(response);
1902 BIO_free(bio);
1903 return -1;
1904 }
1905
1906 OCSP_RESPONSE_free(response);
1907 BIO_free(bio);
1908
1909 *ocsp = buf;
1910 return len;
1911 }
1912
1913 /* further setup ssl ctx after the keys are loaded */
1914 static void
1915 listen_sslctx_setup_2(void* ctxt)
1916 {
1917 SSL_CTX* ctx = (SSL_CTX*)ctxt;
1918 (void)ctx;
1919 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1920 if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1921 /* ENOTREACH */
1922 log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
1923 }
1924 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1925 if(1) {
1926 EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1927 if (!ecdh) {
1928 log_crypto_err("could not find p256, not enabling ECDHE");
1929 } else {
1930 if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1931 log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1932 }
1933 EC_KEY_free (ecdh);
1934 }
1935 }
1936 #endif
1937 }
1938
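/*
 * Status-request (OCSP stapling) callback, registered via
 * SSL_CTX_set_tlsext_status_cb in server_tls_ctx_create below. A fresh
 * malloc'ed copy of the stapled response is handed to every connection
 * because SSL_set_tlsext_status_ocsp_resp takes ownership of the buffer.
 */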
1939 static int
1940 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1941 {
1942 if(ocspdata) {
1943 unsigned char *p;
1944 if ((p=malloc(ocspdata_len)) == NULL) {
1945 log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1946 return SSL_TLSEXT_ERR_NOACK;
1947 }
1948 memcpy(p, ocspdata, ocspdata_len);
1949 if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1950 log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1951 free(p);
1952 return SSL_TLSEXT_ERR_NOACK;
1953 }
1954 return SSL_TLSEXT_ERR_OK;
1955 } else {
1956 return SSL_TLSEXT_ERR_NOACK;
1957 }
1958 }
1959
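/*
 * Build a server SSL_CTX: TLSv1.2 or later only, server-side cipher
 * preference, no renegotiation, and optional client-certificate
 * verification. The "(SSL_CTX_set_options(ctx, X) & X) != X" pattern
 * works because SSL_CTX_set_options returns the updated option bitmask,
 * so the check confirms the bit really got set.
 */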
1960 SSL_CTX*
1961 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1962 {
1963 SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1964 if(!ctx) {
1965 log_crypto_err("could not SSL_CTX_new");
1966 return NULL;
1967 }
1968 /* no SSLv2, SSLv3 because they have defects */
1969 #if SSL_OP_NO_SSLv2 != 0
1970 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1971 log_crypto_err("could not set SSL_OP_NO_SSLv2");
1972 SSL_CTX_free(ctx);
1973 return NULL;
1974 }
1975 #endif
1976 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1977 != SSL_OP_NO_SSLv3){
1978 log_crypto_err("could not set SSL_OP_NO_SSLv3");
1979 SSL_CTX_free(ctx);
1980 return NULL;
1981 }
1982 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1983 /* if we have tls 1.1 disable 1.0 */
1984 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1985 != SSL_OP_NO_TLSv1){
1986 log_crypto_err("could not set SSL_OP_NO_TLSv1");
1987 SSL_CTX_free(ctx);
1988 return NULL;
1989 }
1990 #endif
1991 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1992 /* if we have tls 1.2 disable 1.1 */
1993 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1994 != SSL_OP_NO_TLSv1_1){
1995 log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
1996 SSL_CTX_free(ctx);
1997 return NULL;
1998 }
1999 #endif
2000 #if defined(SSL_OP_NO_RENEGOTIATION)
2001 /* disable client renegotiation */
2002 if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
2003 SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
2004 log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
2005 SSL_CTX_free(ctx);
2006 return NULL;
2007 }
2008 #endif
2009 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2010 /* only set our own cipher list if no system-wide crypto policy is present */
2011 if (access("/etc/crypto-policies/config", F_OK) != 0) {
2012 /* if we have sha256, set the cipher list to have no known vulns */
2013 if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2014 log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2015 }
2016 #endif
2017 if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2018 SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2019 SSL_OP_CIPHER_SERVER_PREFERENCE) {
2020 log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2021 SSL_CTX_free(ctx);
2022 return NULL;
2023 }
2024 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2025 SSL_CTX_set_security_level(ctx, 0);
2026 #endif
2027 if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2028 log_msg(LOG_ERR, "error for cert file: %s", pem);
2029 log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2030 SSL_CTX_free(ctx);
2031 return NULL;
2032 }
2033 if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2034 log_msg(LOG_ERR, "error for private key file: %s", key);
2035 log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2036 SSL_CTX_free(ctx);
2037 return NULL;
2038 }
2039 if(!SSL_CTX_check_private_key(ctx)) {
2040 log_msg(LOG_ERR, "error for key file: %s", key);
2041 log_crypto_err("Error in SSL_CTX check_private_key");
2042 SSL_CTX_free(ctx);
2043 return NULL;
2044 }
2045 listen_sslctx_setup_2(ctx);
2046 if(verifypem && verifypem[0]) {
2047 if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2048 log_crypto_err("Error in SSL_CTX verify locations");
2049 SSL_CTX_free(ctx);
2050 return NULL;
2051 }
2052 SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2053 SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2054 }
2055 return ctx;
2056 }
2057
2058 SSL_CTX*
2059 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2060 {
2061 char *key, *pem;
2062 SSL_CTX *ctx;
2063
2064 key = nsd->options->tls_service_key;
2065 pem = nsd->options->tls_service_pem;
2066 if(!key || key[0] == 0) {
2067 log_msg(LOG_ERR, "error: no tls-service-key file specified");
2068 return NULL;
2069 }
2070 if(!pem || pem[0] == 0) {
2071 log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2072 return NULL;
2073 }
2074
2075 /* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL but
2076 * draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
2077 ctx = server_tls_ctx_setup(key, pem, verifypem);
2078 if(!ctx) {
2079 log_msg(LOG_ERR, "could not setup server TLS context");
2080 return NULL;
2081 }
2082 if(ocspfile && ocspfile[0]) {
2083 if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2084 log_crypto_err("Error reading OCSPfile");
2085 SSL_CTX_free(ctx);
2086 return NULL;
2087 } else {
2088 VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2089 if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2090 log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2091 SSL_CTX_free(ctx);
2092 return NULL;
2093 }
2094 }
2095 }
2096 return ctx;
2097 }
2098
2099 /* check if the tcp_accept_handler_data is created for the dedicated TLS port */
2100 int
2101 using_tls_port(struct sockaddr* addr, const char* tls_port)
2102 {
2103 in_port_t port = 0;
2104
2105 if (addr->sa_family == AF_INET)
2106 port = ((struct sockaddr_in*)addr)->sin_port;
2107 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2108 else
2109 port = ((struct sockaddr_in6*)addr)->sin6_port;
2110 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2111 if (atoi(tls_port) == ntohs(port))
2112 return 1;
2113
2114 return 0;
2115 }
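/*
 * Usage sketch: at accept setup, add_tcp_handler() below uses this to
 * decide whether a listening socket should speak TLS:
 *   data->tls_accept = using_tls_port(
 *       (struct sockaddr*)&sock->addr.ai_addr, nsd->options->tls_port);
 */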
2116 #endif
2117
2118 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
2119 ssize_t
2120 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2121 {
2122 uint8_t* buf = (uint8_t*) p;
2123 ssize_t total = 0;
2124 struct pollfd fd;
2125 memset(&fd, 0, sizeof(fd));
2126 fd.fd = s;
2127 fd.events = POLLIN;
2128
2129 while( total < sz) {
2130 ssize_t ret;
2131 ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2132 if(ret == -1) {
2133 if(errno == EAGAIN)
2134 /* blocking read */
2135 continue;
2136 if(errno == EINTR) {
2137 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2138 return -1;
2139 /* other signals can be handled later */
2140 continue;
2141 }
2142 /* some error */
2143 return -1;
2144 }
2145 if(ret == 0) {
2146 /* operation timed out */
2147 return -2;
2148 }
2149 ret = read(s, buf+total, sz-total);
2150 if(ret == -1) {
2151 if(errno == EAGAIN)
2152 /* blocking read */
2153 continue;
2154 if(errno == EINTR) {
2155 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2156 return -1;
2157 /* other signals can be handled later */
2158 continue;
2159 }
2160 /* some error */
2161 return -1;
2162 }
2163 if(ret == 0) {
2164 /* closed connection! */
2165 return 0;
2166 }
2167 total += ret;
2168 }
2169 return total;
2170 }
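/*
 * Typical use: read a single ipc command,
 *   sig_atomic_t cmd;
 *   if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd))
 *       ...error...
 * With timeout 0 it doubles as a nonblocking check for a pending command,
 * as reload_process_tasks below demonstrates.
 */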
2171
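/*
 * Walk the task list that xfrd queued in the shared task udb: detach each
 * entry, process it, and append results for xfrd after last_task. The
 * cmdsocket is polled (timeout 0) on every iteration so an NSD_QUIT from
 * the old main process aborts the reload promptly and the transfer files
 * of unprocessed tasks are cleaned up.
 */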
2172 static void
2173 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2174 {
2175 sig_atomic_t cmd = NSD_QUIT_SYNC;
2176 udb_ptr t, next;
2177 udb_base* u = nsd->task[nsd->mytask];
2178 udb_ptr_init(&next, u);
2179 udb_ptr_new(&t, u, udb_base_get_userdata(u));
2180 udb_base_set_userdata(u, 0);
2181 while(!udb_ptr_is_null(&t)) {
2182 /* store next in list so this one can be deleted or reused */
2183 udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2184 udb_rptr_zero(&TASKLIST(&t)->next, u);
2185
2186 /* process task t */
2187 /* append results for task t and update last_task */
2188 task_process_in_reload(nsd, u, last_task, &t);
2189
2190 /* go to next */
2191 udb_ptr_set_ptr(&t, u, &next);
2192
2193 /* if the parent has quit, we must quit too, poll the fd for cmds */
2194 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2195 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2196 if(cmd == NSD_QUIT) {
2197 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2198 /* sync to disk (if needed) */
2199 udb_base_sync(nsd->db->udb, 0);
2200 /* unlink files of remainder of tasks */
2201 while(!udb_ptr_is_null(&t)) {
2202 if(TASKLIST(&t)->task_type == task_apply_xfr) {
2203 xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2204 }
2205 udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2206 }
2207 udb_ptr_unlink(&t, u);
2208 udb_ptr_unlink(&next, u);
2209 exit(0);
2210 }
2211 }
2212
2213 }
2214 udb_ptr_unlink(&t, u);
2215 udb_ptr_unlink(&next, u);
2216 }
2217
2218 #ifdef BIND8_STATS
2219 static void
2220 parent_send_stats(struct nsd* nsd, int cmdfd)
2221 {
2222 size_t i;
2223 if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2224 log_msg(LOG_ERR, "could not write stats to reload");
2225 return;
2226 }
2227 for(i=0; i<nsd->child_count; i++)
2228 if(!write_socket(cmdfd, &nsd->children[i].query_count,
2229 sizeof(stc_type))) {
2230 log_msg(LOG_ERR, "could not write stats to reload");
2231 return;
2232 }
2233 }
2234
2235 static void
2236 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2237 {
2238 struct nsdst s;
2239 stc_type* p;
2240 size_t i;
2241 if(block_read(nsd, cmdfd, &s, sizeof(s),
2242 RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2243 log_msg(LOG_ERR, "could not read stats from oldpar");
2244 return;
2245 }
2246 s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2247 s.db_mem = region_get_mem(nsd->db->region);
2248 p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2249 nsd->child_count);
2250 if(!p) return;
2251 for(i=0; i<nsd->child_count; i++) {
2252 if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2253 sizeof(stc_type))
2254 return;
2255 }
2256 }
2257 #endif /* BIND8_STATS */
2258
2259 void server_verify(struct nsd *nsd, int cmdsocket);
2260
2261 /*
2262 * Reload the database, stop the parent, re-fork the children and
2263 * continue as server_main.
2264 */
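/*
 * Sequence, as implemented below: process the queued tasks, optionally run
 * the zone verifiers, emit soainfo tasks for changed zones, fork fresh
 * serve children, perform the NSD_QUIT_SYNC handshake with the old main
 * process, forward its statistics, and finally send NSD_RELOAD_DONE (plus
 * our pid) to xfrd so it picks up the task udb.
 */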
2265 static void
2266 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2267 int cmdsocket)
2268 {
2269 pid_t mypid;
2270 sig_atomic_t cmd = NSD_QUIT_SYNC;
2271 int ret;
2272 udb_ptr last_task;
2273 struct sigaction old_sigchld, ign_sigchld;
2274 struct radnode* node;
2275 zone_type* zone;
2276 enum soainfo_hint hint;
2277 /* ignore SIGCHLD from the previous server_main that used this pid */
2278 memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2279 ign_sigchld.sa_handler = SIG_IGN;
2280 sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2281
2282 #ifdef HAVE_SETPROCTITLE
2283 setproctitle("main");
2284 #endif
2285 #ifdef HAVE_CPUSET_T
2286 if(nsd->use_cpu_affinity) {
2287 set_cpu_affinity(nsd->cpuset);
2288 }
2289 #endif
2290
2291 /* see what tasks we got from xfrd */
2292 task_remap(nsd->task[nsd->mytask]);
2293 udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2294 udb_compact_inhibited(nsd->db->udb, 1);
2295 reload_process_tasks(nsd, &last_task, cmdsocket);
2296 udb_compact_inhibited(nsd->db->udb, 0);
2297 udb_compact(nsd->db->udb);
2298
2299 #ifndef NDEBUG
2300 if(nsd_debug_level >= 1)
2301 region_log_stats(nsd->db->region);
2302 #endif /* NDEBUG */
2303 /* sync to disk (if needed) */
2304 udb_base_sync(nsd->db->udb, 0);
2305
2306 initialize_dname_compression_tables(nsd);
2307
2308 #ifdef BIND8_STATS
2309 /* Restart dumping stats if required. */
2310 time(&nsd->st.boot);
2311 set_bind8_alarm(nsd);
2312 #endif
2313 #ifdef USE_ZONE_STATS
2314 server_zonestat_realloc(nsd); /* realloc for new children */
2315 server_zonestat_switch(nsd);
2316 #endif
2317
2318 if(nsd->options->verify_enable) {
2319 #ifdef RATELIMIT
2320 /* allocate resources for rate limiting. use a slot that is guaranteed
2321 not mapped to a file so no persistent data is overwritten */
2322 rrl_init(nsd->child_count + 1);
2323 #endif
2324
2325 /* spin-up server and execute verifiers for each zone */
2326 server_verify(nsd, cmdsocket);
2327 #ifdef RATELIMIT
2328 /* deallocate rate limiting resources */
2329 rrl_deinit(nsd->child_count + 1);
2330 #endif
2331 }
2332
2333 for(node = radix_first(nsd->db->zonetree);
2334 node != NULL;
2335 node = radix_next(node))
2336 {
2337 zone = (zone_type *)node->elem;
2338 if(zone->is_updated) {
2339 if(zone->is_bad) {
2340 nsd->mode = NSD_RELOAD_FAILED;
2341 hint = soainfo_bad;
2342 } else {
2343 hint = soainfo_ok;
2344 }
2345 /* update(s), verified or not, possibly with subsequent
2346 skipped update(s). skipped update(s) are picked up
2347 by failed update check in xfrd */
2348 task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2349 zone, hint);
2350 } else if(zone->is_skipped) {
2351 /* corrupt or inconsistent update without preceding
2352 update(s), communicate soainfo_gone */
2353 task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2354 zone, soainfo_gone);
2355 }
2356 zone->is_updated = 0;
2357 zone->is_skipped = 0;
2358 }
2359
2360 if(nsd->mode == NSD_RELOAD_FAILED) {
2361 exit(NSD_RELOAD_FAILED);
2362 }
2363
2364 /* listen for the signals of failed children again */
2365 sigaction(SIGCHLD, &old_sigchld, NULL);
2366 #ifdef USE_DNSTAP
2367 if (nsd->dt_collector) {
2368 int *swap_fd_send;
2369 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2370 /* Swap fd_send with fd_swap so the old serve children and the new
2371 * serve children will not write to the same pipe ends simultaneously */
2372 swap_fd_send = nsd->dt_collector_fd_send;
2373 nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2374 nsd->dt_collector_fd_swap = swap_fd_send;
2375
2376 }
2377 #endif
2378 /* Start new child processes */
2379 if (server_start_children(nsd, server_region, netio, &nsd->
2380 xfrd_listener->fd) != 0) {
2381 send_children_quit(nsd);
2382 exit(1);
2383 }
2384
2385 /* if the parent has quit, we must quit too, poll the fd for cmds */
2386 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2387 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2388 if(cmd == NSD_QUIT) {
2389 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2390 send_children_quit(nsd);
2391 exit(0);
2392 }
2393 }
2394
2395 /* Send quit command to parent: blocking, wait for receipt. */
2396 do {
2397 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2398 if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2399 {
2400 log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2401 strerror(errno));
2402 }
2403 /* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2404 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2405 ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2406 RELOAD_SYNC_TIMEOUT);
2407 if(ret == -2) {
2408 DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2409 }
2410 } while (ret == -2);
2411 if(ret == -1) {
2412 log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2413 strerror(errno));
2414 }
2415 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2416 if(cmd == NSD_QUIT) {
2417 /* small race condition possible here, parent got quit cmd. */
2418 send_children_quit(nsd);
2419 exit(1);
2420 }
2421 assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2422 #ifdef BIND8_STATS
2423 reload_do_stats(cmdsocket, nsd, &last_task);
2424 #endif
2425 udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2426 task_process_sync(nsd->task[nsd->mytask]);
2427 #ifdef USE_ZONE_STATS
2428 server_zonestat_realloc(nsd); /* realloc for next children */
2429 #endif
2430
2431 /* send soainfo to the xfrd process, signal it that reload is done,
2432 * it picks up the taskudb */
2433 cmd = NSD_RELOAD_DONE;
2434 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2435 log_msg(LOG_ERR, "problems sending reload_done to xfrd: %s",
2436 strerror(errno));
2437 }
2438 mypid = getpid();
2439 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
2440 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2441 strerror(errno));
2442 }
2443
2444 /* try to reopen file */
2445 if (nsd->file_rotation_ok)
2446 log_reopen(nsd->log_filename, 1);
2447 /* exit reload, continue as new server_main */
2448 }
2449
2450 /*
2451 * Get the mode depending on the signal hints that have been received.
2452 * Multiple signal hints can be received and will be handled in turn.
2453 */
2454 static sig_atomic_t
2455 server_signal_mode(struct nsd *nsd)
2456 {
2457 if(nsd->signal_hint_quit) {
2458 nsd->signal_hint_quit = 0;
2459 return NSD_QUIT;
2460 }
2461 else if(nsd->signal_hint_shutdown) {
2462 nsd->signal_hint_shutdown = 0;
2463 return NSD_SHUTDOWN;
2464 }
2465 else if(nsd->signal_hint_child) {
2466 nsd->signal_hint_child = 0;
2467 return NSD_REAP_CHILDREN;
2468 }
2469 else if(nsd->signal_hint_reload) {
2470 nsd->signal_hint_reload = 0;
2471 return NSD_RELOAD;
2472 }
2473 else if(nsd->signal_hint_reload_hup) {
2474 nsd->signal_hint_reload_hup = 0;
2475 return NSD_RELOAD_REQ;
2476 }
2477 else if(nsd->signal_hint_stats) {
2478 nsd->signal_hint_stats = 0;
2479 #ifdef BIND8_STATS
2480 set_bind8_alarm(nsd);
2481 #endif
2482 return NSD_STATS;
2483 }
2484 else if(nsd->signal_hint_statsusr) {
2485 nsd->signal_hint_statsusr = 0;
2486 return NSD_STATS;
2487 }
2488 return NSD_RUN;
2489 }
2490
2491 /*
2492 * The main server simply waits for signals and child processes to
2493 * terminate. Child processes are restarted as necessary.
2494 */
2495 void
2496 server_main(struct nsd *nsd)
2497 {
2498 region_type *server_region = region_create(xalloc, free);
2499 netio_type *netio = netio_create(server_region);
2500 netio_handler_type reload_listener;
2501 int reload_sockets[2] = {-1, -1};
2502 struct timespec timeout_spec;
2503 int status;
2504 pid_t child_pid;
2505 pid_t reload_pid = -1;
2506 sig_atomic_t mode;
2507
2508 /* Ensure we are the main process */
2509 assert(nsd->server_kind == NSD_SERVER_MAIN);
2510
2511 /* Add listener for the XFRD process */
2512 netio_add_handler(netio, nsd->xfrd_listener);
2513
2514 /* Start the child processes that handle incoming queries */
2515 if (server_start_children(nsd, server_region, netio,
2516 &nsd->xfrd_listener->fd) != 0) {
2517 send_children_quit(nsd);
2518 exit(1);
2519 }
2520 reload_listener.fd = -1;
2521
2522 /* This_child MUST be 0, because this is the parent process */
2523 assert(nsd->this_child == 0);
2524
2525 /* Run the server until we get a shutdown signal */
2526 while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2527 /* Did we receive a signal that changes our mode? */
2528 if(mode == NSD_RUN) {
2529 nsd->mode = mode = server_signal_mode(nsd);
2530 }
2531
2532 switch (mode) {
2533 case NSD_RUN:
2534 /* see if any child processes terminated */
2535 while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2536 int is_child = delete_child_pid(nsd, child_pid);
2537 if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2538 if(nsd->children[is_child].child_fd == -1)
2539 nsd->children[is_child].has_exited = 1;
2540 parent_check_all_children_exited(nsd);
2541 } else if(is_child != -1) {
2542 log_msg(LOG_WARNING,
2543 "server %d died unexpectedly with status %d, restarting",
2544 (int) child_pid, status);
2545 restart_child_servers(nsd, server_region, netio,
2546 &nsd->xfrd_listener->fd);
2547 } else if (child_pid == reload_pid) {
2548 sig_atomic_t cmd = NSD_RELOAD_FAILED;
2549 pid_t mypid;
2550 log_msg(LOG_WARNING,
2551 "Reload process %d failed with status %d, continuing with old database",
2552 (int) child_pid, status);
2553 reload_pid = -1;
2554 if(reload_listener.fd != -1) close(reload_listener.fd);
2555 netio_remove_handler(netio, &reload_listener);
2556 reload_listener.fd = -1;
2557 reload_listener.event_types = NETIO_EVENT_NONE;
2558 task_process_sync(nsd->task[nsd->mytask]);
2559 /* inform xfrd reload attempt ended */
2560 if(!write_socket(nsd->xfrd_listener->fd,
2561 &cmd, sizeof(cmd))) {
2562 log_msg(LOG_ERR, "problems "
2563 "sending SOAEND to xfrd: %s",
2564 strerror(errno));
2565 }
2566 mypid = getpid();
2567 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
2568 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2569 strerror(errno));
2570 }
2571 #ifdef USE_DNSTAP
2572 } else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2573 log_msg(LOG_WARNING,
2574 "dnstap-collector %d terminated with status %d",
2575 (int) child_pid, status);
2576 if(nsd->dt_collector) {
2577 dt_collector_close(nsd->dt_collector, nsd);
2578 dt_collector_destroy(nsd->dt_collector, nsd);
2579 nsd->dt_collector = NULL;
2580 }
2581 /* Only respawn a crashed (or exited)
2582 * dnstap-collector when not reloading,
2583 * to not induce a reload during a
2584 * reload (which would seriously
2585 * disrupt nsd procedures and lead to
2586 * unpredictable results)!
2587 *
2588 * This will *leave* a dnstap-collector
2589 * process terminated, but because
2590 * signalling of the reload process to
2591 * the main process to respawn in this
2592 * situation will be cumbersome, and
2593 * because this situation is so
2594 * specific (and therefore hopefully
2595 * extremely rare or non-existing at
2596 * all), plus the fact that we are left
2597 * with a perfectly functioning NSD
2598 * (besides not logging dnstap
2599 * messages), I consider it acceptable
2600 * to leave this unresolved.
2601 */
2602 if(reload_pid == -1 && nsd->options->dnstap_enable) {
2603 nsd->dt_collector = dt_collector_create(nsd);
2604 dt_collector_start(nsd->dt_collector, nsd);
2605 nsd->mode = NSD_RELOAD_REQ;
2606 }
2607 #endif
2608 } else if(status != 0) {
2609 /* check the status: we may reap the
2610 * old server-main here, because reload
2611 * is the process parent of old-main,
2612 * and also older server processes
2613 * that are exiting after a reload */
2614 log_msg(LOG_WARNING,
2615 "process %d terminated with status %d",
2616 (int) child_pid, status);
2617 }
2618 }
2619 if (child_pid == -1) {
2620 if (errno == EINTR) {
2621 continue;
2622 }
2623 if (errno != ECHILD)
2624 log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2625 }
2626 if (nsd->mode != NSD_RUN)
2627 break;
2628
2629 /* timeout to collect processes. In case no sigchild happens. */
2630 timeout_spec.tv_sec = 60;
2631 timeout_spec.tv_nsec = 0;
2632
2633 /* listen on ports, timeout for collecting terminated children */
2634 if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2635 if (errno != EINTR) {
2636 log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2637 }
2638 }
2639 if(nsd->restart_children) {
2640 restart_child_servers(nsd, server_region, netio,
2641 &nsd->xfrd_listener->fd);
2642 nsd->restart_children = 0;
2643 }
2644 if(nsd->reload_failed) {
2645 sig_atomic_t cmd = NSD_RELOAD_FAILED;
2646 pid_t mypid;
2647 nsd->reload_failed = 0;
2648 log_msg(LOG_WARNING,
2649 "Reload process %d failed, continuing with old database",
2650 (int) reload_pid);
2651 reload_pid = -1;
2652 if(reload_listener.fd != -1) close(reload_listener.fd);
2653 netio_remove_handler(netio, &reload_listener);
2654 reload_listener.fd = -1;
2655 reload_listener.event_types = NETIO_EVENT_NONE;
2656 task_process_sync(nsd->task[nsd->mytask]);
2657 /* inform xfrd reload attempt ended */
2658 if(!write_socket(nsd->xfrd_listener->fd,
2659 &cmd, sizeof(cmd))) {
2660 log_msg(LOG_ERR, "problems "
2661 "sending SOAEND to xfrd: %s",
2662 strerror(errno));
2663 }
2664 mypid = getpid();
2665 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
2666 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2667 strerror(errno));
2668 }
2669 }
2670
2671 break;
2672 case NSD_RELOAD_REQ: {
2673 sig_atomic_t cmd = NSD_RELOAD_REQ;
2674 log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2675 DEBUG(DEBUG_IPC,1, (LOG_INFO,
2676 "main: ipc send reload_req to xfrd"));
2677 if(!write_socket(nsd->xfrd_listener->fd,
2678 &cmd, sizeof(cmd))) {
2679 log_msg(LOG_ERR, "server_main: could not send "
2680 "reload_req to xfrd: %s", strerror(errno));
2681 }
2682 nsd->mode = NSD_RUN;
2683 } break;
2684 case NSD_RELOAD:
2685 /* Continue to run nsd after reload */
2686 nsd->mode = NSD_RUN;
2687 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2688 if (reload_pid != -1) {
2689 log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2690 (int) reload_pid);
2691 break;
2692 }
2693
2694 /* switch mytask to keep track of who owns the task udb */
2695 nsd->mytask = 1 - nsd->mytask;
2696 if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2697 log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2698 reload_pid = -1;
2699 break;
2700 }
2701
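/* Note the role inversion below: after fork() the parent becomes the
 * reload process (and later the new main) via server_reload(), while
 * the forked child keeps running this loop as the old main until it
 * receives NSD_QUIT_SYNC from the reload. */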
2702 /* Do actual reload */
2703 reload_pid = fork();
2704 switch (reload_pid) {
2705 case -1:
2706 log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2707 break;
2708 default:
2709 /* PARENT */
2710 close(reload_sockets[0]);
2711 server_reload(nsd, server_region, netio,
2712 reload_sockets[1]);
2713 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2714 close(reload_sockets[1]);
2715 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2716 /* drop stale xfrd ipc data */
2717 ((struct ipc_handler_conn_data*)nsd->
2718 xfrd_listener->user_data)
2719 ->conn->is_reading = 0;
2720 reload_pid = -1;
2721 reload_listener.fd = -1;
2722 reload_listener.event_types = NETIO_EVENT_NONE;
2723 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2724 break;
2725 case 0:
2726 /* CHILD */
2727 /* server_main keeps running until NSD_QUIT_SYNC
2728 * is received from reload. */
2729 close(reload_sockets[1]);
2730 reload_listener.fd = reload_sockets[0];
2731 reload_listener.timeout = NULL;
2732 reload_listener.user_data = nsd;
2733 reload_listener.event_types = NETIO_EVENT_READ;
2734 reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2735 netio_add_handler(netio, &reload_listener);
2736 reload_pid = getppid();
2737 break;
2738 }
2739 break;
2740 case NSD_QUIT_SYNC:
2741 /* synchronisation of xfrd, parent and reload */
2742 if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2743 sig_atomic_t cmd = NSD_RELOAD;
2744 /* stop xfrd ipc writes in progress */
2745 DEBUG(DEBUG_IPC,1, (LOG_INFO,
2746 "main: ipc send indication reload"));
2747 if(!write_socket(nsd->xfrd_listener->fd,
2748 &cmd, sizeof(cmd))) {
2749 log_msg(LOG_ERR, "server_main: could not send reload "
2750 "indication to xfrd: %s", strerror(errno));
2751 }
2752 /* wait for ACK from xfrd */
2753 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2754 nsd->quit_sync_done = 1;
2755 }
2756 nsd->mode = NSD_RUN;
2757 break;
2758 case NSD_QUIT:
2759 /* silent shutdown during reload */
2760 if(reload_listener.fd != -1) {
2761 /* acknowledge the quit, to sync reload that we will really quit now */
2762 sig_atomic_t cmd = NSD_RELOAD;
2763 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2764 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2765 log_msg(LOG_ERR, "server_main: "
2766 "could not ack quit: %s", strerror(errno));
2767 }
2768 #ifdef BIND8_STATS
2769 parent_send_stats(nsd, reload_listener.fd);
2770 #endif /* BIND8_STATS */
2771 close(reload_listener.fd);
2772 }
2773 DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2774 /* only quit children after xfrd has acked */
2775 send_children_quit(nsd);
2776
2777 #ifdef MEMCLEAN /* OS collects memory pages */
2778 region_destroy(server_region);
2779 #endif
2780 server_shutdown(nsd);
2781
2782 /* ENOTREACH */
2783 break;
2784 case NSD_SHUTDOWN:
2785 break;
2786 case NSD_REAP_CHILDREN:
2787 /* continue; wait for child in run loop */
2788 nsd->mode = NSD_RUN;
2789 break;
2790 case NSD_STATS:
2791 #ifdef BIND8_STATS
2792 set_children_stats(nsd);
2793 #endif
2794 nsd->mode = NSD_RUN;
2795 break;
2796 default:
2797 log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2798 nsd->mode = NSD_RUN;
2799 break;
2800 }
2801 }
2802 log_msg(LOG_WARNING, "signal received, shutting down...");
2803
2804 /* close opened ports to avoid race with restart of nsd */
2805 server_close_all_sockets(nsd->udp, nsd->ifs);
2806 server_close_all_sockets(nsd->tcp, nsd->ifs);
2807 #ifdef HAVE_SSL
2808 daemon_remote_close(nsd->rc);
2809 #endif
2810 send_children_quit_and_wait(nsd);
2811
2812 /* Unlink it if possible... */
2813 unlinkpid(nsd->pidfile);
2814 unlink(nsd->task[0]->fname);
2815 unlink(nsd->task[1]->fname);
2816 #ifdef USE_ZONE_STATS
2817 unlink(nsd->zonestatfname[0]);
2818 unlink(nsd->zonestatfname[1]);
2819 #endif
2820 #ifdef USE_DNSTAP
2821 dt_collector_close(nsd->dt_collector, nsd);
2822 #endif
2823
2824 if(reload_listener.fd != -1) {
2825 sig_atomic_t cmd = NSD_QUIT;
2826 DEBUG(DEBUG_IPC,1, (LOG_INFO,
2827 "main: ipc send quit to reload-process"));
2828 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2829 log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2830 strerror(errno));
2831 }
2832 fsync(reload_listener.fd);
2833 close(reload_listener.fd);
2834 /* wait for reload to finish processing */
2835 while(1) {
2836 if(waitpid(reload_pid, NULL, 0) == -1) {
2837 if(errno == EINTR) continue;
2838 if(errno == ECHILD) break;
2839 log_msg(LOG_ERR, "waitpid(reload %d): %s",
2840 (int)reload_pid, strerror(errno));
2841 }
2842 break;
2843 }
2844 }
2845 if(nsd->xfrd_listener->fd != -1) {
2846 /* complete quit, stop xfrd */
2847 sig_atomic_t cmd = NSD_QUIT;
2848 DEBUG(DEBUG_IPC,1, (LOG_INFO,
2849 "main: ipc send quit to xfrd"));
2850 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2851 log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2852 strerror(errno));
2853 }
2854 fsync(nsd->xfrd_listener->fd);
2855 close(nsd->xfrd_listener->fd);
2856 (void)kill(nsd->pid, SIGTERM);
2857 }
2858
2859 #ifdef MEMCLEAN /* OS collects memory pages */
2860 region_destroy(server_region);
2861 #endif
2862 /* write the nsd.db to disk, wait for it to complete */
2863 udb_base_sync(nsd->db->udb, 1);
2864 udb_base_close(nsd->db->udb);
2865 server_shutdown(nsd);
2866 }
2867
2868 static query_state_type
2869 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
2870 {
2871 return query_process(query, nsd, now_p);
2872 }
2873
2874 static query_state_type
2875 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
2876 {
2877 #ifdef RATELIMIT
2878 if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
2879 if(query->edns.cookie_status != COOKIE_VALID
2880 && query->edns.cookie_status != COOKIE_VALID_REUSE
2881 && rrl_process_query(query))
2882 return rrl_slip(query);
2883 else return QUERY_PROCESSED;
2884 }
2885 return QUERY_DISCARDED;
2886 #else
2887 return query_process(query, nsd, now_p);
2888 #endif
2889 }
2890
2891 const char*
2892 nsd_event_vs(void)
2893 {
2894 #ifdef USE_MINI_EVENT
2895 return "";
2896 #else
2897 return event_get_version();
2898 #endif
2899 }
2900
2901 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2902 static const char* ub_ev_backend2str(int b)
2903 {
2904 switch(b) {
2905 case EVBACKEND_SELECT: return "select";
2906 case EVBACKEND_POLL: return "poll";
2907 case EVBACKEND_EPOLL: return "epoll";
2908 case EVBACKEND_KQUEUE: return "kqueue";
2909 case EVBACKEND_DEVPOLL: return "devpoll";
2910 case EVBACKEND_PORT: return "evport";
2911 }
2912 return "unknown";
2913 }
2914 #endif
2915
2916 const char*
2917 nsd_event_method(void)
2918 {
2919 #ifdef USE_MINI_EVENT
2920 return "select";
2921 #else
2922 struct event_base* b = nsd_child_event_base();
2923 const char* m = "?";
2924 # ifdef EV_FEATURE_BACKENDS
2925 m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
2926 # elif defined(HAVE_EVENT_BASE_GET_METHOD)
2927 m = event_base_get_method(b);
2928 # endif
2929 # ifdef MEMCLEAN
2930 event_base_free(b);
2931 # endif
2932 return m;
2933 #endif
2934 }
2935
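/*
 * Create the per-process event base. Which implementation backs it is a
 * compile-time choice: the builtin mini_event (select), libev, or
 * libevent; nsd_event_method() above reports the backend actually in use.
 */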
2936 struct event_base*
2937 nsd_child_event_base(void)
2938 {
2939 struct event_base* base;
2940 #ifdef USE_MINI_EVENT
2941 static time_t secs;
2942 static struct timeval now;
2943 base = event_init(&secs, &now);
2944 #else
2945 # if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2946 /* libev */
2947 base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2948 # else
2949 /* libevent */
2950 # ifdef HAVE_EVENT_BASE_NEW
2951 base = event_base_new();
2952 # else
2953 base = event_init();
2954 # endif
2955 # endif
2956 #endif
2957 return base;
2958 }
2959
2960 static void
2961 add_udp_handler(
2962 struct nsd *nsd,
2963 struct nsd_socket *sock,
2964 struct udp_handler_data *data)
2965 {
2966 struct event *handler = &data->event;
2967
2968 data->nsd = nsd;
2969 data->socket = sock;
2970
2971 memset(handler, 0, sizeof(*handler));
2972 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2973 if(event_base_set(nsd->event_base, handler) != 0)
2974 log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2975 if(event_add(handler, NULL) != 0)
2976 log_msg(LOG_ERR, "nsd udp: event_add failed");
2977 }
2978
2979 void
2980 add_tcp_handler(
2981 struct nsd *nsd,
2982 struct nsd_socket *sock,
2983 struct tcp_accept_handler_data *data)
2984 {
2985 struct event *handler = &data->event;
2986
2987 data->nsd = nsd;
2988 data->socket = sock;
2989
2990 #ifdef HAVE_SSL
2991 if (nsd->tls_ctx &&
2992 nsd->options->tls_port &&
2993 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2994 {
2995 data->tls_accept = 1;
2996 if(verbosity >= 2) {
2997 char buf[48];
2998 addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
2999 VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
3000 }
3001 } else {
3002 data->tls_accept = 0;
3003 }
3004 #endif
3005
3006 memset(handler, 0, sizeof(*handler));
3007 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data);
3008 if(event_base_set(nsd->event_base, handler) != 0)
3009 log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
3010 if(event_add(handler, NULL) != 0)
3011 log_msg(LOG_ERR, "nsd tcp: event_add failed");
3012 data->event_added = 1;
3013 }
3014
3015 /*
3016 * Serve DNS requests to verifiers (short-lived)
3017 */
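/*
 * Runs in the reload process: answers queries on the verify_* sockets and
 * spawns up to verifier_limit verifier processes, feeding them zones until
 * every zone to verify is handled (or a quit arrives), then tears the
 * event base and region down again.
 */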
3018 void server_verify(struct nsd *nsd, int cmdsocket)
3019 {
3020 size_t size = 0;
3021 struct event cmd_event, signal_event, exit_event;
3022 struct zone *zone;
3023
3024 assert(nsd != NULL);
3025
3026 zone = verify_next_zone(nsd, NULL);
3027 if(zone == NULL)
3028 return;
3029
3030 nsd->server_region = region_create(xalloc, free);
3031 nsd->event_base = nsd_child_event_base();
3032
3033 nsd->next_zone_to_verify = zone;
3034 nsd->verifier_count = 0;
3035 nsd->verifier_limit = nsd->options->verifier_count;
3036 size = sizeof(struct verifier) * nsd->verifier_limit;
3037 if(pipe(nsd->verifier_pipe) == -1) {
	log_msg(LOG_ERR, "verify: could not create pipe: %s",
		strerror(errno));
	nsd->verifier_pipe[0] = nsd->verifier_pipe[1] = -1;
	goto fail;
}
3038 fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC);
3039 fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC);
3040 nsd->verifiers = region_alloc_zero(nsd->server_region, size);
3041
3042 for(size_t i = 0; i < nsd->verifier_limit; i++) {
3043 nsd->verifiers[i].nsd = nsd;
3044 nsd->verifiers[i].zone = NULL;
3045 nsd->verifiers[i].pid = -1;
3046 nsd->verifiers[i].output_stream.fd = -1;
3047 nsd->verifiers[i].output_stream.priority = LOG_INFO;
3048 nsd->verifiers[i].error_stream.fd = -1;
3049 nsd->verifiers[i].error_stream.priority = LOG_ERR;
3050 }
3051
3052 event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd);
3053 if(event_base_set(nsd->event_base, &cmd_event) != 0 ||
3054 event_add(&cmd_event, NULL) != 0)
3055 {
3056 log_msg(LOG_ERR, "verify: could not add command event");
3057 goto fail;
3058 }
3059
3060 event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd);
3061 if(event_base_set(nsd->event_base, &signal_event) != 0 ||
3062 signal_add(&signal_event, NULL) != 0)
3063 {
3064 log_msg(LOG_ERR, "verify: could not add signal event");
3065 goto fail;
3066 }
3067
3068 event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd);
3069 if(event_base_set(nsd->event_base, &exit_event) != 0 ||
3070 event_add(&exit_event, NULL) != 0)
3071 {
3072 log_msg(LOG_ERR, "verify: could not add exit event");
3073 goto fail;
3074 }
3075
3076 memset(msgs, 0, sizeof(msgs));
3077 for (int i = 0; i < NUM_RECV_PER_SELECT; i++) {
3078 queries[i] = query_create(nsd->server_region,
3079 compressed_dname_offsets,
3080 compression_table_size, compressed_dnames);
3081 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3082 iovecs[i].iov_base = buffer_begin(queries[i]->packet);
3083 iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3084 msgs[i].msg_hdr.msg_iov = &iovecs[i];
3085 msgs[i].msg_hdr.msg_iovlen = 1;
3086 msgs[i].msg_hdr.msg_name = &queries[i]->addr;
3087 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3088 }
3089
3090 for (size_t i = 0; i < nsd->verify_ifs; i++) {
3091 struct udp_handler_data *data;
3092 data = region_alloc_zero(
3093 nsd->server_region, sizeof(*data));
3094 add_udp_handler(nsd, &nsd->verify_udp[i], data);
3095 }
3096
3097 tcp_accept_handler_count = nsd->verify_ifs;
3098 tcp_accept_handlers = region_alloc_array(nsd->server_region,
3099 nsd->verify_ifs, sizeof(*tcp_accept_handlers));
3100
3101 for (size_t i = 0; i < nsd->verify_ifs; i++) {
3102 struct tcp_accept_handler_data *data;
3103 data = &tcp_accept_handlers[i];
3104 memset(data, 0, sizeof(*data));
3105 add_tcp_handler(nsd, &nsd->verify_tcp[i], data);
3106 }
3107
3108 while(nsd->next_zone_to_verify != NULL &&
3109 nsd->verifier_count < nsd->verifier_limit)
3110 {
3111 verify_zone(nsd, nsd->next_zone_to_verify);
3112 nsd->next_zone_to_verify
3113 = verify_next_zone(nsd, nsd->next_zone_to_verify);
3114 }
3115
3116 /* short-lived main loop */
3117 event_base_dispatch(nsd->event_base);
3118
3119 /* remove command and exit event handlers */
3120 event_del(&exit_event);
3121 event_del(&signal_event);
3122 event_del(&cmd_event);
3123
3124 assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT);
3125 assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT);
3126 fail:
3127 event_base_free(nsd->event_base);
3128 close(nsd->verifier_pipe[0]);
3129 close(nsd->verifier_pipe[1]);
3130 region_destroy(nsd->server_region);
3131
3132 nsd->event_base = NULL;
3133 nsd->server_region = NULL;
3134 nsd->verifier_limit = 0;
3135 nsd->verifier_pipe[0] = -1;
3136 nsd->verifier_pipe[1] = -1;
3137 nsd->verifiers = NULL;
3138 }
3139
3140 /*
3141 * Serve DNS requests.
3142 */
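/*
 * With so-reuseport active, each child serves its own slice of the
 * interface list: numifs = nsd->ifs / nsd->reuseport sockets starting at
 * from = numifs * child_num; sockets outside the slice are closed below.
 */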
3143 void
3144 server_child(struct nsd *nsd)
3145 {
3146 size_t i, from, numifs;
3147 region_type *server_region = region_create(xalloc, free);
3148 struct event_base* event_base = nsd_child_event_base();
3149 sig_atomic_t mode;
3150
3151 if(!event_base) {
3152 log_msg(LOG_ERR, "nsd server could not create event base");
3153 exit(1);
3154 }
3155 nsd->event_base = event_base;
3156 nsd->server_region = server_region;
3157
3158 #ifdef RATELIMIT
3159 rrl_init(nsd->this_child->child_num);
3160 #endif
3161
3162 assert(nsd->server_kind != NSD_SERVER_MAIN);
3163 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
3164
3165 #ifdef HAVE_SETPROCTITLE
3166 setproctitle("server %d", nsd->this_child->child_num + 1);
3167 #endif
3168 #ifdef HAVE_CPUSET_T
3169 if(nsd->use_cpu_affinity) {
3170 set_cpu_affinity(nsd->this_child->cpuset);
3171 }
3172 #endif
3173
3174 if (!(nsd->server_kind & NSD_SERVER_TCP)) {
3175 server_close_all_sockets(nsd->tcp, nsd->ifs);
3176 }
3177 if (!(nsd->server_kind & NSD_SERVER_UDP)) {
3178 server_close_all_sockets(nsd->udp, nsd->ifs);
3179 }
3180
3181 if (nsd->this_child->parent_fd != -1) {
3182 struct event *handler;
3183 struct ipc_handler_conn_data* user_data =
3184 (struct ipc_handler_conn_data*)region_alloc(
3185 server_region, sizeof(struct ipc_handler_conn_data));
3186 user_data->nsd = nsd;
3187 user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
3188
3189 handler = (struct event*) region_alloc(
3190 server_region, sizeof(*handler));
3191 memset(handler, 0, sizeof(*handler));
3192 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
3193 EV_READ, child_handle_parent_command, user_data);
3194 if(event_base_set(event_base, handler) != 0)
3195 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
3196 if(event_add(handler, NULL) != 0)
3197 log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
3198 }
3199
3200 if(nsd->reuseport) {
3201 numifs = nsd->ifs / nsd->reuseport;
3202 from = numifs * nsd->this_child->child_num;
3203 if(from+numifs > nsd->ifs) { /* should not happen */
3204 from = 0;
3205 numifs = nsd->ifs;
3206 }
3207 } else {
3208 from = 0;
3209 numifs = nsd->ifs;
3210 }
3211
3212 if (nsd->server_kind & NSD_SERVER_UDP) {
3213 int child = nsd->this_child->child_num;
3214 memset(msgs, 0, sizeof(msgs));
3215 for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3216 queries[i] = query_create(server_region,
3217 compressed_dname_offsets,
3218 compression_table_size, compressed_dnames);
3219 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3220 iovecs[i].iov_base = buffer_begin(queries[i]->packet);
3221 iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3222 msgs[i].msg_hdr.msg_iov = &iovecs[i];
3223 msgs[i].msg_hdr.msg_iovlen = 1;
3224 msgs[i].msg_hdr.msg_name = &queries[i]->addr;
3225 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3226 }
3227
3228 for (i = 0; i < nsd->ifs; i++) {
3229 int listen;
3230 struct udp_handler_data *data;
3231
3232 listen = nsd_bitset_isset(nsd->udp[i].servers, child);
3233
3234 if(i >= from && i < (from + numifs) && listen) {
3235 data = region_alloc_zero(
3236 nsd->server_region, sizeof(*data));
3237 add_udp_handler(nsd, &nsd->udp[i], data);
3238 } else {
3239 /* close sockets intended for other servers */
3240 server_close_socket(&nsd->udp[i]);
3241 }
3242 }
3243 }
3244
3245 /*
3246 * Keep track of all the TCP accept handlers so we can enable
3247 * and disable them based on the current number of active TCP
3248 * connections.
3249 */
3250 if (nsd->server_kind & NSD_SERVER_TCP) {
3251 int child = nsd->this_child->child_num;
3252 tcp_accept_handler_count = numifs;
3253 tcp_accept_handlers = region_alloc_array(server_region,
3254 numifs, sizeof(*tcp_accept_handlers));
3255
3256 for (i = 0; i < nsd->ifs; i++) {
3257 int listen;
3258 struct tcp_accept_handler_data *data;
3259
3260 listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
3261
3262 if(i >= from && i < (from + numifs) && listen) {
3263 data = &tcp_accept_handlers[i-from];
3264 memset(data, 0, sizeof(*data));
3265 add_tcp_handler(nsd, &nsd->tcp[i], data);
3266 } else {
3267 /* close sockets intended for other servers */
3268 /*
3269 * uncomment this once tcp servers are no
3270 * longer copied in the tcp fd copy line
3271 * in server_init().
3272 server_close_socket(&nsd->tcp[i]);
3273 */
3274 /* close sockets not meant for this server*/
3275 if(!listen)
3276 server_close_socket(&nsd->tcp[i]);
3277 }
3278 }
3279 } else {
3280 tcp_accept_handler_count = 0;
3281 }
3282
3283 /* The main loop... */
3284 while ((mode = nsd->mode) != NSD_QUIT) {
3285 if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
3286
3287 /* Do we need to do the statistics... */
3288 if (mode == NSD_STATS) {
3289 #ifdef BIND8_STATS
3290 int p = nsd->st.period;
3291 nsd->st.period = 1; /* force stats printout */
3292 /* Dump the statistics */
3293 bind8_stats(nsd);
3294 nsd->st.period = p;
3295 #else /* !BIND8_STATS */
3296 log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3297 #endif /* BIND8_STATS */
3298
3299 nsd->mode = NSD_RUN;
3300 }
3301 else if (mode == NSD_REAP_CHILDREN) {
3302 /* got signal, notify parent. parent reaps terminated children. */
3303 if (nsd->this_child->parent_fd != -1) {
3304 sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3305 if (write(nsd->this_child->parent_fd,
3306 &parent_notify,
3307 sizeof(parent_notify)) == -1)
3308 {
3309 log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3310 (int) nsd->this_child->pid, strerror(errno));
3311 }
3312 } else /* no parent, so reap 'em */
3313 while (waitpid(-1, NULL, WNOHANG) > 0) ;
3314 nsd->mode = NSD_RUN;
3315 }
3316 else if(mode == NSD_RUN) {
3317 /* Wait for a query... */
3318 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3319 if (errno != EINTR) {
3320 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3321 break;
3322 }
3323 }
3324 } else if(mode == NSD_QUIT) {
3325 /* ignore here, quit */
3326 } else {
3327 log_msg(LOG_ERR, "mode bad value %d, back to service.",
3328 (int)mode);
3329 nsd->mode = NSD_RUN;
3330 }
3331 }
3332
3333 service_remaining_tcp(nsd);
3334 #ifdef BIND8_STATS
3335 bind8_stats(nsd);
3336 #endif /* BIND8_STATS */
3337
3338 #ifdef MEMCLEAN /* OS collects memory pages */
3339 #ifdef RATELIMIT
3340 rrl_deinit(nsd->this_child->child_num);
3341 #endif
3342 event_base_free(event_base);
3343 region_destroy(server_region);
3344 #endif
3345 server_shutdown(nsd);
3346 }
3347
3348 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3349 {
3350 int* timed_out = (int*)arg;
3351 assert(event & EV_TIMEOUT); (void)event;
3352 /* wake up the service tcp thread, note event is no longer
3353 * registered */
3354 *timed_out = 1;
3355 }
3356
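/*
 * Drain the open TCP connections after this child was told to quit:
 * connections may finish their current query (tcp_no_more_queries is set)
 * with their timeout capped at 100 ms, and a one-second helper timer keeps
 * polling the signal mode so a quit or shutdown still interrupts the
 * drain.
 */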
3357 void
3358 service_remaining_tcp(struct nsd* nsd)
3359 {
3360 struct tcp_handler_data* p;
3361 struct event_base* event_base;
3362 /* check if it is needed */
3363 if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3364 return;
3365 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3366 #ifdef USE_DNSTAP
3367 /* remove dnstap collector, we cannot write there because the new
3368 * child process is using the file descriptor, or the child
3369 * process after that. */
3370 dt_collector_destroy(nsd->dt_collector, nsd);
3371 nsd->dt_collector = NULL;
3372 #endif
3373 /* setup event base */
3374 event_base = nsd_child_event_base();
3375 if(!event_base) {
3376 log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3377 return;
3378 }
3379 /* register tcp connections */
3380 for(p = tcp_active_list; p != NULL; p = p->next) {
3381 struct timeval timeout;
3382 int fd = p->event.ev_fd;
3383 #ifdef USE_MINI_EVENT
3384 short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3385 #else
3386 short event = p->event.ev_events & (EV_READ|EV_WRITE);
3387 #endif
3388 void (*fn)(int, short, void*);
3389 #ifdef HAVE_SSL
3390 if(p->tls) {
3391 if((event&EV_READ))
3392 fn = handle_tls_reading;
3393 else fn = handle_tls_writing;
3394 } else {
3395 #endif
3396 if((event&EV_READ))
3397 fn = handle_tcp_reading;
3398 else fn = handle_tcp_writing;
3399 #ifdef HAVE_SSL
3400 }
3401 #endif
3402
3403 p->tcp_no_more_queries = 1;
3404 /* set timeout to 1/10 second */
3405 if(p->tcp_timeout > 100)
3406 p->tcp_timeout = 100;
3407 timeout.tv_sec = p->tcp_timeout / 1000;
3408 timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3409 event_del(&p->event);
3410 memset(&p->event, 0, sizeof(p->event));
3411 event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3412 fn, p);
3413 if(event_base_set(event_base, &p->event) != 0)
3414 log_msg(LOG_ERR, "event base set failed");
3415 if(event_add(&p->event, &timeout) != 0)
3416 log_msg(LOG_ERR, "event add failed");
3417 }
3418
3419 /* handle it */
3420 while(nsd->current_tcp_count > 0) {
3421 mode_t m = server_signal_mode(nsd);
3422 struct event timeout;
3423 struct timeval tv;
3424 int timed_out = 0;
3425 if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3426 m == NSD_REAP_CHILDREN) {
3427 /* quit */
3428 break;
3429 }
3430 /* timer */
3431 /* have to do something every second */
3432 tv.tv_sec = 1;
3433 tv.tv_usec = 0;
3434 memset(&timeout, 0, sizeof(timeout));
3435 event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3436 &timed_out);
3437 if(event_base_set(event_base, &timeout) != 0)
3438 log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3439 if(event_add(&timeout, &tv) != 0)
3440 log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3441
3442 /* service loop */
3443 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3444 if (errno != EINTR) {
3445 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3446 break;
3447 }
3448 }
3449 if(!timed_out) {
3450 event_del(&timeout);
3451 } else {
3452 /* timed out, quit */
3453 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3454 break;
3455 }
3456 }
3457 #ifdef MEMCLEAN
3458 event_base_free(event_base);
3459 #endif
3460 /* continue to quit after return */
3461 }
3462
3463 /* Implement recvmmsg and sendmmsg if the platform does not. These functions
3464 * are always used, even if nonblocking operations are broken, in which case
3465 * NUM_RECV_PER_SELECT is defined to 1 (one).
3466 */
3467 #if defined(HAVE_RECVMMSG)
3468 #define nsd_recvmmsg recvmmsg
3469 #else /* !HAVE_RECVMMSG */
3470
3471 static int
3472 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3473 int flags, struct timespec *timeout)
3474 {
3475 unsigned int vpos = 0;
3476 ssize_t rcvd;
3477
3478 /* timeout is ignored, ensure caller does not expect it to work */
3479 assert(timeout == NULL); (void)timeout;
3480
3481 while(vpos < vlen) {
3482 rcvd = recvfrom(sockfd,
3483 msgvec[vpos].msg_hdr.msg_iov->iov_base,
3484 msgvec[vpos].msg_hdr.msg_iov->iov_len,
3485 flags,
3486 msgvec[vpos].msg_hdr.msg_name,
3487 &msgvec[vpos].msg_hdr.msg_namelen);
3488 if(rcvd < 0) {
3489 break;
3490 } else {
3491 assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3492 msgvec[vpos].msg_len = (unsigned int)rcvd;
3493 vpos++;
3494 }
3495 }
3496
3497 if(vpos) {
3498 /* error will be picked up next time */
3499 return (int)vpos;
3500 } else if(errno == 0) {
3501 return 0;
3502 } else if(errno == EAGAIN) {
3503 return 0;
3504 }
3505
3506 return -1;
3507 }
3508 #endif /* HAVE_RECVMMSG */
3509
3510 #ifdef HAVE_SENDMMSG
3511 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3512 #else /* !HAVE_SENDMMSG */
3513
3514 static int
3515 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3516 {
3517 unsigned int vpos = 0;
3518 ssize_t snd;
3519
3520 while(vpos < vlen) {
3521 assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3522 snd = sendto(sockfd,
3523 msgvec[vpos].msg_hdr.msg_iov->iov_base,
3524 msgvec[vpos].msg_hdr.msg_iov->iov_len,
3525 flags,
3526 msgvec[vpos].msg_hdr.msg_name,
3527 msgvec[vpos].msg_hdr.msg_namelen);
3528 if(snd < 0) {
3529 break;
3530 } else {
3531 msgvec[vpos].msg_len = (unsigned int)snd;
3532 vpos++;
3533 }
3534 }
3535
3536 if(vpos) {
3537 return (int)vpos;
3538 } else if(errno == 0) {
3539 return 0;
3540 }
3541
3542 return -1;
3543 }
3544 #endif /* HAVE_SENDMMSG */
3545
3546 static int
3547 port_is_zero(
3548 #ifdef INET6
3549 struct sockaddr_storage *addr
3550 #else
3551 struct sockaddr_in *addr
3552 #endif
3553 )
3554 {
3555 #ifdef INET6
3556 if(addr->ss_family == AF_INET6) {
3557 return (((struct sockaddr_in6 *)addr)->sin6_port) == 0;
3558 } else if(addr->ss_family == AF_INET) {
3559 return (((struct sockaddr_in *)addr)->sin_port) == 0;
3560 }
3561 return 0;
3562 #else
3563 if(addr->sin_family == AF_INET) {
3564 return addr->sin_port == 0;
3565 }
3566 return 0;
3567 #endif
3568 }
3569
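/* Handle readiness on a UDP socket: receive a batch of queries with
 * nsd_recvmmsg(), process each one (dropped queries are swapped to the tail
 * of the batch so the head stays contiguous), then transmit all answers
 * with nsd_sendmmsg(). */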
static void
handle_udp(int fd, short event, void* arg)
{
	struct udp_handler_data *data = (struct udp_handler_data *) arg;
	int received, sent, recvcount, i;
	struct query *q;
	uint32_t now = 0;

	if (!(event & EV_READ)) {
		return;
	}
	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	/* this printf strangely gave a performance increase on Linux */
	/* printf("recvcount %d \n", recvcount); */
	if (recvcount == -1) {
		if (errno != EAGAIN && errno != EINTR) {
			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
		}
		/* Simply no data available */
		return;
	}
	for (i = 0; i < recvcount; i++) {
	loopstart:
		received = msgs[i].msg_len;
		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
		q = queries[i];
		if (received == -1) {
			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
#if defined(HAVE_RECVMMSG)
				msgs[i].msg_hdr.msg_flags
#else
				errno
#endif
				));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
			goto swap_drop;
		}

		/* Account... */
#ifdef BIND8_STATS
		if (data->socket->addr.ai_family == AF_INET) {
			STATUP(data->nsd, qudp);
		} else if (data->socket->addr.ai_family == AF_INET6) {
			STATUP(data->nsd, qudp6);
		}
#endif

		buffer_skip(q->packet, received);
		buffer_flip(q->packet);
#ifdef USE_DNSTAP
		/*
		 * sending UDP-query with server address (local) and client address to dnstap process
		 */
		log_addr("query from client", &q->addr);
		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->addr, q->addrlen,
			q->tcp, q->packet);
#endif /* USE_DNSTAP */

		/* Process and answer the query... */
		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
				STATUP(data->nsd, nona);
				ZTATUP(data->nsd, q->zone, nona);
			}

#ifdef USE_ZONE_STATS
			if (data->socket->addr.ai_family == AF_INET) {
				ZTATUP(data->nsd, q->zone, qudp);
			} else if (data->socket->addr.ai_family == AF_INET6) {
				ZTATUP(data->nsd, q->zone, qudp6);
			}
#endif

			/* Add EDNS0 and TSIG info if necessary. */
			query_add_optional(q, data->nsd, &now);

			buffer_flip(q->packet);
			iovecs[i].iov_len = buffer_remaining(q->packet);
#ifdef BIND8_STATS
			/* Account the rcode & TC... */
			STATUP2(data->nsd, rcode, RCODE(q->packet));
			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
			if (TC(q->packet)) {
				STATUP(data->nsd, truncated);
				ZTATUP(data->nsd, q->zone, truncated);
			}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
			/*
			 * sending UDP-response with server address (local) and client address to dnstap process
			 */
			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
			log_addr("response to client", &q->addr);
			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
				&q->addr, q->addrlen, q->tcp, q->packet,
				q->zone);
#endif /* USE_DNSTAP */
		} else {
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
	swap_drop:
			STATUP(data->nsd, dropped);
			ZTATUP(data->nsd, q->zone, dropped);
			if(i != recvcount-1) {
				/* swap with last and decrease recvcount */
				struct mmsghdr mtmp = msgs[i];
				struct iovec iotmp = iovecs[i];
				recvcount--;
				msgs[i] = msgs[recvcount];
				iovecs[i] = iovecs[recvcount];
				queries[i] = queries[recvcount];
				msgs[recvcount] = mtmp;
				iovecs[recvcount] = iotmp;
				queries[recvcount] = q;
				msgs[i].msg_hdr.msg_iov = &iovecs[i];
				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
				goto loopstart;
			} else { recvcount--; }
		}
	}

	/* send until all are sent */
	i = 0;
	while(i<recvcount) {
		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
		if(sent == -1) {
			if(errno == ENOBUFS ||
#ifdef EWOULDBLOCK
				errno == EWOULDBLOCK ||
#endif
				errno == EAGAIN) {
				/* block to wait until send buffer avail */
				int flag, errstore;
				if((flag = fcntl(fd, F_GETFL)) == -1) {
					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
					flag = 0;
				}
				flag &= ~O_NONBLOCK;
				if(fcntl(fd, F_SETFL, flag) == -1)
					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
				errstore = errno;
				flag |= O_NONBLOCK;
				if(fcntl(fd, F_SETFL, flag) == -1)
					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
				if(sent != -1) {
					i += sent;
					continue;
				}
				errno = errstore;
			}
			if(errno == EINVAL) {
				/* skip the invalid argument entry,
				 * send the remaining packets in the list */
				if(!(port_is_zero((void*)&queries[i]->addr) &&
					verbosity < 3)) {
					const char* es = strerror(errno);
					char a[64];
					addrport2str((void*)&queries[i]->addr, a, sizeof(a));
					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
				}
				i += 1;
				continue;
			}
			/* don't log transient network full errors, unless
			 * on higher verbosity */
			if(!(errno == ENOBUFS && verbosity < 1) &&
#ifdef EWOULDBLOCK
				errno != EWOULDBLOCK &&
#endif
				errno != EAGAIN) {
				const char* es = strerror(errno);
				char a[64];
				addrport2str((void*)&queries[i]->addr, a, sizeof(a));
				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
			}
#ifdef BIND8_STATS
			data->nsd->st.txerr += recvcount-i;
#endif /* BIND8_STATS */
			break;
		}
		i += sent;
	}
	for(i=0; i<recvcount; i++) {
		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
	}
}

#ifdef HAVE_SSL
/*
 * Set up an event for the tcp handler.
 */
static void
tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
	int fd, short event)
{
	struct timeval timeout;
	struct event_base* ev_base;

	timeout.tv_sec = data->nsd->tcp_timeout;
	timeout.tv_usec = 0L;

	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, event, fn, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add failed");
}
#endif /* HAVE_SSL */

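/* Tear down a TCP (or TLS) connection: remove its event, shut down and free
 * the SSL object if present, close the socket, unlink the handler from the
 * active list, and re-enable the accept handlers when the connection count
 * drops below the maximum or accept was paused by slowaccept. */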
static void
cleanup_tcp_handler(struct tcp_handler_data* data)
{
	event_del(&data->event);
#ifdef HAVE_SSL
	if(data->tls) {
		SSL_shutdown(data->tls);
		SSL_free(data->tls);
		data->tls = NULL;
	}
#endif
	close(data->event.ev_fd);
	if(data->prev)
		data->prev->next = data->next;
	else	tcp_active_list = data->next;
	if(data->next)
		data->next->prev = data->prev;

	/*
	 * Enable the TCP accept handlers when the current number of
	 * TCP connections is about to drop below the maximum number
	 * of TCP connections.
	 */
	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
		configure_handler_event_types(EV_READ|EV_PERSIST);
		if(slowaccept) {
			event_del(&slowaccept_event);
			slowaccept = 0;
		}
	}
	--data->nsd->current_tcp_count;
	assert(data->nsd->current_tcp_count >= 0);

	region_destroy(data->region);
}

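/* Handle readiness on an accepted TCP connection in reading state: first
 * read the two-byte length prefix, then the query itself, possibly spread
 * over several invocations; once the message is complete, process it and
 * switch this handler to the writing state. */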
static void
handle_tcp_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;
	struct event_base* ev_base;
	struct timeval timeout;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	if (data->bytes_transmitted == 0) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if (data->bytes_transmitted < sizeof(uint16_t)) {
		received = read(fd,
			(char *) &data->query->tcplen
			+ data->bytes_transmitted,
			sizeof(uint16_t) - data->bytes_transmitted);
		if (received == -1) {
			if (errno == EAGAIN || errno == EINTR) {
				/*
				 * Read would block, wait until more
				 * data is available.
				 */
				return;
			} else {
				char buf[48];
				addr2str(&data->query->addr, buf, sizeof(buf));
#ifdef ECONNRESET
				if (verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
				cleanup_tcp_handler(data);
				return;
			}
		} else if (received == 0) {
			/* EOF */
			cleanup_tcp_handler(data);
			return;
		}

		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}

		assert(data->bytes_transmitted == sizeof(uint16_t));

		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 * Size of the header (12)
		 * + Root domain name (1)
		 * + Query class (2)
		 * + Query type (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data. */
	received = read(fd,
		buffer_current(data->query->packet),
		buffer_remaining(data->query->packet));
	if (received == -1) {
		if (errno == EAGAIN || errno == EINTR) {
			/*
			 * Read would block, wait until more data is
			 * available.
			 */
			return;
		} else {
			char buf[48];
			addr2str(&data->query->addr, buf, sizeof(buf));
#ifdef ECONNRESET
			if (verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
			cleanup_tcp_handler(data);
			return;
		}
	} else if (received == 0) {
		/* EOF */
		cleanup_tcp_handler(data);
		return;
	}

	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
#ifdef BIND8_STATS
#ifndef INET6
	STATUP(data->nsd, ctcp);
#else
	if (data->query->addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctcp);
	} else if (data->query->addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctcp6);
	}
#endif
#endif /* BIND8_STATS */

	/* We have a complete query, process it. */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * and send TCP-query with found address (local) and client address to dnstap process
	 */
	log_addr("query from client", &data->query->addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctcp);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctcp);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctcp6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;

	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpr failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpr failed");
	/* see if we can write the answer right away (usually so, EAGAIN if not) */
	handle_tcp_writing(fd, EV_WRITE, data);
}

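/* Handle readiness on a TCP connection in writing state: write the two-byte
 * length prefix and the response (in a single writev(2) call when
 * available), continue an in-progress AXFR/IXFR, and once the answer is
 * fully sent switch back to the reading state for the next query. */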
static void
handle_tcp_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	struct timeval timeout;
	struct event_base* ev_base;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if (data->bytes_transmitted < sizeof(q->tcplen)) {
		/* Writing the response packet length. */
		uint16_t n_tcplen = htons(q->tcplen);
#ifdef HAVE_WRITEV
		struct iovec iov[2];
		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
		iov[1].iov_base = buffer_begin(q->packet);
		iov[1].iov_len = buffer_limit(q->packet);
		sent = writev(fd, iov, 2);
#else /* HAVE_WRITEV */
		sent = write(fd,
			(const char *) &n_tcplen + data->bytes_transmitted,
			sizeof(n_tcplen) - data->bytes_transmitted);
#endif /* HAVE_WRITEV */
		if (sent == -1) {
			if (errno == EAGAIN || errno == EINTR) {
				/*
				 * Write would block, wait until
				 * socket becomes writable again.
				 */
				return;
			} else {
#ifdef ECONNRESET
				if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
				if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
				log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
				cleanup_tcp_handler(data);
				return;
			}
		}

		data->bytes_transmitted += sent;
		if (data->bytes_transmitted < sizeof(q->tcplen)) {
			/*
			 * Writing not complete, wait until socket
			 * becomes writable again.
			 */
			return;
		}

#ifdef HAVE_WRITEV
		sent -= sizeof(n_tcplen);
		/* handle potential 'packet done' code */
		goto packet_could_be_done;
#endif
	}

	sent = write(fd,
		buffer_current(q->packet),
		buffer_remaining(q->packet));
	if (sent == -1) {
		if (errno == EAGAIN || errno == EINTR) {
			/*
			 * Write would block, wait until
			 * socket becomes writable again.
			 */
			return;
		} else {
#ifdef ECONNRESET
			if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
			if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
			cleanup_tcp_handler(data);
			return;
		}
	}

	data->bytes_transmitted += sent;
#ifdef HAVE_WRITEV
packet_could_be_done:
#endif
	buffer_skip(q->packet, sent);
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR ||
		data->query_state == QUERY_IN_IXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		if(data->query_state == QUERY_IN_AXFR)
			data->query_state = query_axfr(data->nsd, q, 0);
		else data->query_state = query_ixfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset timeout. */
			timeout.tv_sec = data->tcp_timeout / 1000;
			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
			ev_base = data->event.ev_base;
			event_del(&data->event);
			memset(&data->event, 0, sizeof(data->event));
			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
				handle_tcp_writing, data);
			if(event_base_set(ev_base, &data->event) != 0)
				log_msg(LOG_ERR, "event base set tcpw failed");
			if(event_add(&data->event, &timeout) != 0)
				log_msg(LOG_ERR, "event add tcpw failed");

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpw failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpw failed");
}

#ifdef HAVE_SSL
/** create SSL object and associate fd */
static SSL*
incoming_ssl_fd(SSL_CTX* ctx, int fd)
{
	SSL* ssl = SSL_new((SSL_CTX*)ctx);
	if(!ssl) {
		log_crypto_err("could not SSL_new");
		return NULL;
	}
	SSL_set_accept_state(ssl);
	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
	if(!SSL_set_fd(ssl, fd)) {
		log_crypto_err("could not SSL_set_fd");
		SSL_free(ssl);
		return NULL;
	}
	return ssl;
}

/** TLS handshake to upgrade TCP connection */
static int
tls_handshake(struct tcp_handler_data* data, int fd, int writing)
{
	int r;
	if(data->shake_state == tls_hs_read_event) {
		/* read condition satisfied, back to writing */
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
		data->shake_state = tls_hs_none;
		return 1;
	}
	if(data->shake_state == tls_hs_write_event) {
		/* write condition satisfied, back to reading */
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
		data->shake_state = tls_hs_none;
		return 1;
	}

	/* (continue to) set up the TLS connection */
	ERR_clear_error();
	r = SSL_do_handshake(data->tls);

	if(r != 1) {
		int want = SSL_get_error(data->tls, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(data->shake_state == tls_hs_read) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_read;
			/* switch back to reading mode */
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(data->shake_state == tls_hs_write) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_write;
			/* switch back to writing mode */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
			return 1;
		} else {
			if(r == 0)
				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
			else {
				unsigned long err = ERR_get_error();
				if(!squelch_err_ssl_handshake(err)) {
					char a[64], s[256];
					addr2str(&data->query->addr, a, sizeof(a));
					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
					log_crypto_from_err(s, err);
				}
			}
			cleanup_tcp_handler(data);
			return 0;
		}
	}

	/* Log the successful upgrade, for testing; could be removed. */
	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
	/* set back to the event we need to have when reading (or writing) */
	if(data->shake_state == tls_hs_read && writing) {
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
	} else if(data->shake_state == tls_hs_write && !writing) {
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
	}
	data->shake_state = tls_hs_none;
	return 1;
}

/** handle TLS reading of incoming query */
static void
handle_tls_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	if (data->bytes_transmitted == 0) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
	}

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 0))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if(data->bytes_transmitted < sizeof(uint16_t)) {
		ERR_clear_error();
		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
			+ data->bytes_transmitted,
			sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
			int want = SSL_get_error(data->tls, received);
			if(want == SSL_ERROR_ZERO_RETURN) {
				cleanup_tcp_handler(data);
				return; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* wants to be called again */
				return;
			}
			else if(want == SSL_ERROR_WANT_WRITE) {
				/* switch to writing */
				data->shake_state = tls_hs_write_event;
				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
				return;
			}
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_read");
			return;
		}

		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}

		assert(data->bytes_transmitted == sizeof(uint16_t));

		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 * Size of the header (12)
		 * + Root domain name (1)
		 * + Query class (2)
		 * + Query type (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data. */
	ERR_clear_error();
	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
		(int)buffer_remaining(data->query->packet));
	if(received <= 0) {
		int want = SSL_get_error(data->tls, received);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			return; /* shutdown, closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* wants to be called again */
			return;
		}
		else if(want == SSL_ERROR_WANT_WRITE) {
			/* switch back to writing */
			data->shake_state = tls_hs_write_event;
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
			return;
		}
		cleanup_tcp_handler(data);
		log_crypto_err("could not SSL_read");
		return;
	}

	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
#ifndef INET6
	STATUP(data->nsd, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctls6);
	}
#endif

	/* We have a complete query, process it. */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * and send TCP-query with found address (local) and client address to dnstap process
	 */
	log_addr("query from client", &data->query->addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctls6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

	/* see if we can write the answer right away (usually so, EAGAIN if not) */
	handle_tls_writing(fd, EV_WRITE, data);
}

/** handle TLS writing of outgoing response */
static void
handle_tls_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	/* static variable that holds reassembly buffer used to put the
	 * TCP length in front of the packet, like writev. */
	static buffer_type* global_tls_temp_buffer = NULL;
	buffer_type* write_buffer;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 1))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);

	/* If we are writing the start of a message, we must include the
	 * length; this is done with a copy into write_buffer. */
	write_buffer = NULL;
	if (data->bytes_transmitted == 0) {
		if(!global_tls_temp_buffer) {
			/* gets deallocated when nsd shuts down from
			 * nsd.region */
			global_tls_temp_buffer = buffer_create(nsd.region,
				QIOBUFSZ + sizeof(q->tcplen));
			if (!global_tls_temp_buffer) {
				return;
			}
		}
		write_buffer = global_tls_temp_buffer;
		buffer_clear(write_buffer);
		buffer_write_u16(write_buffer, q->tcplen);
		buffer_write(write_buffer, buffer_current(q->packet),
			(int)buffer_remaining(q->packet));
		buffer_flip(write_buffer);
	} else {
		write_buffer = q->packet;
	}

	/* Write the response */
	ERR_clear_error();
	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
	if(sent <= 0) {
		int want = SSL_get_error(data->tls, sent);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			/* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* switch back to reading */
			data->shake_state = tls_hs_read_event;
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
		} else if(want != SSL_ERROR_WANT_WRITE) {
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_write");
		}
		return;
	}

	buffer_skip(write_buffer, sent);
	if(buffer_remaining(write_buffer) != 0) {
		/* If not all sent, sync up the real buffer if it wasn't used. */
		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
		}
	}

	data->bytes_transmitted += sent;
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR ||
		data->query_state == QUERY_IN_IXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		if(data->query_state == QUERY_IN_AXFR)
			data->query_state = query_axfr(data->nsd, q, 0);
		else data->query_state = query_ixfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset to writing mode. */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
}
#endif

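/* Slow-accept timeout handler: re-enable the TCP accept events after accept
 * was paused because the process ran out of file descriptors. */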
static void
handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
	void* ATTR_UNUSED(arg))
{
	if(slowaccept) {
		configure_handler_event_types(EV_PERSIST | EV_READ);
		slowaccept = 0;
	}
}

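/* Accept a connection and put the resulting socket in non-blocking mode,
 * using accept4(2) where available and an fcntl(2) fallback elsewhere. */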
static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
{
#ifndef HAVE_ACCEPT4
	int s = accept(fd, addr, addrlen);
	if (s != -1) {
		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
			close(s);
			s = -1;
			errno = EINTR; /* pretend the accept was interrupted,
				so that the caller suppresses the error
				printout, as it would for a failed accept4 */
		}
	}
	return s;
#else
	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
#endif /* HAVE_ACCEPT4 */
}

/*
 * Handle an incoming TCP connection. The connection is accepted and
 * a new TCP reader event handler is added. The TCP handler
 * is responsible for cleanup when the connection is closed.
 */
static void
handle_tcp_accept(int fd, short event, void* arg)
{
	struct tcp_accept_handler_data *data
		= (struct tcp_accept_handler_data *) arg;
	int s;
	int reject = 0;
	struct tcp_handler_data *tcp_data;
	region_type *tcp_region;
#ifdef INET6
	struct sockaddr_storage addr;
#else
	struct sockaddr_in addr;
#endif
	socklen_t addrlen;
	struct timeval timeout;

	if (!(event & EV_READ)) {
		return;
	}

	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
		reject = data->nsd->options->tcp_reject_overflow;
		if (!reject) {
			return;
		}
	}

	/* Accept it... */
	addrlen = sizeof(addr);
	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
	if (s == -1) {
		/**
		 * EMFILE and ENFILE signal that the limit of open
		 * file descriptors has been reached. Pause accept().
		 * EINTR is a signal interrupt. The others are various OS ways
		 * of saying that the client has closed the connection.
		 */
		if (errno == EMFILE || errno == ENFILE) {
			if (!slowaccept) {
				/* disable accept events */
				struct timeval tv;
				configure_handler_event_types(0);
				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
				tv.tv_usec = 0L;
				memset(&slowaccept_event, 0,
					sizeof(slowaccept_event));
				event_set(&slowaccept_event, -1, EV_TIMEOUT,
					handle_slowaccept_timeout, NULL);
				(void)event_base_set(data->event.ev_base,
					&slowaccept_event);
				(void)event_add(&slowaccept_event, &tv);
				slowaccept = 1;
				/* We don't want to spam the logs here */
			}
		} else if (errno != EINTR
			&& errno != EWOULDBLOCK
#ifdef ECONNABORTED
			&& errno != ECONNABORTED
#endif /* ECONNABORTED */
#ifdef EPROTO
			&& errno != EPROTO
#endif /* EPROTO */
			) {
			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
		}
		return;
	}

	if (reject) {
		shutdown(s, SHUT_RDWR);
		close(s);
		return;
	}

	/*
	 * This region is deallocated when the TCP connection is
	 * closed by the TCP handler.
	 */
	tcp_region = region_create(xalloc, free);
	tcp_data = (struct tcp_handler_data *) region_alloc(
		tcp_region, sizeof(struct tcp_handler_data));
	tcp_data->region = tcp_region;
	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
		compression_table_size, compressed_dnames);
	tcp_data->nsd = data->nsd;
	tcp_data->query_count = 0;
#ifdef HAVE_SSL
	tcp_data->shake_state = tls_hs_none;
	tcp_data->tls = NULL;
#endif
	tcp_data->prev = NULL;
	tcp_data->next = NULL;

	tcp_data->query_state = QUERY_PROCESSED;
	tcp_data->bytes_transmitted = 0;
	memcpy(&tcp_data->query->addr, &addr, addrlen);
	tcp_data->query->addrlen = addrlen;

	tcp_data->tcp_no_more_queries = 0;
	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
		/* very busy, give smaller timeout */
		tcp_data->tcp_timeout = 200;
	}
	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;

#ifdef USE_DNSTAP
	/* save the address of the connection */
	tcp_data->socket = data->socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	if (data->tls_accept) {
		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
		if(!tcp_data->tls) {
			close(s);
			return;
		}
		tcp_data->shake_state = tls_hs_read;
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tls_reading, tcp_data);
	} else {
#endif
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tcp_reading, tcp_data);
#ifdef HAVE_SSL
	}
#endif
	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
		log_msg(LOG_ERR, "cannot set tcp event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(event_add(&tcp_data->event, &timeout) != 0) {
		log_msg(LOG_ERR, "cannot add tcp to event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(tcp_active_list) {
		tcp_active_list->prev = tcp_data;
		tcp_data->next = tcp_active_list;
	}
	tcp_active_list = tcp_data;

	/*
	 * Keep track of the total number of TCP handlers installed so
	 * we can stop accepting connections when the maximum number
	 * of simultaneous TCP connections is reached.
	 *
	 * If tcp-reject-overflow is enabled, however, then we do not
	 * change the handler event type; we keep it as-is and accept
	 * overflow TCP connections only so that we can forcibly kill
	 * them off.
	 */
	++data->nsd->current_tcp_count;
	if (!data->nsd->options->tcp_reject_overflow &&
		data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
	{
		configure_handler_event_types(0);
	}
}

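/* Send a command over the IPC pipe to every child server process. When
 * timeout > 0, wait that many seconds for the child to echo the command
 * back as an acknowledgement before the pipe is closed. */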
static void
send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
			if (write(nsd->children[i].child_fd,
				&command,
				sizeof(command)) == -1)
			{
				if(errno != EAGAIN && errno != EINTR)
					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
					(int) command,
					(int) nsd->children[i].pid,
					strerror(errno));
			} else if (timeout > 0) {
				(void)block_read(NULL,
					nsd->children[i].child_fd,
					&command, sizeof(command), timeout);
			}
			fsync(nsd->children[i].child_fd);
			close(nsd->children[i].child_fd);
			nsd->children[i].child_fd = -1;
		}
	}
}

static void
send_children_quit(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
	send_children_command(nsd, NSD_QUIT, 0);
}

static void
send_children_quit_and_wait(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
	send_children_command(nsd, NSD_QUIT_CHILD, 3);
}

#ifdef BIND8_STATS
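/* Flag every child to have its statistics sent over IPC; enabling the write
 * event on the child handler triggers the actual transfer. */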
static void
set_children_stats(struct nsd* nsd)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].need_to_send_STATS = 1;
		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
	}
}
#endif /* BIND8_STATS */

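/* (Re)register every TCP accept handler with the given event types, or
 * remove the handlers when event_types is 0 so no new connections are
 * accepted (used at the connection limit and during slowaccept). */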
static void
configure_handler_event_types(short event_types)
{
	size_t i;

	for (i = 0; i < tcp_accept_handler_count; ++i) {
		struct event* handler = &tcp_accept_handlers[i].event;
		if(event_types) {
			/* reassign */
			int fd = handler->ev_fd;
			struct event_base* base = handler->ev_base;
			if(tcp_accept_handlers[i].event_added)
				event_del(handler);
			memset(handler, 0, sizeof(*handler));
			event_set(handler, fd, event_types,
				handle_tcp_accept, &tcp_accept_handlers[i]);
			if(event_base_set(base, handler) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_base");
			if(event_add(handler, NULL) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_add");
			tcp_accept_handlers[i].event_added = 1;
		} else {
			/* remove */
			if(tcp_accept_handlers[i].event_added) {
				event_del(handler);
				tcp_accept_handlers[i].event_added = 0;
			}
		}
	}
}
