/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
#include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
# ifdef HAVE_EVENT_H
#  include <event.h>
# else
#  include <event2/event.h>
#  include "event2/event_struct.h"
#  include "event2/event_compat.h"
# endif
#else
# include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#include "ixfr.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif
#include "verify.h"
#include "util/proxy_protocol.h"

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_DNSTAP
/*
 * log_addr() - prints the contents of a sockaddr_in/sockaddr_in6
 * structure, just as it is done in Unbound via the same
 * log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
 */
static void
log_addr(const char* descr,
#ifdef INET6
	struct sockaddr_storage* addr
#else
	struct sockaddr_in* addr
#endif
	)
{
	char str_buf[64];
	if(verbosity < 6)
		return;
	if(
#ifdef INET6
		addr->ss_family == AF_INET
#else
		addr->sin_family == AF_INET
#endif
		) {
		struct sockaddr_in* s = (struct sockaddr_in*)addr;
		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
#ifdef INET6
	} else {
		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
#endif
	}
}
#endif /* USE_DNSTAP */

#ifdef USE_TCP_FASTOPEN
#define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
#define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
#endif

/* header state for the PROXYv2 header (for TCP) */
enum pp2_header_state {
	/* no header encountered yet */
	pp2_header_none = 0,
	/* read the static part of the header */
	pp2_header_init,
	/* read the full header */
	pp2_header_done
};
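/*
 * Illustrative note on the states above (a sketch, not taken verbatim
 * from the surrounding code): a PROXYv2 header starts with a fixed-size
 * part, the 12-octet signature plus version/command, family/protocol
 * and a 2-octet payload length (16 octets in total). Only after that
 * part is read do we know how many additional address bytes follow:
 *
 *	pp2_header_none --(read 16 static octets)--> pp2_header_init
 *	pp2_header_init --(read "length" more octets)--> pp2_header_done
 *
 * Once pp2_header_done is reached, normal DNS-over-TCP parsing resumes.
 */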
/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd *nsd;
	struct nsd_socket *socket;
	struct event event;
	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;
};

struct tcp_accept_handler_data {
	struct nsd *nsd;
	struct nsd_socket *socket;
	int event_added;
	struct event event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int tls_accept;
#endif
	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int msg_len;
};
#endif

static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
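/*
 * Illustrative note (a sketch of the intended wiring, which is set up
 * elsewhere in this file): msgs[i].msg_hdr points at iovecs[i], which
 * points into the packet buffer of queries[i]. That lets one
 * recvmmsg() call, or a loop of single recvmsg() calls on systems
 * where the struct mmsghdr shim above is used, pull in up to
 * NUM_RECV_PER_SELECT datagrams per wakeup of the event loop.
 */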
262 */ 263 int query_count; 264 265 /* 266 * The timeout in msec for this tcp connection 267 */ 268 int tcp_timeout; 269 270 /* 271 * If the connection is allowed to have further queries on it. 272 */ 273 int tcp_no_more_queries; 274 275 #ifdef USE_DNSTAP 276 /* the socket of the accept socket to find proper service (local) address the socket is bound to. */ 277 struct nsd_socket *socket; 278 #endif /* USE_DNSTAP */ 279 280 /* if set, PROXYv2 is expected on this connection */ 281 int pp2_enabled; 282 283 /* header state for the PROXYv2 header (for TCP) */ 284 enum pp2_header_state pp2_header_state; 285 286 #ifdef HAVE_SSL 287 /* 288 * TLS object. 289 */ 290 SSL* tls; 291 292 /* 293 * TLS handshake state. 294 */ 295 enum { tls_hs_none, tls_hs_read, tls_hs_write, 296 tls_hs_read_event, tls_hs_write_event } shake_state; 297 #endif 298 /* list of connections, for service of remaining tcp channels */ 299 struct tcp_handler_data *prev, *next; 300 }; 301 /* global that is the list of active tcp channels */ 302 static struct tcp_handler_data *tcp_active_list = NULL; 303 304 /* 305 * Handle incoming queries on the UDP server sockets. 306 */ 307 static void handle_udp(int fd, short event, void* arg); 308 309 /* 310 * Handle incoming connections on the TCP sockets. These handlers 311 * usually wait for the NETIO_EVENT_READ event (indicating an incoming 312 * connection) but are disabled when the number of current TCP 313 * connections is equal to the maximum number of TCP connections. 314 * Disabling is done by changing the handler to wait for the 315 * NETIO_EVENT_NONE type. This is done using the function 316 * configure_tcp_accept_handlers. 317 */ 318 static void handle_tcp_accept(int fd, short event, void* arg); 319 320 /* 321 * Handle incoming queries on a TCP connection. The TCP connections 322 * are configured to be non-blocking and the handler may be called 323 * multiple times before a complete query is received. 324 */ 325 static void handle_tcp_reading(int fd, short event, void* arg); 326 327 /* 328 * Handle outgoing responses on a TCP connection. The TCP connections 329 * are configured to be non-blocking and the handler may be called 330 * multiple times before a complete response is sent. 331 */ 332 static void handle_tcp_writing(int fd, short event, void* arg); 333 334 #ifdef HAVE_SSL 335 /* Create SSL object and associate fd */ 336 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd); 337 /* 338 * Handle TLS handshake. May be called multiple times if incomplete. 339 */ 340 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing); 341 342 /* 343 * Handle incoming queries on a TLS over TCP connection. The TLS 344 * connections are configured to be non-blocking and the handler may 345 * be called multiple times before a complete query is received. 346 */ 347 static void handle_tls_reading(int fd, short event, void* arg); 348 349 /* 350 * Handle outgoing responses on a TLS over TCP connection. The TLS 351 * connections are configured to be non-blocking and the handler may 352 * be called multiple times before a complete response is sent. 353 */ 354 static void handle_tls_writing(int fd, short event, void* arg); 355 #endif 356 357 /* 358 * Send all children the quit nonblocking, then close pipe. 
359 */ 360 static void send_children_quit(struct nsd* nsd); 361 /* same, for shutdown time, waits for child to exit to avoid restart issues */ 362 static void send_children_quit_and_wait(struct nsd* nsd); 363 364 /* set childrens flags to send NSD_STATS to them */ 365 #ifdef BIND8_STATS 366 static void set_children_stats(struct nsd* nsd); 367 #endif /* BIND8_STATS */ 368 369 /* 370 * Change the event types the HANDLERS are interested in to EVENT_TYPES. 371 */ 372 static void configure_handler_event_types(short event_types); 373 374 static uint16_t *compressed_dname_offsets = 0; 375 static uint32_t compression_table_capacity = 0; 376 static uint32_t compression_table_size = 0; 377 static domain_type* compressed_dnames[MAXRRSPP]; 378 379 #ifdef USE_TCP_FASTOPEN 380 /* Checks to see if the kernel value must be manually changed in order for 381 TCP Fast Open to support server mode */ 382 static void report_tcp_fastopen_config() { 383 384 int tcp_fastopen_fp; 385 uint8_t tcp_fastopen_value; 386 387 if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) { 388 log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); 389 } 390 if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) { 391 log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); 392 close(tcp_fastopen_fp); 393 } 394 if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) { 395 log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n"); 396 log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n"); 397 log_msg(LOG_WARNING, "To enable TFO use the command:"); 398 log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n"); 399 log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n"); 400 log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n"); 401 close(tcp_fastopen_fp); 402 } 403 close(tcp_fastopen_fp); 404 } 405 #endif 406 407 /* 408 * Remove the specified pid from the list of child pids. Returns -1 if 409 * the pid is not in the list, child_num otherwise. The field is set to 0. 410 */ 411 static int 412 delete_child_pid(struct nsd *nsd, pid_t pid) 413 { 414 size_t i; 415 for (i = 0; i < nsd->child_count; ++i) { 416 if (nsd->children[i].pid == pid) { 417 nsd->children[i].pid = 0; 418 if(!nsd->children[i].need_to_exit) { 419 if(nsd->children[i].child_fd != -1) 420 close(nsd->children[i].child_fd); 421 nsd->children[i].child_fd = -1; 422 if(nsd->children[i].handler) 423 nsd->children[i].handler->fd = -1; 424 } 425 return i; 426 } 427 } 428 return -1; 429 } 430 431 /* 432 * Restart child servers if necessary. 433 */ 434 static int 435 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio, 436 int* xfrd_sock_p) 437 { 438 struct main_ipc_handler_data *ipc_data; 439 size_t i; 440 int sv[2]; 441 442 /* Fork the child processes... 
/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge: %s", strerror(errno));
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACH */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole period */
	if(nsd->st_period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st_period - (time(NULL) % nsd->st_period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}
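/*
 * Illustrative note on the zone statistics below: the counters live in
 * two mmap()ed files so a reloading parent and its children can share
 * them without copying. Each file is sized with the classic
 * lseek()+write() idiom: seek to sz-1 and write a single zero byte,
 * which extends the file so that the subsequent mmap() of sz bytes is
 * fully backed. For example, with 10 zonestat names,
 * sz = 10 * sizeof(struct nsdst).
 */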
#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}
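/*
 * Illustrative sketch of the intended ordering of the zonestat calls
 * around a reload (the actual call sites are elsewhere in this file,
 * so treat this as a summary, not a verbatim sequence):
 *
 *	server_zonestat_realloc(nsd);   resize the idle array
 *	server_zonestat_switch(nsd);    new children write to that array
 *	... old children, still writing to the other array, exit ...
 */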
/* switch over to the other array for the new children, which briefly
 * coexist with the old children; we want to avoid both groups writing
 * to the same statistics arrays. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

#ifdef BIND8_STATS
void
server_stat_alloc(struct nsd* nsd)
{
	char tmpfile[256];
	size_t sz = sizeof(struct nsdst) * nsd->child_count * 2;
	uint8_t z = 0;

	/* file name */
	nsd->statfname = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.stat",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->statfname = region_strdup(nsd->region, tmpfile);

	/* file descriptor */
	nsd->statfd = open(nsd->statfname, O_CREAT|O_RDWR, 0600);
	if(nsd->statfd == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->statfname,
			strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->statfd, (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->statfname,
			strerror(errno));
		goto fail_exit;
	}
	if(write(nsd->statfd, &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->statfname, strerror(errno));
		goto fail_exit;
	}
	nsd->stat_map = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->statfd, 0);
	if(nsd->stat_map == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
fail_exit:
		close(nsd->statfd);
		unlink(nsd->statfname);
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->stat_map, 0, sz);
	nsd->stats_per_child[0] = nsd->stat_map;
	nsd->stats_per_child[1] = &nsd->stat_map[nsd->child_count];
	nsd->stat_current = 0;
	nsd->st = &nsd->stats_per_child[nsd->stat_current][0];
#endif /* HAVE_MMAP */
}
#endif /* BIND8_STATS */

#ifdef BIND8_STATS
void
server_stat_free(struct nsd* nsd)
{
	unlink(nsd->statfname);
}
#endif /* BIND8_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}
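/*
 * Illustrative note on the table above: during answer encoding every
 * domain node is identified by a small number, and
 * compressed_dname_offsets[number] records the packet offset at which
 * that name was first written, so later occurrences can emit a 2-octet
 * compression pointer instead of the full name. Slot 0 stands for the
 * query name, which always starts right after the 12-octet DNS header,
 * hence the QHEADERSZ entry. EXTRA_DOMAIN_NUMBERS reserves slots for
 * names that do not come from the domain table.
 */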
"udp" : "tcp"; 819 log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s", 820 socktype, strerror(errno)); 821 return -1; 822 } 823 824 return 1; 825 } 826 827 static int 828 set_reuseport(struct nsd_socket *sock) 829 { 830 #ifdef SO_REUSEPORT 831 int on = 1; 832 #ifdef SO_REUSEPORT_LB 833 /* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like 834 * SO_REUSEPORT on Linux. This is what the users want with the config 835 * option in nsd.conf; if we actually need local address and port reuse 836 * they'll also need to have SO_REUSEPORT set for them, assume it was 837 * _LB they want. 838 */ 839 int opt = SO_REUSEPORT_LB; 840 static const char optname[] = "SO_REUSEPORT_LB"; 841 #else /* !SO_REUSEPORT_LB */ 842 int opt = SO_REUSEPORT; 843 static const char optname[] = "SO_REUSEPORT"; 844 #endif /* SO_REUSEPORT_LB */ 845 846 if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) { 847 return 1; 848 } else if(verbosity >= 3 || errno != ENOPROTOOPT) { 849 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", 850 optname, strerror(errno)); 851 } 852 return -1; 853 #else 854 (void)sock; 855 #endif /* SO_REUSEPORT */ 856 857 return 0; 858 } 859 860 static int 861 set_reuseaddr(struct nsd_socket *sock) 862 { 863 #ifdef SO_REUSEADDR 864 int on = 1; 865 if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) { 866 return 1; 867 } 868 log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", 869 strerror(errno)); 870 return -1; 871 #endif /* SO_REUSEADDR */ 872 return 0; 873 } 874 875 static int 876 set_rcvbuf(struct nsd_socket *sock, int rcv) 877 { 878 #ifdef SO_RCVBUF 879 #ifdef SO_RCVBUFFORCE 880 if(0 == setsockopt( 881 sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv))) 882 { 883 return 1; 884 } 885 if(errno == EPERM || errno == ENOBUFS) { 886 return 0; 887 } 888 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s", 889 strerror(errno)); 890 return -1; 891 #else /* !SO_RCVBUFFORCE */ 892 if (0 == setsockopt( 893 sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv))) 894 { 895 return 1; 896 } 897 if(errno == ENOSYS || errno == ENOBUFS) { 898 return 0; 899 } 900 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s", 901 strerror(errno)); 902 return -1; 903 #endif /* SO_RCVBUFFORCE */ 904 #endif /* SO_RCVBUF */ 905 906 return 0; 907 } 908 909 static int 910 set_sndbuf(struct nsd_socket *sock, int snd) 911 { 912 #ifdef SO_SNDBUF 913 #ifdef SO_SNDBUFFORCE 914 if(0 == setsockopt( 915 sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd))) 916 { 917 return 1; 918 } 919 if(errno == EPERM || errno == ENOBUFS) { 920 return 0; 921 } 922 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s", 923 strerror(errno)); 924 return -1; 925 #else /* !SO_SNDBUFFORCE */ 926 if(0 == setsockopt( 927 sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd))) 928 { 929 return 1; 930 } 931 if(errno == ENOSYS || errno == ENOBUFS) { 932 return 0; 933 } 934 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s", 935 strerror(errno)); 936 return -1; 937 #endif /* SO_SNDBUFFORCE */ 938 #endif /* SO_SNDBUF */ 939 940 return 0; 941 } 942 943 static int 944 set_nonblock(struct nsd_socket *sock) 945 { 946 const char *socktype = 947 sock->addr.ai_socktype == SOCK_DGRAM ? 
"udp" : "tcp"; 948 949 if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) { 950 log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s", 951 socktype, strerror(errno)); 952 return -1; 953 } 954 955 return 1; 956 } 957 958 #ifdef INET6 959 static int 960 set_ipv6_v6only(struct nsd_socket *sock) 961 { 962 #ifdef IPV6_V6ONLY 963 int on = 1; 964 const char *socktype = 965 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 966 967 if(0 == setsockopt( 968 sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on))) 969 { 970 return 1; 971 } 972 973 log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s", 974 socktype, strerror(errno)); 975 return -1; 976 #else 977 (void)sock; 978 #endif /* IPV6_V6ONLY */ 979 980 return 0; 981 } 982 #endif /* INET6 */ 983 984 #ifdef INET6 985 static int 986 set_ipv6_use_min_mtu(struct nsd_socket *sock) 987 { 988 #if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) 989 #if defined(IPV6_USE_MIN_MTU) 990 /* There is no fragmentation of IPv6 datagrams during forwarding in the 991 * network. Therefore we do not send UDP datagrams larger than the 992 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be 993 * larger if the network stack supports IPV6_USE_MIN_MTU. 994 */ 995 int opt = IPV6_USE_MIN_MTU; 996 int optval = 1; 997 static const char optname[] = "IPV6_USE_MIN_MTU"; 998 #elif defined(IPV6_MTU) 999 /* On Linux, PMTUD is disabled by default for datagrams so set the MTU 1000 * to the MIN MTU to get the same. 1001 */ 1002 int opt = IPV6_MTU; 1003 int optval = IPV6_MIN_MTU; 1004 static const char optname[] = "IPV6_MTU"; 1005 #endif 1006 if(0 == setsockopt( 1007 sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval))) 1008 { 1009 return 1; 1010 } 1011 1012 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", 1013 optname, strerror(errno)); 1014 return -1; 1015 #else 1016 (void)sock; 1017 #endif /* INET6 */ 1018 1019 return 0; 1020 } 1021 #endif /* INET6 */ 1022 1023 static int 1024 set_ipv4_no_pmtu_disc(struct nsd_socket *sock) 1025 { 1026 int ret = 0; 1027 1028 #if defined(IP_MTU_DISCOVER) 1029 int opt = IP_MTU_DISCOVER; 1030 int optval; 1031 # if defined(IP_PMTUDISC_OMIT) 1032 /* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU 1033 * information and send packets with DF=0. Fragmentation is allowed if 1034 * and only if the packet size exceeds the outgoing interface MTU or 1035 * the packet encounters smaller MTU link in network. This mitigates 1036 * DNS fragmentation attacks by preventing forged PMTU information. 1037 * FreeBSD already has same semantics without setting the option. 1038 */ 1039 optval = IP_PMTUDISC_OMIT; 1040 if(0 == setsockopt( 1041 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 1042 { 1043 return 1; 1044 } 1045 1046 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 1047 "IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno)); 1048 # endif /* IP_PMTUDISC_OMIT */ 1049 # if defined(IP_PMTUDISC_DONT) 1050 /* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */ 1051 optval = IP_PMTUDISC_DONT; 1052 if(0 == setsockopt( 1053 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 1054 { 1055 return 1; 1056 } 1057 1058 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 1059 "IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno)); 1060 # endif 1061 ret = -1; 1062 #elif defined(IP_DONTFRAG) 1063 int off = 0; 1064 if (0 == setsockopt( 1065 sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off))) 1066 { 1067 return 1; 1068 } 1069 1070 log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) 
failed: %s", 1071 strerror(errno)); 1072 ret = -1; 1073 #else 1074 (void)sock; 1075 #endif 1076 1077 return ret; 1078 } 1079 1080 static int 1081 set_ip_freebind(struct nsd_socket *sock) 1082 { 1083 #ifdef IP_FREEBIND 1084 int on = 1; 1085 const char *socktype = 1086 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 1087 if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0) 1088 { 1089 return 1; 1090 } 1091 log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s", 1092 socktype, strerror(errno)); 1093 return -1; 1094 #else 1095 (void)sock; 1096 #endif /* IP_FREEBIND */ 1097 1098 return 0; 1099 } 1100 1101 static int 1102 set_ip_transparent(struct nsd_socket *sock) 1103 { 1104 /* 1105 The scandalous preprocessor blob here calls for some explanation :) 1106 POSIX does not specify an option to bind non-local IPs, so 1107 platforms developed several implementation-specific options, 1108 all set in the same way, but with different names. 1109 For additional complexity, some platform manage this setting 1110 differently for different address families (IPv4 vs IPv6). 1111 This scandalous preprocessor blob below abstracts such variability 1112 in the way which leaves the C code as lean and clear as possible. 1113 */ 1114 1115 #if defined(IP_TRANSPARENT) 1116 # define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT 1117 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 1118 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT" 1119 // as of 2020-01, Linux does not support this on IPv6 programmatically 1120 #elif defined(SO_BINDANY) 1121 # define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY 1122 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET 1123 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY" 1124 #elif defined(IP_BINDANY) 1125 # define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY 1126 # define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY 1127 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 1128 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6 1129 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY" 1130 #endif 1131 1132 #ifndef NSD_SOCKET_OPTION_TRANSPARENT 1133 (void)sock; 1134 #else 1135 # ifndef NSD_SOCKET_OPTION_TRANSPARENT6 1136 # define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT 1137 # endif 1138 # ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 1139 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL 1140 # endif 1141 # ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6 1142 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME 1143 # endif 1144 1145 int on = 1; 1146 const char *socktype = 1147 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 1148 const int is_ip6 = (sock->addr.ai_family == AF_INET6); 1149 1150 if(0 == setsockopt( 1151 sock->s, 1152 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL, 1153 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT, 1154 &on, sizeof(on))) 1155 { 1156 return 1; 1157 } 1158 1159 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s", 1160 is_ip6 ? 
static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platforms manage this setting
	differently for different address families (IPv4 vs IPv6).
	The scandalous preprocessor blob below abstracts such variability
	in a way that leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
# define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT"
// as of 2020-01, Linux does not support this on IPv6 programmatically
#elif defined(SO_BINDANY)
# define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY"
#elif defined(IP_BINDANY)
# define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
# ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#  define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
# endif
# ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#  define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
# endif
# ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#  define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
# endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME,
		socktype, strerror(errno));
	return -1;
#endif

	return 0;
}

static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. The
	 * limit is a defense against IP spoofing attacks as suggested in
	 * RFC 7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* The macOS implementation only supports qlen of 1 via this call.
	 * The actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
				 "; this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity is enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
			"SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}
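/*
 * Illustrative note: SO_BINDTODEVICE is Linux-specific and has
 * historically required elevated privileges (CAP_NET_RAW), so it must
 * be applied while nsd still holds its initial privileges. The helper
 * below, set_setfib(), is the FreeBSD counterpart for selecting an
 * alternate routing table (FIB) rather than a specific interface.
 */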
failed: %s", 1254 "SO_SETFIB", sock->fib, strerror(errno)); 1255 return -1; 1256 } 1257 1258 return 1; 1259 #else 1260 (void)sock; 1261 return 0; 1262 #endif 1263 } 1264 1265 static int 1266 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works) 1267 { 1268 int rcv = 1*1024*1024, snd = 1*1024*1024; 1269 1270 if(-1 == (sock->s = socket( 1271 sock->addr.ai_family, sock->addr.ai_socktype, 0))) 1272 { 1273 #ifdef INET6 1274 if((sock->flags & NSD_SOCKET_IS_OPTIONAL) && 1275 (sock->addr.ai_family == AF_INET6) && 1276 (errno == EAFNOSUPPORT)) 1277 { 1278 log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: " 1279 "not supported"); 1280 return 0; 1281 } 1282 #endif 1283 log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); 1284 return -1; 1285 } 1286 1287 set_cloexec(sock); 1288 1289 if(nsd->reuseport && reuseport_works && *reuseport_works) 1290 *reuseport_works = (set_reuseport(sock) == 1); 1291 1292 if(nsd->options->receive_buffer_size > 0) 1293 rcv = nsd->options->receive_buffer_size; 1294 if(set_rcvbuf(sock, rcv) == -1) 1295 return -1; 1296 1297 if(nsd->options->send_buffer_size > 0) 1298 snd = nsd->options->send_buffer_size; 1299 if(set_sndbuf(sock, snd) == -1) 1300 return -1; 1301 #ifdef INET6 1302 if(sock->addr.ai_family == AF_INET6) { 1303 if(set_ipv6_v6only(sock) == -1 || 1304 set_ipv6_use_min_mtu(sock) == -1) 1305 return -1; 1306 } else 1307 #endif /* INET6 */ 1308 if(sock->addr.ai_family == AF_INET) { 1309 if(set_ipv4_no_pmtu_disc(sock) == -1) 1310 return -1; 1311 } 1312 1313 /* Set socket to non-blocking. Otherwise, on operating systems 1314 * with thundering herd problems, the UDP recv could block 1315 * after select returns readable. 1316 */ 1317 set_nonblock(sock); 1318 1319 if(nsd->options->ip_freebind) 1320 (void)set_ip_freebind(sock); 1321 if(nsd->options->ip_transparent) 1322 (void)set_ip_transparent(sock); 1323 if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1) 1324 return -1; 1325 if(sock->fib != -1 && set_setfib(sock) == -1) 1326 return -1; 1327 1328 if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) { 1329 char buf[256]; 1330 addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf)); 1331 log_msg(LOG_ERR, "can't bind udp socket %s: %s", 1332 buf, strerror(errno)); 1333 return -1; 1334 } 1335 1336 return 1; 1337 } 1338 1339 static int 1340 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works) 1341 { 1342 #ifdef USE_TCP_FASTOPEN 1343 report_tcp_fastopen_config(); 1344 #endif 1345 1346 (void)reuseport_works; 1347 1348 if(-1 == (sock->s = socket( 1349 sock->addr.ai_family, sock->addr.ai_socktype, 0))) 1350 { 1351 #ifdef INET6 1352 if((sock->flags & NSD_SOCKET_IS_OPTIONAL) && 1353 (sock->addr.ai_family == AF_INET6) && 1354 (errno == EAFNOSUPPORT)) 1355 { 1356 log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: " 1357 "not supported"); 1358 return 0; 1359 } 1360 #endif /* INET6 */ 1361 log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); 1362 return -1; 1363 } 1364 1365 set_cloexec(sock); 1366 1367 if(nsd->reuseport && reuseport_works && *reuseport_works) 1368 *reuseport_works = (set_reuseport(sock) == 1); 1369 1370 (void)set_reuseaddr(sock); 1371 1372 #ifdef INET6 1373 if(sock->addr.ai_family == AF_INET6) { 1374 if (set_ipv6_v6only(sock) == -1 || 1375 set_ipv6_use_min_mtu(sock) == -1) 1376 return -1; 1377 } 1378 #endif 1379 1380 if(nsd->tcp_mss > 0) 1381 set_tcp_maxseg(sock, nsd->tcp_mss); 1382 /* (StevensUNP p463), if TCP listening socket is blocking, then 1383 
static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if the TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, set up reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			memset(&nsd->udp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
			 * This means we should not close TCP used by
			 * other servers in reuseport enabled mode, in
			 * server_child().
			 */
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	/* open server interface ports for verifiers */
	for(i = 0; i < nsd->verify_ifs; i++) {
		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
		{
			return -1;
		}
	}

	return 0;
}
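/*
 * Illustrative example of the reuseport expansion in server_init():
 * with 2 configured interfaces and reuseport: 4 in nsd.conf, the
 * arrays grow to ifs = 2 * 4 = 8 slots. Slots 0..1 keep the original
 * sockets; slots 2..7 are copies of slot i % 2. Each copy gets a
 * freshly opened UDP socket, so the kernel can load-balance across the
 * SO_REUSEPORT group, while the TCP entries simply share the two
 * original listening file descriptors.
 */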
/*
 * Prepare the server for takeoff.
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
# ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
# endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database: %s", strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
		server_stat_free(nsd);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files can be read */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Initialize times... */
	time(&nsd->st->boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially. */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

static void
server_close_socket(struct nsd_socket *sock)
{
	if(sock->s != -1) {
		close(sock->s);
		sock->s = -1;
	}
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		server_close_socket(&sockets[i]);
	}
}
/*
 * Close the sockets, shut down the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
#ifdef HAVE_SSL
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_free_ixfr(nsd->db);
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
		server_stat_free(nsd);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
		server_stat_free(nsd);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}
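/*
 * Illustrative note: the two task files created above act as a double
 * buffer between this process and xfrd. nsd->mytask selects the file
 * this side is currently filling, while xfrd works on task[1-mytask];
 * server_send_soa_xfrd() below swaps the roles once both sides have
 * processed their half, so neither process appends to a task list the
 * other is consuming.
 */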
void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free the allocation already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use the other task than I am using, since if xfrd died and
		 * is restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* ENOTREACH */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}
void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 * the parent fills one taskdb with soas, xfrd fills the other with
	 * expires. then they exchange and process.
	 * shortsoa: xfrd crashed and needs to be restarted while one taskdb
	 * may be in use by reload. Fill SOA in taskdb and give it to xfrd.
	 * expire notifications can be sent back via a normal reload later
	 * (xfrd will wait for the currently running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
			daemon_remote_close(nsd->rc);
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
			server_stat_free(nsd);
#endif
			server_shutdown(nsd);
			/* ENOTREACH */
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task works (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}

#ifdef HAVE_SSL
static void
log_crypto_from_err(const char* str, unsigned long err)
{
	/* error:[error code]:[library name]:[function name]:[reason string] */
	char buf[128];
	unsigned long e;
	ERR_error_string_n(err, buf, sizeof(buf));
	log_msg(LOG_ERR, "%s crypto %s", str, buf);
	while( (e=ERR_get_error()) ) {
		ERR_error_string_n(e, buf, sizeof(buf));
		log_msg(LOG_ERR, "and additionally crypto %s", buf);
	}
}

void
log_crypto_err(const char* str)
{
	log_crypto_from_err(str, ERR_get_error());
}
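/*
 * Illustrative note: OpenSSL keeps a per-thread queue of pending
 * errors and ERR_get_error() pops one entry per call, so
 * log_crypto_from_err() loops until the queue is empty. A single
 * failure then cannot leave stale entries behind to be misattributed
 * to the next, unrelated error report.
 */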
#ifdef HAVE_SSL
static void
log_crypto_from_err(const char* str, unsigned long err)
{
	/* error:[error code]:[library name]:[function name]:[reason string] */
	char buf[128];
	unsigned long e;
	ERR_error_string_n(err, buf, sizeof(buf));
	log_msg(LOG_ERR, "%s crypto %s", str, buf);
	while( (e=ERR_get_error()) ) {
		ERR_error_string_n(e, buf, sizeof(buf));
		log_msg(LOG_ERR, "and additionally crypto %s", buf);
	}
}

void
log_crypto_err(const char* str)
{
	log_crypto_from_err(str, ERR_get_error());
}

/** true if the ssl handshake error has to be squelched from the logs */
static int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	/* this is very specific, we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
# ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
# endif
#endif
		)
		return 1;
	return 0;
}

void
perform_openssl_init(void)
{
	/* init SSL library */
#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
	ERR_load_crypto_strings();
#endif
#if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
	ERR_load_SSL_strings();
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
	OpenSSL_add_all_algorithms();
#else
	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
		| OPENSSL_INIT_ADD_ALL_DIGESTS
		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
	(void)SSL_library_init();
#else
	OPENSSL_init_ssl(0, NULL);
#endif

	if(!RAND_status()) {
		/* try to seed it */
		unsigned char buf[256];
		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
		size_t i;
		v = seed;
		for(i=0; i<256/sizeof(v); i++) {
			memmove(buf+i*sizeof(v), &v, sizeof(v));
			v = v*seed + (unsigned int)i;
		}
		RAND_seed(buf, 256);
		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
	}
}

static int
get_ocsp(char *filename, unsigned char **ocsp)
{
	BIO *bio;
	OCSP_RESPONSE *response;
	int len = -1;
	unsigned char *p, *buf;
	assert(filename);

	if ((bio = BIO_new_file(filename, "r")) == NULL) {
		log_crypto_err("get_ocsp: BIO_new_file failed");
		return -1;
	}

	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
		BIO_free(bio);
		return -1;
	}

	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	if ((buf = malloc((size_t) len)) == NULL) {
		log_msg(LOG_ERR, "get_ocsp: malloc failed");
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	p = buf;
	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
		free(buf);
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	OCSP_RESPONSE_free(response);
	BIO_free(bio);

	*ocsp = buf;
	return len;
}
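/*
 * Illustrative sketch (not compiled): get_ocsp() returns the DER length
 * and hands back a malloc'ed buffer, so the caller owns and must free it.
 * The file name used here is hypothetical.
 */
#if 0
static void
load_ocsp_sketch(void)
{
	unsigned char* der = NULL;
	long der_len = get_ocsp("example-ocsp.der", &der); /* hypothetical file */
	if(der_len < 0)
		return; /* error was already logged by get_ocsp */
	/* ... hand der/der_len to the TLS context ... */
	free(der);
}
#endif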
/* further setup ssl ctx after the keys are loaded */
static void
listen_sslctx_setup_2(void* ctxt)
{
	SSL_CTX* ctx = (SSL_CTX*)ctxt;
	(void)ctx;
#if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
		/* ENOTREACH */
		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
	}
#elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
	if(1) {
		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
		if (!ecdh) {
			log_crypto_err("could not find p256, not enabling ECDHE");
		} else {
			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
			}
			EC_KEY_free (ecdh);
		}
	}
#endif
}

static int
add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
{
	if(ocspdata) {
		unsigned char *p;
		if ((p=malloc(ocspdata_len)) == NULL) {
			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
			return SSL_TLSEXT_ERR_NOACK;
		}
		memcpy(p, ocspdata, ocspdata_len);
		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
			free(p);
			return SSL_TLSEXT_ERR_NOACK;
		}
		return SSL_TLSEXT_ERR_OK;
	} else {
		return SSL_TLSEXT_ERR_NOACK;
	}
}
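/*
 * Illustrative sketch (not compiled): how the stapling callback above is
 * wired into a context. server_tls_ctx_create() below does this for real
 * after filling the ocspdata/ocspdata_len globals via get_ocsp().
 */
#if 0
static int
enable_ocsp_stapling_sketch(SSL_CTX* ctx)
{
	/* OpenSSL invokes add_ocsp_data_cb when a client asks for a staple */
	return SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb);
}
#endif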
log_msg(LOG_ERR, "error for private key file: %s", key); 2128 log_crypto_err("Error in SSL_CTX use_PrivateKey_file"); 2129 SSL_CTX_free(ctx); 2130 return NULL; 2131 } 2132 if(!SSL_CTX_check_private_key(ctx)) { 2133 log_msg(LOG_ERR, "error for key file: %s", key); 2134 log_crypto_err("Error in SSL_CTX check_private_key"); 2135 SSL_CTX_free(ctx); 2136 return NULL; 2137 } 2138 listen_sslctx_setup_2(ctx); 2139 if(verifypem && verifypem[0]) { 2140 if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) { 2141 log_crypto_err("Error in SSL_CTX verify locations"); 2142 SSL_CTX_free(ctx); 2143 return NULL; 2144 } 2145 SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem)); 2146 SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL); 2147 } 2148 return ctx; 2149 } 2150 2151 SSL_CTX* 2152 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile) 2153 { 2154 char *key, *pem; 2155 SSL_CTX *ctx; 2156 2157 key = nsd->options->tls_service_key; 2158 pem = nsd->options->tls_service_pem; 2159 if(!key || key[0] == 0) { 2160 log_msg(LOG_ERR, "error: no tls-service-key file specified"); 2161 return NULL; 2162 } 2163 if(!pem || pem[0] == 0) { 2164 log_msg(LOG_ERR, "error: no tls-service-pem file specified"); 2165 return NULL; 2166 } 2167 2168 /* NOTE:This mimics the existing code in Unbound 1.5.1 by supporting SSL but 2169 * raft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2*/ 2170 ctx = server_tls_ctx_setup(key, pem, verifypem); 2171 if(!ctx) { 2172 log_msg(LOG_ERR, "could not setup server TLS context"); 2173 return NULL; 2174 } 2175 if(ocspfile && ocspfile[0]) { 2176 if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) { 2177 log_crypto_err("Error reading OCSPfile"); 2178 SSL_CTX_free(ctx); 2179 return NULL; 2180 } else { 2181 VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile)); 2182 if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) { 2183 log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb"); 2184 SSL_CTX_free(ctx); 2185 return NULL; 2186 } 2187 } 2188 } 2189 return ctx; 2190 } 2191 2192 /* check if tcp_handler_accept_data created for TLS dedicated port */ 2193 int 2194 using_tls_port(struct sockaddr* addr, const char* tls_port) 2195 { 2196 in_port_t port = 0; 2197 2198 if (addr->sa_family == AF_INET) 2199 port = ((struct sockaddr_in*)addr)->sin_port; 2200 #ifndef HAVE_STRUCT_SOCKADDR_IN6 2201 else 2202 port = ((struct sockaddr_in6*)addr)->sin6_port; 2203 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */ 2204 if (atoi(tls_port) == ntohs(port)) 2205 return 1; 2206 2207 return 0; 2208 } 2209 #endif 2210 2211 /* pass timeout=-1 for blocking. 
Returns size, 0, -1(err), or -2(timeout) */ 2212 ssize_t 2213 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout) 2214 { 2215 uint8_t* buf = (uint8_t*) p; 2216 ssize_t total = 0; 2217 struct pollfd fd; 2218 memset(&fd, 0, sizeof(fd)); 2219 fd.fd = s; 2220 fd.events = POLLIN; 2221 2222 while( total < sz) { 2223 ssize_t ret; 2224 ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000); 2225 if(ret == -1) { 2226 if(errno == EAGAIN) 2227 /* blocking read */ 2228 continue; 2229 if(errno == EINTR) { 2230 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2231 return -1; 2232 /* other signals can be handled later */ 2233 continue; 2234 } 2235 /* some error */ 2236 return -1; 2237 } 2238 if(ret == 0) { 2239 /* operation timed out */ 2240 return -2; 2241 } 2242 ret = read(s, buf+total, sz-total); 2243 if(ret == -1) { 2244 if(errno == EAGAIN) 2245 /* blocking read */ 2246 continue; 2247 if(errno == EINTR) { 2248 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2249 return -1; 2250 /* other signals can be handled later */ 2251 continue; 2252 } 2253 /* some error */ 2254 return -1; 2255 } 2256 if(ret == 0) { 2257 /* closed connection! */ 2258 return 0; 2259 } 2260 total += ret; 2261 } 2262 return total; 2263 } 2264 2265 static void 2266 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket) 2267 { 2268 sig_atomic_t cmd = NSD_QUIT_SYNC; 2269 udb_ptr t, next; 2270 udb_base* u = nsd->task[nsd->mytask]; 2271 udb_ptr_init(&next, u); 2272 udb_ptr_new(&t, u, udb_base_get_userdata(u)); 2273 udb_base_set_userdata(u, 0); 2274 while(!udb_ptr_is_null(&t)) { 2275 /* store next in list so this one can be deleted or reused */ 2276 udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next); 2277 udb_rptr_zero(&TASKLIST(&t)->next, u); 2278 2279 /* process task t */ 2280 /* append results for task t and update last_task */ 2281 task_process_in_reload(nsd, u, last_task, &t); 2282 2283 /* go to next */ 2284 udb_ptr_set_ptr(&t, u, &next); 2285 2286 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2287 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2288 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2289 if(cmd == NSD_QUIT) { 2290 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2291 /* unlink files of remainder of tasks */ 2292 while(!udb_ptr_is_null(&t)) { 2293 if(TASKLIST(&t)->task_type == task_apply_xfr) { 2294 xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno); 2295 } 2296 udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next); 2297 } 2298 udb_ptr_unlink(&t, u); 2299 udb_ptr_unlink(&next, u); 2300 exit(0); 2301 } 2302 } 2303 2304 } 2305 udb_ptr_unlink(&t, u); 2306 udb_ptr_unlink(&next, u); 2307 } 2308 2309 void server_verify(struct nsd *nsd, int cmdsocket); 2310 2311 /* 2312 * Reload the database, stop parent, re-fork children and continue. 2313 * as server_main. 
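/*
 * Illustrative sketch (not compiled): reading a command with a timeout
 * and retrying on -2, the pattern server_reload() uses below with
 * RELOAD_SYNC_TIMEOUT.
 */
#if 0
static int
read_cmd_sketch(struct nsd* nsd, int fd)
{
	sig_atomic_t cmd = 0;
	ssize_t ret;
	do {
		ret = block_read(nsd, fd, &cmd, sizeof(cmd), 5 /* seconds */);
	} while(ret == -2); /* timed out; try again */
	return (ret == sizeof(cmd)) ? (int)cmd : -1;
}
#endif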
static void
reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
{
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	udb_ptr t, next;
	udb_base* u = nsd->task[nsd->mytask];
	udb_ptr_init(&next, u);
	udb_ptr_new(&t, u, udb_base_get_userdata(u));
	udb_base_set_userdata(u, 0);
	while(!udb_ptr_is_null(&t)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
		udb_rptr_zero(&TASKLIST(&t)->next, u);

		/* process task t */
		/* append results for task t and update last_task */
		task_process_in_reload(nsd, u, last_task, &t);

		/* go to next */
		udb_ptr_set_ptr(&t, u, &next);

		/* if the parent has quit, we must quit too, poll the fd for cmds */
		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
			if(cmd == NSD_QUIT) {
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
				/* unlink files of remainder of tasks */
				while(!udb_ptr_is_null(&t)) {
					if(TASKLIST(&t)->task_type == task_apply_xfr) {
						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
					}
					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
				}
				udb_ptr_unlink(&t, u);
				udb_ptr_unlink(&next, u);
				exit(0);
			}
		}
	}
	udb_ptr_unlink(&t, u);
	udb_ptr_unlink(&next, u);
}

void server_verify(struct nsd *nsd, int cmdsocket);

/*
 * Reload the database, stop parent, re-fork children and continue
 * as server_main.
 */
static void
server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
	int cmdsocket)
{
	pid_t mypid;
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	int ret;
	udb_ptr last_task;
	struct sigaction old_sigchld, ign_sigchld;
	struct radnode* node;
	zone_type* zone;
	enum soainfo_hint hint;
	/* ignore SIGCHLD from the previous server_main that used this pid */
	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
	ign_sigchld.sa_handler = SIG_IGN;
	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);

#ifdef HAVE_SETPROCTITLE
	setproctitle("main");
#endif
#ifdef HAVE_CPUSET_T
	if(nsd->use_cpu_affinity) {
		set_cpu_affinity(nsd->cpuset);
	}
#endif

	/* see what tasks we got from xfrd */
	task_remap(nsd->task[nsd->mytask]);
	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
	reload_process_tasks(nsd, &last_task, cmdsocket);

#ifndef NDEBUG
	if(nsd_debug_level >= 1)
		region_log_stats(nsd->db->region);
#endif /* NDEBUG */
	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Restart dumping stats if required. */
	time(&nsd->st->boot);
	set_bind8_alarm(nsd);
	/* Switch to a different set of stat arrays for the new server
	 * processes, because they can briefly coexist with the old
	 * processes. They have their own stat structure. */
	nsd->stat_current = (nsd->stat_current==0?1:0);
#endif
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for new children */
	server_zonestat_switch(nsd);
#endif

	if(nsd->options->verify_enable) {
#ifdef RATELIMIT
		/* allocate resources for rate limiting. use a slot that is
		   guaranteed not mapped to a file so no persistent data is
		   overwritten */
		rrl_init(nsd->child_count + 1);
#endif

		/* spin up a server and execute verifiers for each zone */
		server_verify(nsd, cmdsocket);
#ifdef RATELIMIT
		/* deallocate rate limiting resources */
		rrl_deinit(nsd->child_count + 1);
#endif
	}

	for(node = radix_first(nsd->db->zonetree);
	    node != NULL;
	    node = radix_next(node))
	{
		zone = (zone_type *)node->elem;
		if(zone->is_updated) {
			if(zone->is_bad) {
				nsd->mode = NSD_RELOAD_FAILED;
				hint = soainfo_bad;
			} else {
				hint = soainfo_ok;
			}
			/* update(s), verified or not, possibly with subsequent
			   skipped update(s). Skipped update(s) are picked up
			   by the failed update check in xfrd */
			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
				zone, hint);
		} else if(zone->is_skipped) {
			/* corrupt or inconsistent update without preceding
			   update(s), communicate soainfo_gone */
			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
				zone, soainfo_gone);
		}
		zone->is_updated = 0;
		zone->is_skipped = 0;
	}

	if(nsd->mode == NSD_RELOAD_FAILED) {
		exit(NSD_RELOAD_FAILED);
	}

	/* listen for the signals of failed children again */
	sigaction(SIGCHLD, &old_sigchld, NULL);
#ifdef USE_DNSTAP
	if (nsd->dt_collector) {
		int *swap_fd_send;
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
		/* Swap fd_send with fd_swap so the old serve children and the
		 * new serve children will not write to the same pipe ends
		 * simultaneously */
		swap_fd_send = nsd->dt_collector_fd_send;
		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
		nsd->dt_collector_fd_swap = swap_fd_send;
	}
#endif
	/* Start new child processes */
	if (server_start_children(nsd, server_region, netio, &nsd->
		xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}

	/* if the parent has quit, we must quit too, poll the fd for cmds */
	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
		if(cmd == NSD_QUIT) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
			send_children_quit(nsd);
			exit(0);
		}
	}

	/* Send quit command to parent: blocking, wait for receipt. */
	do {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
		cmd = NSD_QUIT_SYNC;
		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
		{
			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
				strerror(errno));
		}
		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
			RELOAD_SYNC_TIMEOUT);
		if(ret == -2) {
			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
		}
	} while (ret == -2);
	if(ret == -1) {
		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
			strerror(errno));
	}
	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
	if(cmd == NSD_QUIT) {
		/* small race condition possible here, parent got quit cmd. */
		send_children_quit(nsd);
		exit(1);
	}
	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
	task_process_sync(nsd->task[nsd->mytask]);
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for next children */
#endif

	/* send soainfo to the xfrd process, signal it that reload is done,
	 * it picks up the taskudb */
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
			strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	/* try to reopen file */
	if (nsd->file_rotation_ok)
		log_reopen(nsd->log_filename, 1);
	/* exit reload, continue as new server_main */
}
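/*
 * Shutdown handshake performed by server_reload() above:
 *
 *   reload -> old main : NSD_QUIT_SYNC   (repeated every RELOAD_SYNC_TIMEOUT)
 *   old main -> reload : NSD_RELOAD      (ack; old main exits)
 *   reload -> xfrd     : NSD_RELOAD_DONE (soainfo taskudb ready)
 *   reload -> xfrd     : pid_t           (reload's pid; it is the new main)
 */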
/*
 * Get the mode depending on the signal hints that have been received.
 * Multiple signal hints can be received and will be handled in turn.
 */
static sig_atomic_t
server_signal_mode(struct nsd *nsd)
{
	if(nsd->signal_hint_quit) {
		nsd->signal_hint_quit = 0;
		return NSD_QUIT;
	}
	else if(nsd->signal_hint_shutdown) {
		nsd->signal_hint_shutdown = 0;
		return NSD_SHUTDOWN;
	}
	else if(nsd->signal_hint_child) {
		nsd->signal_hint_child = 0;
		return NSD_REAP_CHILDREN;
	}
	else if(nsd->signal_hint_reload) {
		nsd->signal_hint_reload = 0;
		return NSD_RELOAD;
	}
	else if(nsd->signal_hint_reload_hup) {
		nsd->signal_hint_reload_hup = 0;
		return NSD_RELOAD_REQ;
	}
	else if(nsd->signal_hint_stats) {
		nsd->signal_hint_stats = 0;
#ifdef BIND8_STATS
		set_bind8_alarm(nsd);
#endif
		return NSD_STATS;
	}
	else if(nsd->signal_hint_statsusr) {
		nsd->signal_hint_statsusr = 0;
		return NSD_STATS;
	}
	return NSD_RUN;
}
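/*
 * Illustrative sketch (not compiled): both server_main() and
 * server_child() below consume the signal hints with this pattern;
 * handle_mode() is a hypothetical stand-in for their switch statements.
 */
#if 0
static void
mode_loop_sketch(struct nsd* nsd)
{
	sig_atomic_t mode;
	while((mode = nsd->mode) != NSD_SHUTDOWN) {
		if(mode == NSD_RUN)
			nsd->mode = mode = server_signal_mode(nsd);
		handle_mode(nsd, mode); /* hypothetical */
	}
}
#endif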
/*
 * The main server simply waits for signals and child processes to
 * terminate. Child processes are restarted as necessary.
 */
void
server_main(struct nsd *nsd)
{
	region_type *server_region = region_create(xalloc, free);
	netio_type *netio = netio_create(server_region);
	netio_handler_type reload_listener;
	int reload_sockets[2] = {-1, -1};
	struct timespec timeout_spec;
	int status;
	pid_t child_pid;
	pid_t reload_pid = -1;
	sig_atomic_t mode;

	/* Ensure we are the main process */
	assert(nsd->server_kind == NSD_SERVER_MAIN);

	/* Add listener for the XFRD process */
	netio_add_handler(netio, nsd->xfrd_listener);

#ifdef BIND8_STATS
	nsd->st = &nsd->stat_map[0];
	nsd->st->db_disk = 0;
	nsd->st->db_mem = region_get_mem(nsd->db->region);
#endif

	/* Start the child processes that handle incoming queries */
	if (server_start_children(nsd, server_region, netio,
		&nsd->xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}
	reload_listener.fd = -1;

	/* This_child MUST be 0, because this is the parent process */
	assert(nsd->this_child == 0);

	/* Run the server until we get a shutdown signal */
	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
		/* Did we receive a signal that changes our mode? */
		if(mode == NSD_RUN) {
			nsd->mode = mode = server_signal_mode(nsd);
		}

		switch (mode) {
		case NSD_RUN:
			/* see if any child processes terminated */
			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
				int is_child = delete_child_pid(nsd, child_pid);
				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
					if(nsd->children[is_child].child_fd == -1)
						nsd->children[is_child].has_exited = 1;
					parent_check_all_children_exited(nsd);
				} else if(is_child != -1) {
					log_msg(LOG_WARNING,
						"server %d died unexpectedly with status %d, restarting",
						(int) child_pid, status);
					restart_child_servers(nsd, server_region, netio,
						&nsd->xfrd_listener->fd);
				} else if (child_pid == reload_pid) {
					sig_atomic_t cmd = NSD_RELOAD_FAILED;
					pid_t mypid;
					log_msg(LOG_WARNING,
						"Reload process %d failed with status %d, continuing with old database",
						(int) child_pid, status);
					reload_pid = -1;
					if(reload_listener.fd != -1) close(reload_listener.fd);
					netio_remove_handler(netio, &reload_listener);
					reload_listener.fd = -1;
					reload_listener.event_types = NETIO_EVENT_NONE;
					task_process_sync(nsd->task[nsd->mytask]);
					/* inform xfrd reload attempt ended */
					if(!write_socket(nsd->xfrd_listener->fd,
						&cmd, sizeof(cmd))) {
						log_msg(LOG_ERR, "problems "
							"sending SOAEND to xfrd: %s",
							strerror(errno));
					}
					mypid = getpid();
					if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
							strerror(errno));
					}
#ifdef USE_DNSTAP
				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
					log_msg(LOG_WARNING,
						"dnstap-collector %d terminated with status %d",
						(int) child_pid, status);
					if(nsd->dt_collector) {
						dt_collector_close(nsd->dt_collector, nsd);
						dt_collector_destroy(nsd->dt_collector, nsd);
						nsd->dt_collector = NULL;
					}
					/* Only respawn a crashed (or exited)
					 * dnstap-collector when not reloading,
					 * to not induce a reload during a
					 * reload (which would seriously
					 * disrupt nsd procedures and lead to
					 * unpredictable results)!
					 *
					 * This will *leave* a dnstap-collector
					 * process terminated, but because
					 * signalling the main process from the
					 * reload process to respawn in this
					 * situation would be cumbersome, and
					 * because this situation is so
					 * specific (and therefore hopefully
					 * extremely rare or non-existent),
					 * plus the fact that we are left with
					 * a perfectly functioning NSD (besides
					 * not logging dnstap messages), it is
					 * acceptable to leave this unresolved.
					 */
					if(reload_pid == -1 && nsd->options->dnstap_enable) {
						nsd->dt_collector = dt_collector_create(nsd);
						dt_collector_start(nsd->dt_collector, nsd);
						nsd->mode = NSD_RELOAD_REQ;
					}
#endif
				} else if(status != 0) {
					/* check for status, because we get
					 * the old server-main process here
					 * (reload is the process-parent of
					 * old-main), and we get older server
					 * processes that are exiting after a
					 * reload */
					log_msg(LOG_WARNING,
						"process %d terminated with status %d",
						(int) child_pid, status);
				}
			}
			if (child_pid == -1) {
				if (errno == EINTR) {
					continue;
				}
				if (errno != ECHILD)
					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
			}
			if (nsd->mode != NSD_RUN)
				break;

			/* timeout to collect processes. In case no sigchild happens. */
			timeout_spec.tv_sec = 60;
			timeout_spec.tv_nsec = 0;

			/* listen on ports, timeout for collecting terminated children */
			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
				if (errno != EINTR) {
					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
				}
			}
			if(nsd->restart_children) {
				restart_child_servers(nsd, server_region, netio,
					&nsd->xfrd_listener->fd);
				nsd->restart_children = 0;
			}
			if(nsd->reload_failed) {
				sig_atomic_t cmd = NSD_RELOAD_FAILED;
				pid_t mypid;
				nsd->reload_failed = 0;
				log_msg(LOG_WARNING,
					"Reload process %d failed, continuing with old database",
					(int) reload_pid);
				reload_pid = -1;
				if(reload_listener.fd != -1) close(reload_listener.fd);
				netio_remove_handler(netio, &reload_listener);
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				task_process_sync(nsd->task[nsd->mytask]);
				/* inform xfrd reload attempt ended */
				if(!write_socket(nsd->xfrd_listener->fd,
					&cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "problems "
						"sending SOAEND to xfrd: %s",
						strerror(errno));
				}
				mypid = getpid();
				if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
						strerror(errno));
				}
			}

			break;
		case NSD_RELOAD_REQ: {
			sig_atomic_t cmd = NSD_RELOAD_REQ;
			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
			DEBUG(DEBUG_IPC,1, (LOG_INFO,
				"main: ipc send reload_req to xfrd"));
			if(!write_socket(nsd->xfrd_listener->fd,
				&cmd, sizeof(cmd))) {
				log_msg(LOG_ERR, "server_main: could not send "
					"reload_req to xfrd: %s", strerror(errno));
			}
			nsd->mode = NSD_RUN;
			} break;
		case NSD_RELOAD:
			/* Continue to run nsd after reload */
			nsd->mode = NSD_RUN;
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
			if (reload_pid != -1) {
				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
					(int) reload_pid);
				break;
			}

			/* switch the mytask to keep track of who owns task*/
			nsd->mytask = 1 - nsd->mytask;
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
				reload_pid = -1;
				break;
			}

			/* Do actual reload */
			reload_pid = fork();
			switch (reload_pid) {
			case -1:
				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
				break;
			default:
				/* PARENT */
				close(reload_sockets[0]);
				server_reload(nsd, server_region, netio,
					reload_sockets[1]);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
				close(reload_sockets[1]);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
				/* drop stale xfrd ipc data */
				((struct ipc_handler_conn_data*)nsd->
					xfrd_listener->user_data)
					->conn->is_reading = 0;
				reload_pid = -1;
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
				break;
			case 0:
				/* CHILD */
				/* server_main keeps running until NSD_QUIT_SYNC
				 * is received from reload. */
				close(reload_sockets[1]);
				reload_listener.fd = reload_sockets[0];
				reload_listener.timeout = NULL;
				reload_listener.user_data = nsd;
				reload_listener.event_types = NETIO_EVENT_READ;
				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
				netio_add_handler(netio, &reload_listener);
				reload_pid = getppid();
				break;
			}
			break;
		case NSD_QUIT_SYNC:
			/* synchronisation of xfrd, parent and reload */
			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
				sig_atomic_t cmd = NSD_RELOAD;
				/* stop xfrd ipc writes in progress */
				DEBUG(DEBUG_IPC,1, (LOG_INFO,
					"main: ipc send indication reload"));
				if(!write_socket(nsd->xfrd_listener->fd,
					&cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "server_main: could not send reload "
						"indication to xfrd: %s", strerror(errno));
				}
				/* wait for ACK from xfrd */
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
				nsd->quit_sync_done = 1;
			}
			nsd->mode = NSD_RUN;
			break;
		case NSD_QUIT:
			/* silent shutdown during reload */
			if(reload_listener.fd != -1) {
				/* acknowledge the quit, to sync reload that we will really quit now */
				sig_atomic_t cmd = NSD_RELOAD;
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "server_main: "
						"could not ack quit: %s", strerror(errno));
				}
				close(reload_listener.fd);
			}
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
			/* only quit children after xfrd has acked */
			send_children_quit(nsd);

#ifdef MEMCLEAN /* OS collects memory pages */
			region_destroy(server_region);
#endif
			server_shutdown(nsd);

			/* ENOTREACH */
			break;
		case NSD_SHUTDOWN:
			break;
		case NSD_REAP_CHILDREN:
			/* continue; wait for child in run loop */
			nsd->mode = NSD_RUN;
			break;
		case NSD_STATS:
#ifdef BIND8_STATS
			set_children_stats(nsd);
#endif
			nsd->mode = NSD_RUN;
			break;
		default:
			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
			nsd->mode = NSD_RUN;
			break;
		}
	}
	log_msg(LOG_WARNING, "signal received, shutting down...");

	/* close opened ports to avoid race with restart of nsd */
	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	daemon_remote_close(nsd->rc);
	send_children_quit_and_wait(nsd);

	/* Unlink it if possible... */
	unlinkpid(nsd->pidfile);
	unlink(nsd->task[0]->fname);
	unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
	unlink(nsd->zonestatfname[0]);
	unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
	server_stat_free(nsd);
#endif
#ifdef USE_DNSTAP
	dt_collector_close(nsd->dt_collector, nsd);
#endif

	if(reload_listener.fd != -1) {
		sig_atomic_t cmd = NSD_QUIT;
		DEBUG(DEBUG_IPC,1, (LOG_INFO,
			"main: ipc send quit to reload-process"));
		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
				strerror(errno));
		}
		fsync(reload_listener.fd);
		close(reload_listener.fd);
		/* wait for reload to finish processing */
		while(1) {
			if(waitpid(reload_pid, NULL, 0) == -1) {
				if(errno == EINTR) continue;
				if(errno == ECHILD) break;
				log_msg(LOG_ERR, "waitpid(reload %d): %s",
					(int)reload_pid, strerror(errno));
			}
			break;
		}
	}
	if(nsd->xfrd_listener->fd != -1) {
		/* complete quit, stop xfrd */
		sig_atomic_t cmd = NSD_QUIT;
		DEBUG(DEBUG_IPC,1, (LOG_INFO,
			"main: ipc send quit to xfrd"));
		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
				strerror(errno));
		}
		fsync(nsd->xfrd_listener->fd);
		close(nsd->xfrd_listener->fd);
		(void)kill(nsd->pid, SIGTERM);
	}

#ifdef MEMCLEAN /* OS collects memory pages */
	region_destroy(server_region);
#endif
	server_shutdown(nsd);
}
static query_state_type
server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
{
	return query_process(query, nsd, now_p);
}

static query_state_type
server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
{
#ifdef RATELIMIT
	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
		if(query->edns.cookie_status != COOKIE_VALID
		&& query->edns.cookie_status != COOKIE_VALID_REUSE
		&& rrl_process_query(query))
			return rrl_slip(query);
		else return QUERY_PROCESSED;
	}
	return QUERY_DISCARDED;
#else
	return query_process(query, nsd, now_p);
#endif
}

const char*
nsd_event_vs(void)
{
#ifdef USE_MINI_EVENT
	return "";
#else
	return event_get_version();
#endif
}

#if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
static const char* ub_ev_backend2str(int b)
{
	switch(b) {
	case EVBACKEND_SELECT: return "select";
	case EVBACKEND_POLL: return "poll";
	case EVBACKEND_EPOLL: return "epoll";
	case EVBACKEND_KQUEUE: return "kqueue";
	case EVBACKEND_DEVPOLL: return "devpoll";
	case EVBACKEND_PORT: return "evport";
	}
	return "unknown";
}
#endif

const char*
nsd_event_method(void)
{
#ifdef USE_MINI_EVENT
	return "select";
#else
	struct event_base* b = nsd_child_event_base();
	const char* m;
# ifdef EV_FEATURE_BACKENDS
	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
# elif defined(HAVE_EVENT_BASE_GET_METHOD)
	m = event_base_get_method(b);
# else
	m = "?";
# endif
# ifdef MEMCLEAN
	event_base_free(b);
# endif
	return m;
#endif
}
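/*
 * Illustrative sketch (not compiled): logging which event library and
 * backend are in use with the two helpers above; the log text is made up
 * for this example.
 */
#if 0
static void
log_event_backend_sketch(void)
{
	log_msg(LOG_NOTICE, "event lib version \"%s\", method \"%s\"",
		nsd_event_vs(), nsd_event_method());
}
#endif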
struct event_base*
nsd_child_event_base(void)
{
	struct event_base* base;
#ifdef USE_MINI_EVENT
	static time_t secs;
	static struct timeval now;
	base = event_init(&secs, &now);
#else
# if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
	/* libev */
	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
# else
	/* libevent */
#  ifdef HAVE_EVENT_BASE_NEW
	base = event_base_new();
#  else
	base = event_init();
#  endif
# endif
#endif
	return base;
}

static void
add_udp_handler(
	struct nsd *nsd,
	struct nsd_socket *sock,
	struct udp_handler_data *data)
{
	struct event *handler = &data->event;

	data->nsd = nsd;
	data->socket = sock;

	if(nsd->options->proxy_protocol_port &&
		sockaddr_uses_proxy_protocol_port(nsd->options,
		(struct sockaddr *)&sock->addr.ai_addr)) {
		data->pp2_enabled = 1;
	}

	memset(handler, 0, sizeof(*handler));
	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
	if(event_base_set(nsd->event_base, handler) != 0)
		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
	if(event_add(handler, NULL) != 0)
		log_msg(LOG_ERR, "nsd udp: event_add failed");
}

void
add_tcp_handler(
	struct nsd *nsd,
	struct nsd_socket *sock,
	struct tcp_accept_handler_data *data)
{
	struct event *handler = &data->event;

	data->nsd = nsd;
	data->socket = sock;

	if(nsd->options->proxy_protocol_port &&
		sockaddr_uses_proxy_protocol_port(nsd->options,
		(struct sockaddr *)&sock->addr.ai_addr)) {
		data->pp2_enabled = 1;
	}

#ifdef HAVE_SSL
	if (nsd->tls_ctx &&
	    nsd->options->tls_port &&
	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
	{
		data->tls_accept = 1;
		if(verbosity >= 2) {
			char buf[48];
			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
		}
	} else {
		data->tls_accept = 0;
	}
#endif

	memset(handler, 0, sizeof(*handler));
	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data);
	if(event_base_set(nsd->event_base, handler) != 0)
		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
	if(event_add(handler, NULL) != 0)
		log_msg(LOG_ERR, "nsd tcp: event_add failed");
	data->event_added = 1;
}
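/*
 * Illustrative sketch (not compiled): registering one UDP handler per
 * listening socket, the pattern server_verify() and server_child()
 * below follow with their region allocators.
 */
#if 0
static void
register_udp_sketch(struct nsd* nsd, region_type* region, size_t count)
{
	size_t i;
	for(i = 0; i < count; i++) {
		struct udp_handler_data* data =
			region_alloc_zero(region, sizeof(*data));
		add_udp_handler(nsd, &nsd->udp[i], data);
	}
}
#endif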
3108 nsd->verifiers[i].nsd = nsd; 3109 nsd->verifiers[i].zone = NULL; 3110 nsd->verifiers[i].pid = -1; 3111 nsd->verifiers[i].output_stream.fd = -1; 3112 nsd->verifiers[i].output_stream.priority = LOG_INFO; 3113 nsd->verifiers[i].error_stream.fd = -1; 3114 nsd->verifiers[i].error_stream.priority = LOG_ERR; 3115 } 3116 3117 event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd); 3118 if(event_base_set(nsd->event_base, &cmd_event) != 0 || 3119 event_add(&cmd_event, NULL) != 0) 3120 { 3121 log_msg(LOG_ERR, "verify: could not add command event"); 3122 goto fail; 3123 } 3124 3125 event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd); 3126 if(event_base_set(nsd->event_base, &signal_event) != 0 || 3127 signal_add(&signal_event, NULL) != 0) 3128 { 3129 log_msg(LOG_ERR, "verify: could not add signal event"); 3130 goto fail; 3131 } 3132 3133 event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd); 3134 if(event_base_set(nsd->event_base, &exit_event) != 0 || 3135 event_add(&exit_event, NULL) != 0) 3136 { 3137 log_msg(LOG_ERR, "verify: could not add exit event"); 3138 goto fail; 3139 } 3140 3141 memset(msgs, 0, sizeof(msgs)); 3142 for (int i = 0; i < NUM_RECV_PER_SELECT; i++) { 3143 queries[i] = query_create(nsd->server_region, 3144 compressed_dname_offsets, 3145 compression_table_size, compressed_dnames); 3146 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3147 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 3148 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3149 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3150 msgs[i].msg_hdr.msg_iovlen = 1; 3151 msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr; 3152 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3153 } 3154 3155 for (size_t i = 0; i < nsd->verify_ifs; i++) { 3156 struct udp_handler_data *data; 3157 data = region_alloc_zero( 3158 nsd->server_region, sizeof(*data)); 3159 add_udp_handler(nsd, &nsd->verify_udp[i], data); 3160 } 3161 3162 tcp_accept_handler_count = nsd->verify_ifs; 3163 tcp_accept_handlers = region_alloc_array(nsd->server_region, 3164 nsd->verify_ifs, sizeof(*tcp_accept_handlers)); 3165 3166 for (size_t i = 0; i < nsd->verify_ifs; i++) { 3167 struct tcp_accept_handler_data *data; 3168 data = &tcp_accept_handlers[i]; 3169 memset(data, 0, sizeof(*data)); 3170 add_tcp_handler(nsd, &nsd->verify_tcp[i], data); 3171 } 3172 3173 while(nsd->next_zone_to_verify != NULL && 3174 nsd->verifier_count < nsd->verifier_limit) 3175 { 3176 verify_zone(nsd, nsd->next_zone_to_verify); 3177 nsd->next_zone_to_verify 3178 = verify_next_zone(nsd, nsd->next_zone_to_verify); 3179 } 3180 3181 /* short-lived main loop */ 3182 event_base_dispatch(nsd->event_base); 3183 3184 /* remove command and exit event handlers */ 3185 event_del(&exit_event); 3186 event_del(&signal_event); 3187 event_del(&cmd_event); 3188 3189 assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT); 3190 assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT); 3191 fail: 3192 close(nsd->verifier_pipe[0]); 3193 close(nsd->verifier_pipe[1]); 3194 fail_pipe: 3195 event_base_free(nsd->event_base); 3196 region_destroy(nsd->server_region); 3197 3198 nsd->event_base = NULL; 3199 nsd->server_region = NULL; 3200 nsd->verifier_limit = 0; 3201 nsd->verifier_pipe[0] = -1; 3202 nsd->verifier_pipe[1] = -1; 3203 nsd->verifiers = NULL; 3204 } 3205 3206 /* 3207 * Serve DNS requests. 
3208 */ 3209 void 3210 server_child(struct nsd *nsd) 3211 { 3212 size_t i, from, numifs; 3213 region_type *server_region = region_create(xalloc, free); 3214 struct event_base* event_base = nsd_child_event_base(); 3215 sig_atomic_t mode; 3216 3217 if(!event_base) { 3218 log_msg(LOG_ERR, "nsd server could not create event base"); 3219 exit(1); 3220 } 3221 nsd->event_base = event_base; 3222 nsd->server_region = server_region; 3223 3224 #ifdef RATELIMIT 3225 rrl_init(nsd->this_child->child_num); 3226 #endif 3227 3228 assert(nsd->server_kind != NSD_SERVER_MAIN); 3229 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); 3230 3231 #ifdef HAVE_SETPROCTITLE 3232 setproctitle("server %d", nsd->this_child->child_num + 1); 3233 #endif 3234 #ifdef HAVE_CPUSET_T 3235 if(nsd->use_cpu_affinity) { 3236 set_cpu_affinity(nsd->this_child->cpuset); 3237 } 3238 #endif 3239 #ifdef BIND8_STATS 3240 nsd->st = &nsd->stats_per_child[nsd->stat_current] 3241 [nsd->this_child->child_num]; 3242 nsd->st->boot = nsd->stat_map[0].boot; 3243 memcpy(&nsd->stat_proc, nsd->st, sizeof(nsd->stat_proc)); 3244 #endif 3245 3246 if (!(nsd->server_kind & NSD_SERVER_TCP)) { 3247 server_close_all_sockets(nsd->tcp, nsd->ifs); 3248 } 3249 if (!(nsd->server_kind & NSD_SERVER_UDP)) { 3250 server_close_all_sockets(nsd->udp, nsd->ifs); 3251 } 3252 3253 if (nsd->this_child->parent_fd != -1) { 3254 struct event *handler; 3255 struct ipc_handler_conn_data* user_data = 3256 (struct ipc_handler_conn_data*)region_alloc( 3257 server_region, sizeof(struct ipc_handler_conn_data)); 3258 user_data->nsd = nsd; 3259 user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ); 3260 3261 handler = (struct event*) region_alloc( 3262 server_region, sizeof(*handler)); 3263 memset(handler, 0, sizeof(*handler)); 3264 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| 3265 EV_READ, child_handle_parent_command, user_data); 3266 if(event_base_set(event_base, handler) != 0) 3267 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); 3268 if(event_add(handler, NULL) != 0) 3269 log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); 3270 } 3271 3272 if(nsd->reuseport) { 3273 numifs = nsd->ifs / nsd->reuseport; 3274 from = numifs * nsd->this_child->child_num; 3275 if(from+numifs > nsd->ifs) { /* should not happen */ 3276 from = 0; 3277 numifs = nsd->ifs; 3278 } 3279 } else { 3280 from = 0; 3281 numifs = nsd->ifs; 3282 } 3283 3284 if (nsd->server_kind & NSD_SERVER_UDP) { 3285 int child = nsd->this_child->child_num; 3286 memset(msgs, 0, sizeof(msgs)); 3287 for (i = 0; i < NUM_RECV_PER_SELECT; i++) { 3288 queries[i] = query_create(server_region, 3289 compressed_dname_offsets, 3290 compression_table_size, compressed_dnames); 3291 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3292 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 3293 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3294 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3295 msgs[i].msg_hdr.msg_iovlen = 1; 3296 msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr; 3297 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3298 } 3299 3300 for (i = 0; i < nsd->ifs; i++) { 3301 int listen; 3302 struct udp_handler_data *data; 3303 3304 listen = nsd_bitset_isset(nsd->udp[i].servers, child); 3305 3306 if(i >= from && i < (from + numifs) && listen) { 3307 data = region_alloc_zero( 3308 nsd->server_region, sizeof(*data)); 3309 add_udp_handler(nsd, &nsd->udp[i], data); 3310 } else { 3311 /* close sockets intended for other servers */ 3312 server_close_socket(&nsd->udp[i]); 3313 } 3314 } 3315 } 
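	/*
	 * Worked example of the reuseport partition above (values are
	 * hypothetical): with nsd->ifs == 8 sockets and nsd->reuseport == 4
	 * children, numifs = 8/4 = 2, so child_num 2 gets from = 2*2 = 4 and
	 * serves sockets [4,6); the bounds check falls back to all sockets
	 * if the division would leave a child past the end.
	 */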
	/*
	 * Keep track of all the TCP accept handlers so we can enable
	 * and disable them based on the current number of active TCP
	 * connections.
	 */
	if (nsd->server_kind & NSD_SERVER_TCP) {
		int child = nsd->this_child->child_num;
		tcp_accept_handler_count = numifs;
		tcp_accept_handlers = region_alloc_array(server_region,
			numifs, sizeof(*tcp_accept_handlers));

		for (i = 0; i < nsd->ifs; i++) {
			int listen;
			struct tcp_accept_handler_data *data;

			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);

			if(i >= from && i < (from + numifs) && listen) {
				data = &tcp_accept_handlers[i-from];
				memset(data, 0, sizeof(*data));
				add_tcp_handler(nsd, &nsd->tcp[i], data);
			} else {
				/* close sockets intended for other servers */
				/*
				 * uncomment this once tcp servers are no
				 * longer copied in the tcp fd copy line
				 * in server_init().
				server_close_socket(&nsd->tcp[i]);
				*/
				/* close sockets not meant for this server*/
				if(!listen)
					server_close_socket(&nsd->tcp[i]);
			}
		}
	} else {
		tcp_accept_handler_count = 0;
	}

	/* The main loop... */
	while ((mode = nsd->mode) != NSD_QUIT) {
		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);

		/* Do we need to do the statistics... */
		if (mode == NSD_STATS) {
#ifdef BIND8_STATS
			int p = nsd->st_period;
			nsd->st_period = 1; /* force stats printout */
			/* Dump the statistics */
			bind8_stats(nsd);
			nsd->st_period = p;
#else /* !BIND8_STATS */
			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
#endif /* BIND8_STATS */

			nsd->mode = NSD_RUN;
		}
		else if (mode == NSD_REAP_CHILDREN) {
			/* got signal, notify parent. parent reaps terminated children. */
			if (nsd->this_child->parent_fd != -1) {
				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
				if (write(nsd->this_child->parent_fd,
					&parent_notify,
					sizeof(parent_notify)) == -1)
				{
					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
						(int) nsd->this_child->pid, strerror(errno));
				}
			} else /* no parent, so reap 'em */
				while (waitpid(-1, NULL, WNOHANG) > 0) ;
			nsd->mode = NSD_RUN;
		}
		else if(mode == NSD_RUN) {
			/* Wait for a query... */
			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
				if (errno != EINTR) {
					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
					break;
				}
			}
		} else if(mode == NSD_QUIT) {
			/* ignore here, quit */
		} else {
			log_msg(LOG_ERR, "mode bad value %d, back to service.",
				(int)mode);
			nsd->mode = NSD_RUN;
		}
	}

	service_remaining_tcp(nsd);
#ifdef BIND8_STATS
	bind8_stats(nsd);
#endif /* BIND8_STATS */

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_deinit(nsd->this_child->child_num);
#endif
	event_base_free(event_base);
	region_destroy(server_region);
#endif
	server_shutdown(nsd);
}

static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
{
	int* timed_out = (int*)arg;
	assert(event & EV_TIMEOUT); (void)event;
	/* wake up the service tcp thread, note event is no longer
	 * registered */
	*timed_out = 1;
}

void
service_remaining_tcp(struct nsd* nsd)
{
	struct tcp_handler_data* p;
	struct event_base* event_base;
	/* check if it is needed */
	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
		return;
	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
#ifdef USE_DNSTAP
	/* remove dnstap collector, we cannot write there because the new
	 * child process is using the file descriptor, or the child
	 * process after that. */
	dt_collector_destroy(nsd->dt_collector, nsd);
	nsd->dt_collector = NULL;
#endif
	/* setup event base */
	event_base = nsd_child_event_base();
	if(!event_base) {
		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
		return;
	}
	/* register tcp connections */
	for(p = tcp_active_list; p != NULL; p = p->next) {
		struct timeval timeout;
		int fd = p->event.ev_fd;
#ifdef USE_MINI_EVENT
		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
#else
		short event = p->event.ev_events & (EV_READ|EV_WRITE);
#endif
		void (*fn)(int, short, void*);
#ifdef HAVE_SSL
		if(p->tls) {
			if((event&EV_READ))
				fn = handle_tls_reading;
			else fn = handle_tls_writing;
		} else {
#endif
			if((event&EV_READ))
				fn = handle_tcp_reading;
			else fn = handle_tcp_writing;
#ifdef HAVE_SSL
		}
#endif

		p->tcp_no_more_queries = 1;
		/* set timeout to 3 seconds (previously 1/10 second) */
		if(p->tcp_timeout > 3000)
			p->tcp_timeout = 3000;
		timeout.tv_sec = p->tcp_timeout / 1000;
		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
		event_del(&p->event);
		memset(&p->event, 0, sizeof(p->event));
		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
			fn, p);
		if(event_base_set(event_base, &p->event) != 0)
			log_msg(LOG_ERR, "event base set failed");
		if(event_add(&p->event, &timeout) != 0)
			log_msg(LOG_ERR, "event add failed");
	}
3508 &timed_out); 3509 if(event_base_set(event_base, &timeout) != 0) 3510 log_msg(LOG_ERR, "remaintcp timer: event_base_set failed"); 3511 if(event_add(&timeout, &tv) != 0) 3512 log_msg(LOG_ERR, "remaintcp timer: event_add failed"); 3513 3514 /* service loop */ 3515 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3516 if (errno != EINTR) { 3517 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3518 break; 3519 } 3520 } 3521 if(!timed_out) { 3522 event_del(&timeout); 3523 } else { 3524 /* timed out, quit */ 3525 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit")); 3526 break; 3527 } 3528 } 3529 #ifdef MEMCLEAN 3530 event_base_free(event_base); 3531 #endif 3532 /* continue to quit after return */ 3533 } 3534 3535 /* Implement recvmmsg and sendmmsg if the platform does not. These functions 3536 * are always used, even if nonblocking operations are broken, in which case 3537 * NUM_RECV_PER_SELECT is defined to 1 (one). 3538 */ 3539 #if defined(HAVE_RECVMMSG) 3540 #define nsd_recvmmsg recvmmsg 3541 #else /* !HAVE_RECVMMSG */ 3542 3543 static int 3544 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, 3545 int flags, struct timespec *timeout) 3546 { 3547 unsigned int vpos = 0; 3548 ssize_t rcvd; 3549 3550 /* timeout is ignored, ensure caller does not expect it to work */ 3551 assert(timeout == NULL); (void)timeout; 3552 3553 while(vpos < vlen) { 3554 rcvd = recvfrom(sockfd, 3555 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3556 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3557 flags, 3558 msgvec[vpos].msg_hdr.msg_name, 3559 &msgvec[vpos].msg_hdr.msg_namelen); 3560 if(rcvd < 0) { 3561 break; 3562 } else { 3563 assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX); 3564 msgvec[vpos].msg_len = (unsigned int)rcvd; 3565 vpos++; 3566 } 3567 } 3568 3569 if(vpos) { 3570 /* error will be picked up next time */ 3571 return (int)vpos; 3572 } else if(errno == 0) { 3573 return 0; 3574 } else if(errno == EAGAIN) { 3575 return 0; 3576 } 3577 3578 return -1; 3579 } 3580 #endif /* HAVE_RECVMMSG */ 3581 3582 #ifdef HAVE_SENDMMSG 3583 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__) 3584 #else /* !HAVE_SENDMMSG */ 3585 3586 static int 3587 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags) 3588 { 3589 unsigned int vpos = 0; 3590 ssize_t snd; 3591 3592 while(vpos < vlen) { 3593 assert(msgvec[vpos].msg_hdr.msg_iovlen == 1); 3594 snd = sendto(sockfd, 3595 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3596 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3597 flags, 3598 msgvec[vpos].msg_hdr.msg_name, 3599 msgvec[vpos].msg_hdr.msg_namelen); 3600 if(snd < 0) { 3601 break; 3602 } else { 3603 msgvec[vpos].msg_len = (unsigned int)snd; 3604 vpos++; 3605 } 3606 } 3607 3608 if(vpos) { 3609 return (int)vpos; 3610 } else if(errno == 0) { 3611 return 0; 3612 } 3613 3614 return -1; 3615 } 3616 #endif /* HAVE_SENDMMSG */ 3617 3618 static int 3619 port_is_zero( 3620 #ifdef INET6 3621 struct sockaddr_storage *addr 3622 #else 3623 struct sockaddr_in *addr 3624 #endif 3625 ) 3626 { 3627 #ifdef INET6 3628 if(addr->ss_family == AF_INET6) { 3629 return (((struct sockaddr_in6 *)addr)->sin6_port) == 0; 3630 } else if(addr->ss_family == AF_INET) { 3631 return (((struct sockaddr_in *)addr)->sin_port) == 0; 3632 } 3633 return 0; 3634 #else 3635 if(addr->sin_family == AF_INET) { 3636 return addr->sin_port == 0; 3637 } 3638 return 0; 3639 #endif 3640 } 3641 3642 /* Parses the PROXYv2 header from buf and updates the struct. 3643 * Returns 1 on success, 0 on failure. 
/* Parses the PROXYv2 header from buf and updates the struct.
 * Returns 1 on success, 0 on failure. */
static int
consume_pp2_header(struct buffer* buf, struct query* q, int stream)
{
	size_t size;
	struct pp2_header* header;
	int err = pp2_read_header(buffer_begin(buf), buffer_remaining(buf));
	if(err) {
		VERBOSITY(4, (LOG_ERR, "proxy-protocol: could not parse "
			"PROXYv2 header: %s", pp_lookup_error(err)));
		return 0;
	}
	header = (struct pp2_header*)buffer_begin(buf);
	size = PP2_HEADER_SIZE + read_uint16(&header->len);
	if(size > buffer_limit(buf)) {
		VERBOSITY(4, (LOG_ERR, "proxy-protocol: not enough buffer "
			"size to read PROXYv2 header"));
		return 0;
	}
	if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) {
		/* A connection from the proxy itself.
		 * No need to do anything with addresses. */
		goto done;
	}
	if(header->fam_prot == PP2_UNSPEC_UNSPEC) {
		/* Unspecified family and protocol. This could be used for
		 * health checks by proxies.
		 * No need to do anything with addresses. */
		goto done;
	}
	/* Read the proxied address */
	switch(header->fam_prot) {
	case PP2_INET_STREAM:
	case PP2_INET_DGRAM:
		{
			struct sockaddr_in* addr =
				(struct sockaddr_in*)&q->client_addr;
			addr->sin_family = AF_INET;
			memmove(&addr->sin_addr.s_addr,
				&header->addr.addr4.src_addr, 4);
			memmove(&addr->sin_port, &header->addr.addr4.src_port,
				2);
			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in);
		}
		/* Ignore the destination address; it should be us. */
		break;
#ifdef INET6
	case PP2_INET6_STREAM:
	case PP2_INET6_DGRAM:
		{
			struct sockaddr_in6* addr =
				(struct sockaddr_in6*)&q->client_addr;
			memset(addr, 0, sizeof(*addr));
			addr->sin6_family = AF_INET6;
			memmove(&addr->sin6_addr,
				header->addr.addr6.src_addr, 16);
			memmove(&addr->sin6_port, &header->addr.addr6.src_port,
				2);
			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6);
		}
		/* Ignore the destination address; it should be us. */
		break;
#endif /* INET6 */
	default:
		VERBOSITY(2, (LOG_ERR, "proxy-protocol: unsupported "
			"family and protocol 0x%x",
			(int)header->fam_prot));
		return 0;
	}
	q->is_proxied = 1;
done:
	if(!stream) {
		/* We are reading a whole packet;
		 * Move the rest of the data to overwrite the PROXYv2 header */
		/* XXX can we do better to avoid memmove? */
		memmove(header, ((char*)header)+size, buffer_limit(buf)-size);
		buffer_set_limit(buf, buffer_limit(buf)-size);
	}
	return 1;
}
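/*
 * Illustrative sketch (not compiled): datagram callers flip the packet
 * buffer and strip the PROXYv2 header in place before normal query
 * processing, as handle_udp() does below for pp2-enabled sockets.
 */
#if 0
static int
strip_pp2_sketch(struct query* q)
{
	buffer_flip(q->packet);
	if(!consume_pp2_header(q->packet, q, 0))
		return 0; /* drop the packet */
	/* q->client_addr now holds the real client and q->is_proxied is
	 * set, unless the proxy sent a LOCAL or UNSPEC header */
	return 1;
}
#endif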
*/ 3718 memmove(header, ((char*)header)+size, buffer_limit(buf)-size); 3719 buffer_set_limit(buf, buffer_limit(buf)-size); 3720 } 3721 return 1; 3722 } 3723 3724 static void 3725 handle_udp(int fd, short event, void* arg) 3726 { 3727 struct udp_handler_data *data = (struct udp_handler_data *) arg; 3728 int received, sent, recvcount, i; 3729 struct query *q; 3730 uint32_t now = 0; 3731 3732 if (!(event & EV_READ)) { 3733 return; 3734 } 3735 recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL); 3736 /* this printf strangely gave a performance increase on Linux */ 3737 /* printf("recvcount %d \n", recvcount); */ 3738 if (recvcount == -1) { 3739 if (errno != EAGAIN && errno != EINTR) { 3740 log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno)); 3741 STATUP(data->nsd, rxerr); 3742 /* No zone statup */ 3743 } 3744 /* Simply no data available */ 3745 return; 3746 } 3747 for (i = 0; i < recvcount; i++) { 3748 loopstart: 3749 received = msgs[i].msg_len; 3750 queries[i]->remote_addrlen = msgs[i].msg_hdr.msg_namelen; 3751 queries[i]->client_addrlen = (socklen_t)sizeof(queries[i]->client_addr); 3752 queries[i]->is_proxied = 0; 3753 q = queries[i]; 3754 if (received == -1) { 3755 log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror( 3756 #if defined(HAVE_RECVMMSG) 3757 msgs[i].msg_hdr.msg_flags 3758 #else 3759 errno 3760 #endif 3761 )); 3762 STATUP(data->nsd, rxerr); 3763 /* No zone statup */ 3764 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3765 iovecs[i].iov_len = buffer_remaining(q->packet); 3766 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3767 goto swap_drop; 3768 } 3769 3770 /* Account... */ 3771 #ifdef BIND8_STATS 3772 if (data->socket->addr.ai_family == AF_INET) { 3773 STATUP(data->nsd, qudp); 3774 } else if (data->socket->addr.ai_family == AF_INET6) { 3775 STATUP(data->nsd, qudp6); 3776 } 3777 #endif 3778 3779 buffer_skip(q->packet, received); 3780 buffer_flip(q->packet); 3781 if(data->pp2_enabled && !consume_pp2_header(q->packet, q, 0)) { 3782 VERBOSITY(2, (LOG_ERR, "proxy-protocol: could not " 3783 "consume PROXYv2 header")); 3784 goto swap_drop; 3785 } 3786 if(!q->is_proxied) { 3787 q->client_addrlen = q->remote_addrlen; 3788 memmove(&q->client_addr, &q->remote_addr, 3789 q->remote_addrlen); 3790 } 3791 #ifdef USE_DNSTAP 3792 /* 3793 * sending UDP-query with server address (local) and client address to dnstap process 3794 */ 3795 log_addr("query from client", &q->client_addr); 3796 log_addr("to server (local)", (void*)&data->socket->addr.ai_addr); 3797 if(verbosity >= 6 && q->is_proxied) 3798 log_addr("query via proxy", &q->remote_addr); 3799 dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->client_addr, q->client_addrlen, 3800 q->tcp, q->packet); 3801 #endif /* USE_DNSTAP */ 3802 3803 /* Process and answer the query... */ 3804 if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) { 3805 if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) { 3806 STATUP(data->nsd, nona); 3807 ZTATUP(data->nsd, q->zone, nona); 3808 } 3809 3810 #ifdef USE_ZONE_STATS 3811 if (data->socket->addr.ai_family == AF_INET) { 3812 ZTATUP(data->nsd, q->zone, qudp); 3813 } else if (data->socket->addr.ai_family == AF_INET6) { 3814 ZTATUP(data->nsd, q->zone, qudp6); 3815 } 3816 #endif 3817 3818 /* Add EDNS0 and TSIG info if necessary. */ 3819 query_add_optional(q, data->nsd, &now); 3820 3821 buffer_flip(q->packet); 3822 iovecs[i].iov_len = buffer_remaining(q->packet); 3823 #ifdef BIND8_STATS 3824 /* Account the rcode & TC... 
*/ 3825 STATUP2(data->nsd, rcode, RCODE(q->packet)); 3826 ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet)); 3827 if (TC(q->packet)) { 3828 STATUP(data->nsd, truncated); 3829 ZTATUP(data->nsd, q->zone, truncated); 3830 } 3831 #endif /* BIND8_STATS */ 3832 #ifdef USE_DNSTAP 3833 /* 3834 * sending UDP-response with server address (local) and client address to dnstap process 3835 */ 3836 log_addr("from server (local)", (void*)&data->socket->addr.ai_addr); 3837 log_addr("response to client", &q->client_addr); 3838 if(verbosity >= 6 && q->is_proxied) 3839 log_addr("response via proxy", &q->remote_addr); 3840 dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, 3841 &q->client_addr, q->client_addrlen, q->tcp, q->packet, 3842 q->zone); 3843 #endif /* USE_DNSTAP */ 3844 } else { 3845 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3846 iovecs[i].iov_len = buffer_remaining(q->packet); 3847 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3848 swap_drop: 3849 STATUP(data->nsd, dropped); 3850 ZTATUP(data->nsd, q->zone, dropped); 3851 if(i != recvcount-1) { 3852 /* swap with last and decrease recvcount */ 3853 struct mmsghdr mtmp = msgs[i]; 3854 struct iovec iotmp = iovecs[i]; 3855 recvcount--; 3856 msgs[i] = msgs[recvcount]; 3857 iovecs[i] = iovecs[recvcount]; 3858 queries[i] = queries[recvcount]; 3859 msgs[recvcount] = mtmp; 3860 iovecs[recvcount] = iotmp; 3861 queries[recvcount] = q; 3862 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3863 msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount]; 3864 goto loopstart; 3865 } else { recvcount --; } 3866 } 3867 } 3868 3869 /* send until all are sent */ 3870 i = 0; 3871 while(i<recvcount) { 3872 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3873 if(sent == -1) { 3874 if(errno == ENOBUFS || 3875 #ifdef EWOULDBLOCK 3876 errno == EWOULDBLOCK || 3877 #endif 3878 errno == EAGAIN) { 3879 /* block to wait until send buffer avail */ 3880 int flag, errstore; 3881 if((flag = fcntl(fd, F_GETFL)) == -1) { 3882 log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno)); 3883 flag = 0; 3884 } 3885 flag &= ~O_NONBLOCK; 3886 if(fcntl(fd, F_SETFL, flag) == -1) 3887 log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno)); 3888 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3889 errstore = errno; 3890 flag |= O_NONBLOCK; 3891 if(fcntl(fd, F_SETFL, flag) == -1) 3892 log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno)); 3893 if(sent != -1) { 3894 i += sent; 3895 continue; 3896 } 3897 errno = errstore; 3898 } 3899 if(errno == EINVAL) { 3900 /* skip the invalid argument entry, 3901 * send the remaining packets in the list */ 3902 if(!(port_is_zero((void*)&queries[i]->remote_addr) && 3903 verbosity < 3)) { 3904 const char* es = strerror(errno); 3905 char a[64]; 3906 addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a)); 3907 log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3908 } 3909 i += 1; 3910 continue; 3911 } 3912 /* don't log transient network full errors, unless 3913 * on higher verbosity */ 3914 if(!(errno == ENOBUFS && verbosity < 1) && 3915 #ifdef EWOULDBLOCK 3916 errno != EWOULDBLOCK && 3917 #endif 3918 errno != EAGAIN) { 3919 const char* es = strerror(errno); 3920 char a[64]; 3921 addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a)); 3922 log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3923 } 3924 #ifdef BIND8_STATS 3925 data->nsd->st->txerr += recvcount-i; 3926 #endif /* BIND8_STATS */ 3927 
break; 3928 } 3929 i += sent; 3930 } 3931 for(i=0; i<recvcount; i++) { 3932 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3933 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3934 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3935 } 3936 } 3937 3938 #ifdef HAVE_SSL 3939 /* 3940 * Setup an event for the tcp handler. 3941 */ 3942 static void 3943 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *), 3944 int fd, short event) 3945 { 3946 struct timeval timeout; 3947 struct event_base* ev_base; 3948 3949 timeout.tv_sec = data->nsd->tcp_timeout; 3950 timeout.tv_usec = 0L; 3951 3952 ev_base = data->event.ev_base; 3953 event_del(&data->event); 3954 memset(&data->event, 0, sizeof(data->event)); 3955 event_set(&data->event, fd, event, fn, data); 3956 if(event_base_set(ev_base, &data->event) != 0) 3957 log_msg(LOG_ERR, "event base set failed"); 3958 if(event_add(&data->event, &timeout) != 0) 3959 log_msg(LOG_ERR, "event add failed"); 3960 } 3961 #endif /* HAVE_SSL */ 3962 3963 static void 3964 cleanup_tcp_handler(struct tcp_handler_data* data) 3965 { 3966 event_del(&data->event); 3967 #ifdef HAVE_SSL 3968 if(data->tls) { 3969 SSL_shutdown(data->tls); 3970 SSL_free(data->tls); 3971 data->tls = NULL; 3972 } 3973 #endif 3974 data->pp2_header_state = pp2_header_none; 3975 close(data->event.ev_fd); 3976 if(data->prev) 3977 data->prev->next = data->next; 3978 else tcp_active_list = data->next; 3979 if(data->next) 3980 data->next->prev = data->prev; 3981 3982 /* 3983 * Enable the TCP accept handlers when the current number of 3984 * TCP connections is about to drop below the maximum number 3985 * of TCP connections. 3986 */ 3987 if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { 3988 configure_handler_event_types(EV_READ|EV_PERSIST); 3989 if(slowaccept) { 3990 event_del(&slowaccept_event); 3991 slowaccept = 0; 3992 } 3993 } 3994 --data->nsd->current_tcp_count; 3995 assert(data->nsd->current_tcp_count >= 0); 3996 3997 region_destroy(data->region); 3998 } 3999 4000 /* Read more data into the buffer for tcp read. Pass the amount of additional 4001 * data required. Returns false if nothing needs to be done this event, or 4002 * true if the additional data is in the buffer. */ 4003 static int 4004 more_read_buf_tcp(int fd, struct tcp_handler_data* data, void* bufpos, 4005 size_t add_amount, ssize_t* received) 4006 { 4007 *received = read(fd, bufpos, add_amount); 4008 if (*received == -1) { 4009 if (errno == EAGAIN || errno == EINTR) { 4010 /* 4011 * Read would block, wait until more 4012 * data is available. 4013 */ 4014 return 0; 4015 } else { 4016 char buf[48]; 4017 addr2str(&data->query->remote_addr, buf, sizeof(buf)); 4018 #ifdef ECONNRESET 4019 if (verbosity >= 2 || errno != ECONNRESET) 4020 #endif /* ECONNRESET */ 4021 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 4022 cleanup_tcp_handler(data); 4023 return 0; 4024 } 4025 } else if (*received == 0) { 4026 /* EOF */ 4027 cleanup_tcp_handler(data); 4028 return 0; 4029 } 4030 return 1; 4031 } 4032 4033 static void 4034 handle_tcp_reading(int fd, short event, void* arg) 4035 { 4036 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4037 ssize_t received; 4038 struct event_base* ev_base; 4039 struct timeval timeout; 4040 uint32_t now = 0; 4041 4042 if ((event & EV_TIMEOUT)) { 4043 /* Connection timed out. 
*/ 4044 cleanup_tcp_handler(data); 4045 return; 4046 } 4047 4048 if ((data->nsd->tcp_query_count > 0 && 4049 data->query_count >= data->nsd->tcp_query_count) || 4050 (data->query_count > 0 && data->tcp_no_more_queries)) 4051 { 4052 /* No more queries allowed on this tcp connection. */ 4053 cleanup_tcp_handler(data); 4054 return; 4055 } 4056 4057 assert((event & EV_READ)); 4058 4059 if (data->bytes_transmitted == 0 && data->query_needs_reset) { 4060 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 4061 data->query_needs_reset = 0; 4062 } 4063 4064 if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) { 4065 struct pp2_header* header = NULL; 4066 size_t want_read_size = 0; 4067 size_t current_read_size = 0; 4068 if(data->pp2_header_state == pp2_header_none) { 4069 want_read_size = PP2_HEADER_SIZE; 4070 if(buffer_remaining(data->query->packet) < 4071 want_read_size) { 4072 VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header")); 4073 cleanup_tcp_handler(data); 4074 return; 4075 } 4076 VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size)); 4077 current_read_size = want_read_size; 4078 if(data->bytes_transmitted < current_read_size) { 4079 if(!more_read_buf_tcp(fd, data, 4080 (void*)buffer_at(data->query->packet, 4081 data->bytes_transmitted), 4082 current_read_size - data->bytes_transmitted, 4083 &received)) 4084 return; 4085 data->bytes_transmitted += received; 4086 buffer_skip(data->query->packet, received); 4087 if(data->bytes_transmitted != current_read_size) 4088 return; 4089 data->pp2_header_state = pp2_header_init; 4090 } 4091 } 4092 if(data->pp2_header_state == pp2_header_init) { 4093 int err; 4094 err = pp2_read_header(buffer_begin(data->query->packet), 4095 buffer_limit(data->query->packet)); 4096 if(err) { 4097 VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err))); 4098 cleanup_tcp_handler(data); 4099 return; 4100 } 4101 header = (struct pp2_header*)buffer_begin(data->query->packet); 4102 want_read_size = ntohs(header->len); 4103 if(buffer_limit(data->query->packet) < 4104 PP2_HEADER_SIZE + want_read_size) { 4105 VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header")); 4106 cleanup_tcp_handler(data); 4107 return; 4108 } 4109 VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size)); 4110 current_read_size = PP2_HEADER_SIZE + want_read_size; 4111 if(want_read_size == 0) { 4112 /* nothing more to read; header is complete */ 4113 data->pp2_header_state = pp2_header_done; 4114 } else if(data->bytes_transmitted < current_read_size) { 4115 if(!more_read_buf_tcp(fd, data, 4116 (void*)buffer_at(data->query->packet, 4117 data->bytes_transmitted), 4118 current_read_size - data->bytes_transmitted, 4119 &received)) 4120 return; 4121 data->bytes_transmitted += received; 4122 buffer_skip(data->query->packet, received); 4123 if(data->bytes_transmitted != current_read_size) 4124 return; 4125 data->pp2_header_state = pp2_header_done; 4126 } 4127 } 4128 if(data->pp2_header_state != pp2_header_done || !header) { 4129 VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header")); 4130 4131 cleanup_tcp_handler(data); 4132 return; 4133 } 4134 buffer_flip(data->query->packet); 4135 if(!consume_pp2_header(data->query->packet, data->query, 1)) { 4136 VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header")); 4137 4138 
cleanup_tcp_handler(data); 4139 return; 4140 } 4141 /* Clear and reset the buffer to read the following 4142 * DNS packet(s). */ 4143 buffer_clear(data->query->packet); 4144 data->bytes_transmitted = 0; 4145 } 4146 4147 /* 4148 * Check if we received the leading packet length bytes yet. 4149 */ 4150 if (data->bytes_transmitted < sizeof(uint16_t)) { 4151 if(!more_read_buf_tcp(fd, data, 4152 (char*) &data->query->tcplen + data->bytes_transmitted, 4153 sizeof(uint16_t) - data->bytes_transmitted, &received)) 4154 return; 4155 data->bytes_transmitted += received; 4156 if (data->bytes_transmitted < sizeof(uint16_t)) { 4157 /* 4158 * Not done with the tcplen yet, wait for more 4159 * data to become available. 4160 */ 4161 return; 4162 } 4163 assert(data->bytes_transmitted == sizeof(uint16_t)); 4164 4165 data->query->tcplen = ntohs(data->query->tcplen); 4166 4167 /* 4168 * Minimum query size is: 4169 * 4170 * Size of the header (12) 4171 * + Root domain name (1) 4172 * + Query class (2) 4173 * + Query type (2) 4174 */ 4175 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 4176 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 4177 cleanup_tcp_handler(data); 4178 return; 4179 } 4180 4181 if (data->query->tcplen > data->query->maxlen) { 4182 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 4183 cleanup_tcp_handler(data); 4184 return; 4185 } 4186 4187 buffer_set_limit(data->query->packet, data->query->tcplen); 4188 } 4189 4190 assert(buffer_remaining(data->query->packet) > 0); 4191 4192 /* Read the (remaining) query data. */ 4193 if(!more_read_buf_tcp(fd, data, buffer_current(data->query->packet), 4194 buffer_remaining(data->query->packet), &received)) 4195 return; 4196 data->bytes_transmitted += received; 4197 buffer_skip(data->query->packet, received); 4198 if (buffer_remaining(data->query->packet) > 0) { 4199 /* 4200 * Message not yet complete, wait for more data to 4201 * become available. 4202 */ 4203 return; 4204 } 4205 4206 assert(buffer_position(data->query->packet) == data->query->tcplen); 4207 4208 /* Account... */ 4209 #ifdef BIND8_STATS 4210 #ifndef INET6 4211 STATUP(data->nsd, ctcp); 4212 #else 4213 if (data->query->remote_addr.ss_family == AF_INET) { 4214 STATUP(data->nsd, ctcp); 4215 } else if (data->query->remote_addr.ss_family == AF_INET6) { 4216 STATUP(data->nsd, ctcp6); 4217 } 4218 #endif 4219 #endif /* BIND8_STATS */ 4220 4221 /* We have a complete query, process it. */ 4222 4223 /* tcp-query-count: handle query counter ++ */ 4224 data->query_count++; 4225 4226 buffer_flip(data->query->packet); 4227 #ifdef USE_DNSTAP 4228 /* 4229 * and send TCP-query with found address (local) and client address to dnstap process 4230 */ 4231 log_addr("query from client", &data->query->client_addr); 4232 log_addr("to server (local)", (void*)&data->socket->addr.ai_addr); 4233 if(verbosity >= 6 && data->query->is_proxied) 4234 log_addr("query via proxy", &data->query->remote_addr); 4235 dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr, 4236 data->query->client_addrlen, data->query->tcp, data->query->packet); 4237 #endif /* USE_DNSTAP */ 4238 data->query_state = server_process_query(data->nsd, data->query, &now); 4239 if (data->query_state == QUERY_DISCARDED) { 4240 /* Drop the packet and the entire connection... 
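 * A discarded query gets no response; for TCP the whole
 * connection is closed along with it.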
*/ 4241 STATUP(data->nsd, dropped); 4242 ZTATUP(data->nsd, data->query->zone, dropped); 4243 cleanup_tcp_handler(data); 4244 return; 4245 } 4246 4247 #ifdef BIND8_STATS 4248 if (RCODE(data->query->packet) == RCODE_OK 4249 && !AA(data->query->packet)) 4250 { 4251 STATUP(data->nsd, nona); 4252 ZTATUP(data->nsd, data->query->zone, nona); 4253 } 4254 #endif /* BIND8_STATS */ 4255 4256 #ifdef USE_ZONE_STATS 4257 #ifndef INET6 4258 ZTATUP(data->nsd, data->query->zone, ctcp); 4259 #else 4260 if (data->query->remote_addr.ss_family == AF_INET) { 4261 ZTATUP(data->nsd, data->query->zone, ctcp); 4262 } else if (data->query->remote_addr.ss_family == AF_INET6) { 4263 ZTATUP(data->nsd, data->query->zone, ctcp6); 4264 } 4265 #endif 4266 #endif /* USE_ZONE_STATS */ 4267 4268 query_add_optional(data->query, data->nsd, &now); 4269 4270 /* Switch to the tcp write handler. */ 4271 buffer_flip(data->query->packet); 4272 data->query->tcplen = buffer_remaining(data->query->packet); 4273 #ifdef BIND8_STATS 4274 /* Account the rcode & TC... */ 4275 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 4276 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 4277 if (TC(data->query->packet)) { 4278 STATUP(data->nsd, truncated); 4279 ZTATUP(data->nsd, data->query->zone, truncated); 4280 } 4281 #endif /* BIND8_STATS */ 4282 #ifdef USE_DNSTAP 4283 /* 4284 * sending TCP-response with found (earlier) address (local) and client address to dnstap process 4285 */ 4286 log_addr("from server (local)", (void*)&data->socket->addr.ai_addr); 4287 log_addr("response to client", &data->query->client_addr); 4288 if(verbosity >= 6 && data->query->is_proxied) 4289 log_addr("response via proxy", &data->query->remote_addr); 4290 dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr, 4291 data->query->client_addrlen, data->query->tcp, data->query->packet, 4292 data->query->zone); 4293 #endif /* USE_DNSTAP */ 4294 data->bytes_transmitted = 0; 4295 4296 timeout.tv_sec = data->tcp_timeout / 1000; 4297 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 4298 4299 ev_base = data->event.ev_base; 4300 event_del(&data->event); 4301 memset(&data->event, 0, sizeof(data->event)); 4302 event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT, 4303 handle_tcp_writing, data); 4304 if(event_base_set(ev_base, &data->event) != 0) 4305 log_msg(LOG_ERR, "event base set tcpr failed"); 4306 if(event_add(&data->event, &timeout) != 0) 4307 log_msg(LOG_ERR, "event add tcpr failed"); 4308 /* see if we can write the answer right away (usually we can; EAGAIN if not) */ 4309 handle_tcp_writing(fd, EV_WRITE, data); 4310 } 4311 4312 static void 4313 handle_tcp_writing(int fd, short event, void* arg) 4314 { 4315 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4316 ssize_t sent; 4317 struct query *q = data->query; 4318 struct timeval timeout; 4319 struct event_base* ev_base; 4320 uint32_t now = 0; 4321 4322 if ((event & EV_TIMEOUT)) { 4323 /* Connection timed out. */ 4324 cleanup_tcp_handler(data); 4325 return; 4326 } 4327 4328 assert((event & EV_WRITE)); 4329 4330 if (data->bytes_transmitted < sizeof(q->tcplen)) { 4331 /* Writing the response packet length.
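 * DNS over TCP prefixes each message with a two-octet length field in
 * network byte order; with writev the length prefix and the start of
 * the packet can go out in a single call.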
*/ 4332 uint16_t n_tcplen = htons(q->tcplen); 4333 #ifdef HAVE_WRITEV 4334 struct iovec iov[2]; 4335 iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted; 4336 iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted; 4337 iov[1].iov_base = buffer_begin(q->packet); 4338 iov[1].iov_len = buffer_limit(q->packet); 4339 sent = writev(fd, iov, 2); 4340 #else /* HAVE_WRITEV */ 4341 sent = write(fd, 4342 (const char *) &n_tcplen + data->bytes_transmitted, 4343 sizeof(n_tcplen) - data->bytes_transmitted); 4344 #endif /* HAVE_WRITEV */ 4345 if (sent == -1) { 4346 if (errno == EAGAIN || errno == EINTR) { 4347 /* 4348 * Write would block, wait until 4349 * socket becomes writable again. 4350 */ 4351 return; 4352 } else { 4353 #ifdef ECONNRESET 4354 if(verbosity >= 2 || errno != ECONNRESET) 4355 #endif /* ECONNRESET */ 4356 #ifdef EPIPE 4357 if(verbosity >= 2 || errno != EPIPE) 4358 #endif /* EPIPE 'broken pipe' */ 4359 log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); 4360 cleanup_tcp_handler(data); 4361 return; 4362 } 4363 } 4364 4365 data->bytes_transmitted += sent; 4366 if (data->bytes_transmitted < sizeof(q->tcplen)) { 4367 /* 4368 * Writing not complete, wait until socket 4369 * becomes writable again. 4370 */ 4371 return; 4372 } 4373 4374 #ifdef HAVE_WRITEV 4375 sent -= sizeof(n_tcplen); 4376 /* handle potential 'packet done' code */ 4377 goto packet_could_be_done; 4378 #endif 4379 } 4380 4381 sent = write(fd, 4382 buffer_current(q->packet), 4383 buffer_remaining(q->packet)); 4384 if (sent == -1) { 4385 if (errno == EAGAIN || errno == EINTR) { 4386 /* 4387 * Write would block, wait until 4388 * socket becomes writable again. 4389 */ 4390 return; 4391 } else { 4392 #ifdef ECONNRESET 4393 if(verbosity >= 2 || errno != ECONNRESET) 4394 #endif /* ECONNRESET */ 4395 #ifdef EPIPE 4396 if(verbosity >= 2 || errno != EPIPE) 4397 #endif /* EPIPE 'broken pipe' */ 4398 log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); 4399 cleanup_tcp_handler(data); 4400 return; 4401 } 4402 } 4403 4404 data->bytes_transmitted += sent; 4405 #ifdef HAVE_WRITEV 4406 packet_could_be_done: 4407 #endif 4408 buffer_skip(q->packet, sent); 4409 if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { 4410 /* 4411 * Still more data to write when socket becomes 4412 * writable again. 4413 */ 4414 return; 4415 } 4416 4417 assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); 4418 4419 if (data->query_state == QUERY_IN_AXFR || 4420 data->query_state == QUERY_IN_IXFR) { 4421 /* Continue processing AXFR and writing back results. */ 4422 buffer_clear(q->packet); 4423 if(data->query_state == QUERY_IN_AXFR) 4424 data->query_state = query_axfr(data->nsd, q, 0); 4425 else data->query_state = query_ixfr(data->nsd, q); 4426 if (data->query_state != QUERY_PROCESSED) { 4427 query_add_optional(data->query, data->nsd, &now); 4428 4429 /* Reset data. */ 4430 buffer_flip(q->packet); 4431 q->tcplen = buffer_remaining(q->packet); 4432 data->bytes_transmitted = 0; 4433 /* Reset timeout. 
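 * and rearm the write event for the next chunk of the transfer.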
*/ 4434 timeout.tv_sec = data->tcp_timeout / 1000; 4435 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 4436 ev_base = data->event.ev_base; 4437 event_del(&data->event); 4438 memset(&data->event, 0, sizeof(data->event)); 4439 event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT, 4440 handle_tcp_writing, data); 4441 if(event_base_set(ev_base, &data->event) != 0) 4442 log_msg(LOG_ERR, "event base set tcpw failed"); 4443 if(event_add(&data->event, &timeout) != 0) 4444 log_msg(LOG_ERR, "event add tcpw failed"); 4445 4446 /* 4447 * Write data if/when the socket is writable 4448 * again. 4449 */ 4450 return; 4451 } 4452 } 4453 4454 /* 4455 * Done sending, wait for the next request to arrive on the 4456 * TCP socket by installing the TCP read handler. 4457 */ 4458 if ((data->nsd->tcp_query_count > 0 && 4459 data->query_count >= data->nsd->tcp_query_count) || 4460 data->tcp_no_more_queries) { 4461 4462 (void) shutdown(fd, SHUT_WR); 4463 } 4464 4465 data->bytes_transmitted = 0; 4466 data->query_needs_reset = 1; 4467 4468 timeout.tv_sec = data->tcp_timeout / 1000; 4469 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 4470 ev_base = data->event.ev_base; 4471 event_del(&data->event); 4472 memset(&data->event, 0, sizeof(data->event)); 4473 event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT, 4474 handle_tcp_reading, data); 4475 if(event_base_set(ev_base, &data->event) != 0) 4476 log_msg(LOG_ERR, "event base set tcpw failed"); 4477 if(event_add(&data->event, &timeout) != 0) 4478 log_msg(LOG_ERR, "event add tcpw failed"); 4479 } 4480 4481 #ifdef HAVE_SSL 4482 /** create SSL object and associate fd */ 4483 static SSL* 4484 incoming_ssl_fd(SSL_CTX* ctx, int fd) 4485 { 4486 SSL* ssl = SSL_new((SSL_CTX*)ctx); 4487 if(!ssl) { 4488 log_crypto_err("could not SSL_new"); 4489 return NULL; 4490 } 4491 SSL_set_accept_state(ssl); 4492 (void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY); 4493 if(!SSL_set_fd(ssl, fd)) { 4494 log_crypto_err("could not SSL_set_fd"); 4495 SSL_free(ssl); 4496 return NULL; 4497 } 4498 return ssl; 4499 } 4500 4501 /** TLS handshake to upgrade TCP connection */ 4502 static int 4503 tls_handshake(struct tcp_handler_data* data, int fd, int writing) 4504 { 4505 int r; 4506 if(data->shake_state == tls_hs_read_event) { 4507 /* read condition satisfied; go back to writing */ 4508 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 4509 data->shake_state = tls_hs_none; 4510 return 1; 4511 } 4512 if(data->shake_state == tls_hs_write_event) { 4513 /* write condition satisfied; go back to reading */ 4514 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 4515 data->shake_state = tls_hs_none; 4516 return 1; 4517 } 4518 4519 /* (continue to) set up the TLS connection */ 4520 ERR_clear_error(); 4521 r = SSL_do_handshake(data->tls); 4522 4523 if(r != 1) { 4524 int want = SSL_get_error(data->tls, r); 4525 if(want == SSL_ERROR_WANT_READ) { 4526 if(data->shake_state == tls_hs_read) { 4527 /* try again later */ 4528 return 1; 4529 } 4530 data->shake_state = tls_hs_read; 4531 /* switch back to reading mode */ 4532 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 4533 return 1; 4534 } else if(want == SSL_ERROR_WANT_WRITE) { 4535 if(data->shake_state == tls_hs_write) { 4536 /* try again later */ 4537 return 1; 4538 } 4539 data->shake_state = tls_hs_write; 4540 /* switch back to writing mode */ 4541 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 4542 return 1;
4543 } else { 4544 if(r == 0) 4545 VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely")); 4546 else { 4547 unsigned long err = ERR_get_error(); 4548 if(!squelch_err_ssl_handshake(err)) { 4549 char a[64], s[256]; 4550 addr2str(&data->query->remote_addr, a, sizeof(a)); 4551 snprintf(s, sizeof(s), "TLS handshake failed from %s", a); 4552 log_crypto_from_err(s, err); 4553 } 4554 } 4555 cleanup_tcp_handler(data); 4556 return 0; 4557 } 4558 } 4559 4560 /* Used to log a successful upgrade, for testing; could be removed. */ 4561 VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded.")); 4562 /* set back to the event we need to have when reading (or writing) */ 4563 if(data->shake_state == tls_hs_read && writing) { 4564 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 4565 } else if(data->shake_state == tls_hs_write && !writing) { 4566 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 4567 } 4568 data->shake_state = tls_hs_none; 4569 return 1; 4570 } 4571 4572 /* Read more data into the buffer for tls read. Pass the amount of additional 4573 * data required. Returns false if nothing needs to be done this event, or 4574 * true if the additional data is in the buffer. */ 4575 static int 4576 more_read_buf_tls(int fd, struct tcp_handler_data* data, void* bufpos, 4577 size_t add_amount, ssize_t* received) 4578 { 4579 ERR_clear_error(); 4580 if((*received=SSL_read(data->tls, bufpos, add_amount)) <= 0) { 4581 int want = SSL_get_error(data->tls, *received); 4582 if(want == SSL_ERROR_ZERO_RETURN) { 4583 cleanup_tcp_handler(data); 4584 return 0; /* shutdown, closed */ 4585 } else if(want == SSL_ERROR_WANT_READ) { 4586 /* wants to be called again */ 4587 return 0; 4588 } 4589 else if(want == SSL_ERROR_WANT_WRITE) { 4590 /* switch to writing */ 4591 data->shake_state = tls_hs_write_event; 4592 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4593 return 0; 4594 } 4595 cleanup_tcp_handler(data); 4596 log_crypto_err("could not SSL_read"); 4597 return 0; 4598 } 4599 return 1; 4600 } 4601 4602 /** handle TLS reading of incoming query */ 4603 static void 4604 handle_tls_reading(int fd, short event, void* arg) 4605 { 4606 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4607 ssize_t received; 4608 uint32_t now = 0; 4609 4610 if ((event & EV_TIMEOUT)) { 4611 /* Connection timed out. */ 4612 cleanup_tcp_handler(data); 4613 return; 4614 } 4615 4616 if ((data->nsd->tcp_query_count > 0 && 4617 data->query_count >= data->nsd->tcp_query_count) || 4618 (data->query_count > 0 && data->tcp_no_more_queries)) 4619 { 4620 /* No more queries allowed on this tcp connection.
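 * The tcp-query-count limit was reached, or the connection was
 * marked as taking no more queries; close it.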
*/ 4621 cleanup_tcp_handler(data); 4622 return; 4623 } 4624 4625 assert((event & EV_READ)); 4626 4627 if (data->bytes_transmitted == 0 && data->query_needs_reset) { 4628 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 4629 data->query_needs_reset = 0; 4630 } 4631 4632 if(data->shake_state != tls_hs_none) { 4633 if(!tls_handshake(data, fd, 0)) 4634 return; 4635 if(data->shake_state != tls_hs_none) 4636 return; 4637 } 4638 4639 if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) { 4640 struct pp2_header* header = NULL; 4641 size_t want_read_size = 0; 4642 size_t current_read_size = 0; 4643 if(data->pp2_header_state == pp2_header_none) { 4644 want_read_size = PP2_HEADER_SIZE; 4645 if(buffer_remaining(data->query->packet) < 4646 want_read_size) { 4647 VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header")); 4648 cleanup_tcp_handler(data); 4649 return; 4650 } 4651 VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size)); 4652 current_read_size = want_read_size; 4653 if(data->bytes_transmitted < current_read_size) { 4654 if(!more_read_buf_tls(fd, data, 4655 buffer_at(data->query->packet, 4656 data->bytes_transmitted), 4657 current_read_size - data->bytes_transmitted, 4658 &received)) 4659 return; 4660 data->bytes_transmitted += received; 4661 buffer_skip(data->query->packet, received); 4662 if(data->bytes_transmitted != current_read_size) 4663 return; 4664 data->pp2_header_state = pp2_header_init; 4665 } 4666 } 4667 if(data->pp2_header_state == pp2_header_init) { 4668 int err; 4669 err = pp2_read_header(buffer_begin(data->query->packet), 4670 buffer_limit(data->query->packet)); 4671 if(err) { 4672 VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err))); 4673 cleanup_tcp_handler(data); 4674 return; 4675 } 4676 header = (struct pp2_header*)buffer_begin(data->query->packet); 4677 want_read_size = ntohs(header->len); 4678 if(buffer_limit(data->query->packet) < 4679 PP2_HEADER_SIZE + want_read_size) { 4680 VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header")); 4681 cleanup_tcp_handler(data); 4682 return; 4683 } 4684 VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size)); 4685 current_read_size = PP2_HEADER_SIZE + want_read_size; 4686 if(want_read_size == 0) { 4687 /* nothing more to read; header is complete */ 4688 data->pp2_header_state = pp2_header_done; 4689 } else if(data->bytes_transmitted < current_read_size) { 4690 if(!more_read_buf_tls(fd, data, 4691 buffer_at(data->query->packet, 4692 data->bytes_transmitted), 4693 current_read_size - data->bytes_transmitted, 4694 &received)) 4695 return; 4696 data->bytes_transmitted += received; 4697 buffer_skip(data->query->packet, received); 4698 if(data->bytes_transmitted != current_read_size) 4699 return; 4700 data->pp2_header_state = pp2_header_done; 4701 } 4702 } 4703 if(data->pp2_header_state != pp2_header_done || !header) { 4704 VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header")); 4705 cleanup_tcp_handler(data); 4706 return; 4707 } 4708 buffer_flip(data->query->packet); 4709 if(!consume_pp2_header(data->query->packet, data->query, 1)) { 4710 VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header")); 4711 cleanup_tcp_handler(data); 4712 return; 4713 } 4714 /* Clear and reset the buffer to read the following 4715 * DNS packet(s). 
*/ 4716 buffer_clear(data->query->packet); 4717 data->bytes_transmitted = 0; 4718 } 4719 /* 4720 * Check if we received the leading packet length bytes yet. 4721 */ 4722 if(data->bytes_transmitted < sizeof(uint16_t)) { 4723 if(!more_read_buf_tls(fd, data, 4724 (char *) &data->query->tcplen + data->bytes_transmitted, 4725 sizeof(uint16_t) - data->bytes_transmitted, &received)) 4726 return; 4727 data->bytes_transmitted += received; 4728 if (data->bytes_transmitted < sizeof(uint16_t)) { 4729 /* 4730 * Not done with the tcplen yet, wait for more 4731 * data to become available. 4732 */ 4733 return; 4734 } 4735 4736 assert(data->bytes_transmitted == sizeof(uint16_t)); 4737 4738 data->query->tcplen = ntohs(data->query->tcplen); 4739 4740 /* 4741 * Minimum query size is: 4742 * 4743 * Size of the header (12) 4744 * + Root domain name (1) 4745 * + Query class (2) 4746 * + Query type (2) 4747 */ 4748 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 4749 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 4750 cleanup_tcp_handler(data); 4751 return; 4752 } 4753 4754 if (data->query->tcplen > data->query->maxlen) { 4755 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 4756 cleanup_tcp_handler(data); 4757 return; 4758 } 4759 4760 buffer_set_limit(data->query->packet, data->query->tcplen); 4761 } 4762 4763 assert(buffer_remaining(data->query->packet) > 0); 4764 4765 /* Read the (remaining) query data. */ 4766 if(!more_read_buf_tls(fd, data, buffer_current(data->query->packet), 4767 buffer_remaining(data->query->packet), &received)) 4768 return; 4769 data->bytes_transmitted += received; 4770 buffer_skip(data->query->packet, received); 4771 if (buffer_remaining(data->query->packet) > 0) { 4772 /* 4773 * Message not yet complete, wait for more data to 4774 * become available. 4775 */ 4776 return; 4777 } 4778 4779 assert(buffer_position(data->query->packet) == data->query->tcplen); 4780 4781 /* Account... */ 4782 #ifndef INET6 4783 STATUP(data->nsd, ctls); 4784 #else 4785 if (data->query->remote_addr.ss_family == AF_INET) { 4786 STATUP(data->nsd, ctls); 4787 } else if (data->query->remote_addr.ss_family == AF_INET6) { 4788 STATUP(data->nsd, ctls6); 4789 } 4790 #endif 4791 4792 /* We have a complete query, process it. */ 4793 4794 /* tcp-query-count: handle query counter ++ */ 4795 data->query_count++; 4796 4797 buffer_flip(data->query->packet); 4798 #ifdef USE_DNSTAP 4799 /* 4800 * and send TCP-query with found address (local) and client address to dnstap process 4801 */ 4802 log_addr("query from client", &data->query->client_addr); 4803 log_addr("to server (local)", (void*)&data->socket->addr.ai_addr); 4804 if(verbosity >= 6 && data->query->is_proxied) 4805 log_addr("query via proxy", &data->query->remote_addr); 4806 dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr, 4807 data->query->client_addrlen, data->query->tcp, data->query->packet); 4808 #endif /* USE_DNSTAP */ 4809 data->query_state = server_process_query(data->nsd, data->query, &now); 4810 if (data->query_state == QUERY_DISCARDED) { 4811 /* Drop the packet and the entire connection... 
*/ 4812 STATUP(data->nsd, dropped); 4813 ZTATUP(data->nsd, data->query->zone, dropped); 4814 cleanup_tcp_handler(data); 4815 return; 4816 } 4817 4818 #ifdef BIND8_STATS 4819 if (RCODE(data->query->packet) == RCODE_OK 4820 && !AA(data->query->packet)) 4821 { 4822 STATUP(data->nsd, nona); 4823 ZTATUP(data->nsd, data->query->zone, nona); 4824 } 4825 #endif /* BIND8_STATS */ 4826 4827 #ifdef USE_ZONE_STATS 4828 #ifndef INET6 4829 ZTATUP(data->nsd, data->query->zone, ctls); 4830 #else 4831 if (data->query->remote_addr.ss_family == AF_INET) { 4832 ZTATUP(data->nsd, data->query->zone, ctls); 4833 } else if (data->query->remote_addr.ss_family == AF_INET6) { 4834 ZTATUP(data->nsd, data->query->zone, ctls6); 4835 } 4836 #endif 4837 #endif /* USE_ZONE_STATS */ 4838 4839 query_add_optional(data->query, data->nsd, &now); 4840 4841 /* Switch to the tcp write handler. */ 4842 buffer_flip(data->query->packet); 4843 data->query->tcplen = buffer_remaining(data->query->packet); 4844 #ifdef BIND8_STATS 4845 /* Account the rcode & TC... */ 4846 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 4847 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 4848 if (TC(data->query->packet)) { 4849 STATUP(data->nsd, truncated); 4850 ZTATUP(data->nsd, data->query->zone, truncated); 4851 } 4852 #endif /* BIND8_STATS */ 4853 #ifdef USE_DNSTAP 4854 /* 4855 * sending TCP-response with found (earlier) address (local) and client address to dnstap process 4856 */ 4857 log_addr("from server (local)", (void*)&data->socket->addr.ai_addr); 4858 log_addr("response to client", &data->query->client_addr); 4859 if(verbosity >= 6 && data->query->is_proxied) 4860 log_addr("response via proxy", &data->query->remote_addr); 4861 dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr, 4862 data->query->client_addrlen, data->query->tcp, data->query->packet, 4863 data->query->zone); 4864 #endif /* USE_DNSTAP */ 4865 data->bytes_transmitted = 0; 4866 4867 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4868 4869 /* see if we can write the answer right away (usually we can; EAGAIN if not) */ 4870 handle_tls_writing(fd, EV_WRITE, data); 4871 } 4872 4873 /** handle TLS writing of outgoing response */ 4874 static void 4875 handle_tls_writing(int fd, short event, void* arg) 4876 { 4877 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4878 ssize_t sent; 4879 struct query *q = data->query; 4880 /* static variable that holds the reassembly buffer used to put the 4881 * TCP length in front of the packet, like writev. */ 4882 static buffer_type* global_tls_temp_buffer = NULL; 4883 buffer_type* write_buffer; 4884 uint32_t now = 0; 4885 4886 if ((event & EV_TIMEOUT)) { 4887 /* Connection timed out. */ 4888 cleanup_tcp_handler(data); 4889 return; 4890 } 4891 4892 assert((event & EV_WRITE)); 4893 4894 if(data->shake_state != tls_hs_none) { 4895 if(!tls_handshake(data, fd, 1)) 4896 return; 4897 if(data->shake_state != tls_hs_none) 4898 return; 4899 } 4900 4901 (void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE); 4902 4903 /* If we are writing the start of a message, we must include the length; 4904 * this is done with a copy into write_buffer.
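 * SSL_write has no scatter/gather variant like writev, hence the copy.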
*/ 4905 write_buffer = NULL; 4906 if (data->bytes_transmitted == 0) { 4907 if(!global_tls_temp_buffer) { 4908 /* gets deallocated when nsd shuts down from 4909 * nsd.region */ 4910 global_tls_temp_buffer = buffer_create(nsd.region, 4911 QIOBUFSZ + sizeof(q->tcplen)); 4912 if (!global_tls_temp_buffer) { 4913 return; 4914 } 4915 } 4916 write_buffer = global_tls_temp_buffer; 4917 buffer_clear(write_buffer); 4918 buffer_write_u16(write_buffer, q->tcplen); 4919 buffer_write(write_buffer, buffer_current(q->packet), 4920 (int)buffer_remaining(q->packet)); 4921 buffer_flip(write_buffer); 4922 } else { 4923 write_buffer = q->packet; 4924 } 4925 4926 /* Write the response */ 4927 ERR_clear_error(); 4928 sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer)); 4929 if(sent <= 0) { 4930 int want = SSL_get_error(data->tls, sent); 4931 if(want == SSL_ERROR_ZERO_RETURN) { 4932 cleanup_tcp_handler(data); 4933 /* closed */ 4934 } else if(want == SSL_ERROR_WANT_READ) { 4935 /* switch back to reading */ 4936 data->shake_state = tls_hs_read_event; 4937 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 4938 } else if(want != SSL_ERROR_WANT_WRITE) { 4939 cleanup_tcp_handler(data); 4940 log_crypto_err("could not SSL_write"); 4941 } 4942 return; 4943 } 4944 4945 buffer_skip(write_buffer, sent); 4946 if(buffer_remaining(write_buffer) != 0) { 4947 /* If not all sent, sync up the real buffer if it wasn't used.*/ 4948 if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) { 4949 buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen)); 4950 } 4951 } 4952 4953 data->bytes_transmitted += sent; 4954 if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { 4955 /* 4956 * Still more data to write when socket becomes 4957 * writable again. 4958 */ 4959 return; 4960 } 4961 4962 assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); 4963 4964 if (data->query_state == QUERY_IN_AXFR || 4965 data->query_state == QUERY_IN_IXFR) { 4966 /* Continue processing AXFR and writing back results. */ 4967 buffer_clear(q->packet); 4968 if(data->query_state == QUERY_IN_AXFR) 4969 data->query_state = query_axfr(data->nsd, q, 0); 4970 else data->query_state = query_ixfr(data->nsd, q); 4971 if (data->query_state != QUERY_PROCESSED) { 4972 query_add_optional(data->query, data->nsd, &now); 4973 4974 /* Reset data. */ 4975 buffer_flip(q->packet); 4976 q->tcplen = buffer_remaining(q->packet); 4977 data->bytes_transmitted = 0; 4978 /* Reset to writing mode. */ 4979 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4980 4981 /* 4982 * Write data if/when the socket is writable 4983 * again. 4984 */ 4985 return; 4986 } 4987 } 4988 4989 /* 4990 * Done sending, wait for the next request to arrive on the 4991 * TCP socket by installing the TCP read handler. 
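 * If no more queries are allowed, only the write side is shut down,
 * so the client still sees EOF after the final response.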
4992 */ 4993 if ((data->nsd->tcp_query_count > 0 && 4994 data->query_count >= data->nsd->tcp_query_count) || 4995 data->tcp_no_more_queries) { 4996 4997 (void) shutdown(fd, SHUT_WR); 4998 } 4999 5000 data->bytes_transmitted = 0; 5001 data->query_needs_reset = 1; 5002 5003 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 5004 } 5005 #endif 5006 5007 static void 5008 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event), 5009 void* ATTR_UNUSED(arg)) 5010 { 5011 if(slowaccept) { 5012 configure_handler_event_types(EV_PERSIST | EV_READ); 5013 slowaccept = 0; 5014 } 5015 } 5016 5017 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) 5018 { 5019 #ifndef HAVE_ACCEPT4 5020 int s = accept(fd, addr, addrlen); 5021 if (s != -1) { 5022 if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) { 5023 log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno)); 5024 close(s); 5025 s = -1; 5026 errno = EINTR; /* set errno to EINTR so that the caller, 5027 which checks errno after a failed accept, 5028 omits the error printout, as it does for 5029 an interrupted accept */ 5030 } 5031 } 5032 return s; 5033 #else 5034 return accept4(fd, addr, addrlen, SOCK_NONBLOCK); 5035 #endif /* HAVE_ACCEPT4 */ 5036 } 5037 /* 5038 * Handle an incoming TCP connection. The connection is accepted and 5039 * a new TCP reader event handler is added. The TCP handler 5040 * is responsible for cleanup when the connection is closed. 5041 */ 5042 static void 5043 handle_tcp_accept(int fd, short event, void* arg) 5044 { 5045 struct tcp_accept_handler_data *data 5046 = (struct tcp_accept_handler_data *) arg; 5047 int s; 5048 int reject = 0; 5049 struct tcp_handler_data *tcp_data; 5050 region_type *tcp_region; 5051 #ifdef INET6 5052 struct sockaddr_storage addr; 5053 #else 5054 struct sockaddr_in addr; 5055 #endif 5056 socklen_t addrlen; 5057 struct timeval timeout; 5058 5059 if (!(event & EV_READ)) { 5060 return; 5061 } 5062 5063 if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) { 5064 reject = data->nsd->options->tcp_reject_overflow; 5065 if (!reject) { 5066 return; 5067 } 5068 } 5069 5070 /* Accept it... */ 5071 addrlen = sizeof(addr); 5072 s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen); 5073 if (s == -1) { 5074 /** 5075 * EMFILE and ENFILE signal that the limit of open 5076 * file descriptors has been reached; pause accept(). 5077 * EINTR is a signal interrupt. The others are various OS ways 5078 * of saying that the client has closed the connection.
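 * While paused, accept events stay disabled until
 * handle_slowaccept_timeout re-enables them after
 * SLOW_ACCEPT_TIMEOUT seconds.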
5079 */ 5080 if (errno == EMFILE || errno == ENFILE) { 5081 if (!slowaccept) { 5082 /* disable accept events */ 5083 struct timeval tv; 5084 configure_handler_event_types(0); 5085 tv.tv_sec = SLOW_ACCEPT_TIMEOUT; 5086 tv.tv_usec = 0L; 5087 memset(&slowaccept_event, 0, 5088 sizeof(slowaccept_event)); 5089 event_set(&slowaccept_event, -1, EV_TIMEOUT, 5090 handle_slowaccept_timeout, NULL); 5091 (void)event_base_set(data->event.ev_base, 5092 &slowaccept_event); 5093 (void)event_add(&slowaccept_event, &tv); 5094 slowaccept = 1; 5095 /* We don't want to spam the logs here */ 5096 } 5097 } else if (errno != EINTR 5098 && errno != EWOULDBLOCK 5099 #ifdef ECONNABORTED 5100 && errno != ECONNABORTED 5101 #endif /* ECONNABORTED */ 5102 #ifdef EPROTO 5103 && errno != EPROTO 5104 #endif /* EPROTO */ 5105 ) { 5106 log_msg(LOG_ERR, "accept failed: %s", strerror(errno)); 5107 } 5108 return; 5109 } 5110 5111 if (reject) { 5112 shutdown(s, SHUT_RDWR); 5113 close(s); 5114 return; 5115 } 5116 5117 /* 5118 * This region is deallocated when the TCP connection is 5119 * closed by the TCP handler. 5120 */ 5121 tcp_region = region_create(xalloc, free); 5122 tcp_data = (struct tcp_handler_data *) region_alloc( 5123 tcp_region, sizeof(struct tcp_handler_data)); 5124 tcp_data->region = tcp_region; 5125 tcp_data->query = query_create(tcp_region, compressed_dname_offsets, 5126 compression_table_size, compressed_dnames); 5127 tcp_data->nsd = data->nsd; 5128 tcp_data->query_count = 0; 5129 #ifdef HAVE_SSL 5130 tcp_data->shake_state = tls_hs_none; 5131 tcp_data->tls = NULL; 5132 #endif 5133 tcp_data->query_needs_reset = 1; 5134 tcp_data->pp2_enabled = data->pp2_enabled; 5135 tcp_data->pp2_header_state = pp2_header_none; 5136 tcp_data->prev = NULL; 5137 tcp_data->next = NULL; 5138 5139 tcp_data->query_state = QUERY_PROCESSED; 5140 tcp_data->bytes_transmitted = 0; 5141 memcpy(&tcp_data->query->remote_addr, &addr, addrlen); 5142 tcp_data->query->remote_addrlen = addrlen; 5143 /* Copy remote_address to client_address. 5144 * Simplest way/time for streams to do that. 
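 * When PROXYv2 is enabled, client_addr is overwritten again
 * once the header has been consumed.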
*/ 5145 memcpy(&tcp_data->query->client_addr, &addr, addrlen); 5146 tcp_data->query->client_addrlen = addrlen; 5147 tcp_data->query->is_proxied = 0; 5148 5149 tcp_data->tcp_no_more_queries = 0; 5150 tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000; 5151 if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) { 5152 /* very busy, give smaller timeout */ 5153 tcp_data->tcp_timeout = 200; 5154 } 5155 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5156 timeout.tv_sec = tcp_data->tcp_timeout / 1000; 5157 timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000; 5158 5159 #ifdef USE_DNSTAP 5160 /* save the address of the connection */ 5161 tcp_data->socket = data->socket; 5162 #endif /* USE_DNSTAP */ 5163 5164 #ifdef HAVE_SSL 5165 if (data->tls_accept) { 5166 tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s); 5167 if(!tcp_data->tls) { 5168 close(s); 5169 return; 5170 } 5171 tcp_data->shake_state = tls_hs_read; 5172 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5173 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 5174 handle_tls_reading, tcp_data); 5175 } else { 5176 #endif 5177 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5178 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 5179 handle_tcp_reading, tcp_data); 5180 #ifdef HAVE_SSL 5181 } 5182 #endif 5183 if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) { 5184 log_msg(LOG_ERR, "cannot set tcp event base"); 5185 close(s); 5186 region_destroy(tcp_region); 5187 return; 5188 } 5189 if(event_add(&tcp_data->event, &timeout) != 0) { 5190 log_msg(LOG_ERR, "cannot add tcp to event base"); 5191 close(s); 5192 region_destroy(tcp_region); 5193 return; 5194 } 5195 if(tcp_active_list) { 5196 tcp_active_list->prev = tcp_data; 5197 tcp_data->next = tcp_active_list; 5198 } 5199 tcp_active_list = tcp_data; 5200 5201 /* 5202 * Keep track of the total number of TCP handlers installed so 5203 * we can stop accepting connections when the maximum number 5204 * of simultaneous TCP connections is reached. 5205 * 5206 * If tcp-reject-overflow is enabled, however, then we do not 5207 * change the handler event type; we keep it as-is and accept 5208 * overflow TCP connections only so that we can forcibly kill 5209 * them off. 
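 * (such rejected connections are shut down right after accept above).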
5210 */ 5211 ++data->nsd->current_tcp_count; 5212 if (!data->nsd->options->tcp_reject_overflow && 5213 data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) 5214 { 5215 configure_handler_event_types(0); 5216 } 5217 } 5218 5219 static void 5220 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout) 5221 { 5222 size_t i; 5223 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 5224 for (i = 0; i < nsd->child_count; ++i) { 5225 if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) { 5226 if (write(nsd->children[i].child_fd, 5227 &command, 5228 sizeof(command)) == -1) 5229 { 5230 if(errno != EAGAIN && errno != EINTR) 5231 log_msg(LOG_ERR, "problems sending command %d to server %d: %s", 5232 (int) command, 5233 (int) nsd->children[i].pid, 5234 strerror(errno)); 5235 } else if (timeout > 0) { 5236 (void)block_read(NULL, 5237 nsd->children[i].child_fd, 5238 &command, sizeof(command), timeout); 5239 } 5240 fsync(nsd->children[i].child_fd); 5241 close(nsd->children[i].child_fd); 5242 nsd->children[i].child_fd = -1; 5243 } 5244 } 5245 } 5246 5247 static void 5248 send_children_quit(struct nsd* nsd) 5249 { 5250 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit")); 5251 send_children_command(nsd, NSD_QUIT, 0); 5252 } 5253 5254 static void 5255 send_children_quit_and_wait(struct nsd* nsd) 5256 { 5257 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait")); 5258 send_children_command(nsd, NSD_QUIT_CHILD, 3); 5259 } 5260 5261 #ifdef BIND8_STATS 5262 static void 5263 set_children_stats(struct nsd* nsd) 5264 { 5265 size_t i; 5266 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 5267 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children")); 5268 for (i = 0; i < nsd->child_count; ++i) { 5269 nsd->children[i].need_to_send_STATS = 1; 5270 nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE; 5271 } 5272 } 5273 #endif /* BIND8_STATS */ 5274 5275 static void 5276 configure_handler_event_types(short event_types) 5277 { 5278 size_t i; 5279 5280 for (i = 0; i < tcp_accept_handler_count; ++i) { 5281 struct event* handler = &tcp_accept_handlers[i].event; 5282 if(event_types) { 5283 /* reassign */ 5284 int fd = handler->ev_fd; 5285 struct event_base* base = handler->ev_base; 5286 if(tcp_accept_handlers[i].event_added) 5287 event_del(handler); 5288 memset(handler, 0, sizeof(*handler)); 5289 event_set(handler, fd, event_types, 5290 handle_tcp_accept, &tcp_accept_handlers[i]); 5291 if(event_base_set(base, handler) != 0) 5292 log_msg(LOG_ERR, "conhand: cannot event_base"); 5293 if(event_add(handler, NULL) != 0) 5294 log_msg(LOG_ERR, "conhand: cannot event_add"); 5295 tcp_accept_handlers[i].event_added = 1; 5296 } else { 5297 /* remove */ 5298 if(tcp_accept_handlers[i].event_added) { 5299 event_del(handler); 5300 tcp_accept_handlers[i].event_added = 0; 5301 } 5302 } 5303 } 5304 } 5305