/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
 */

/*! \file */

#include <sys/socket.h>
#include <sys/time.h>
#include <sys/uio.h>

#include <netinet/tcp.h>

#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <isc/buffer.h>
#include <isc/bufferlist.h>

#include <isc/list.h>
#include <isc/log.h>
#include <isc/net.h>
#include <isc/region.h>
#include <isc/socket.h>
#include <isc/task.h>
#include <isc/util.h>

#include "errno2result.h"

#include "socket_p.h"
#include "../task_p.h"

/*%
 * State carried across a select() call: the fd_sets handed to select()
 * plus its return value and the highest descriptor watched.
 */
struct isc_socketwait {
	fd_set *readset;
	fd_set *writeset;
	int nfds;
	int maxfd;
};

/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t. This is here so it can be easily changed if needed.
 */

/*%
 * Define what the possible "soft" errors can be. These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0. This is broken, but we have to
 * work around it here.
 */
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)

#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90) --  Function entry/exit and other tracing.
 * DLVL(60) --  Socket data send/receive
 * DLVL(50) --  Event tracing, including receiving/sending completion events.
 * DLVL(20) --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)

/* Internal readable/writable events are plain isc_event_t's. */
typedef isc_event_t intev_t;

/*!
 * IPv6 control information. If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */

/*%
 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */

/*%
 * Instead of calculating the cmsgbuf lengths every time we take
 * a rule of thumb approach - sizes are taken from x86_64 linux,
 * multiplied by 2, everything should fit. Those sizes are not
 * large enough to cause any concern.
 */
#define CMSG_SP_IN6PKT 40

#define CMSG_SP_TIMESTAMP 32

#define CMSG_SP_TCTOS 24

#define CMSG_SP_INT 24

#define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
#define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

struct isc_socket {
	/* Not locked. */
	isc_socketmgr_t		*manager;
	isc_sockettype_t	type;

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references;
	int			fd;
	int			pf;	/* protocol family (AF_INET/AF_INET6) */

	ISC_LIST(isc_socketevent_t)	send_list;
	ISC_LIST(isc_socketevent_t)	recv_list;
	isc_socket_connev_t	       *connect_ev;

	/*
	 * Internal events. Posted when a descriptor is readable or
	 * writable. These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;	/* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				connected : 1,
				connecting : 1,	/* connect pending */
				bound : 1,	/* bound to local addr */
				active : 1,	/* currently active */
				pktdscp : 1;	/* per packet dscp */
	unsigned int		dscp;
};

struct isc_socketmgr {
	/* Not locked. */
	int			fd_bufsize;
	unsigned int		maxsocks;

	isc_socket_t	      **fds;		/* fd -> socket map */
	int		       *fdstate;	/* fd -> CLOSED/MANAGED/CLOSE_PENDING */

	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t)	socklist;
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
	int			maxfd;
	unsigned int		refs;
};

static isc_socketmgr_t *socketmgr = NULL;

/* Values kept in manager->fdstate[]. */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)

static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp);
static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);

/* Messages understood by select_poke()/wakeup_socket(). */
#define SELECT_POKE_SHUTDOWN	(-1)
#define SELECT_POKE_READ	(-3)
#define SELECT_POKE_WRITE	(-4)
#define SELECT_POKE_CONNECT	(-4)	/*%< Same as _WRITE */
#define SELECT_POKE_CLOSE	(-5)

#define SOCK_DEAD(s)	((s)->references == 0)

/*%
 * Shortcut index arrays to get access to statistics counters.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};

/*
 * Log a printf-style message about 'sock'; if 'address' is non-NULL the
 * formatted peer address is included.  A no-op when 'level' would not
 * actually be logged.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...)
{
	char msgbuf[2048];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	va_list ap;

	/* Skip the formatting work entirely if nothing would be logged. */
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

/*
 * Start watching 'fd' for reads or writes, as selected by 'msg'
 * (SELECT_POKE_READ or SELECT_POKE_WRITE).  Always succeeds.
 */
static inline isc_result_t
watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

	if (msg == SELECT_POKE_READ)
		FD_SET(fd, manager->read_fds);
	if (msg == SELECT_POKE_WRITE)
		FD_SET(fd, manager->write_fds);

	return (result);
}

/*
 * Stop watching 'fd' for reads or writes; the inverse of watch_fd().
 */
static inline isc_result_t
unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

	if (msg == SELECT_POKE_READ)
		FD_CLR(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_CLR(fd, manager->write_fds);

	return (result);
}

/*
 * Act on a poke for 'fd'.  SELECT_POKE_CLOSE finishes a pending close
 * (unwatch + close(2)); otherwise, if the descriptor is still managed,
 * start watching it for the requested direction.
 */
static void
wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result;

	/*
	 * This is a wakeup on a socket. If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
		manager->fdstate[fd] = CLOSED;
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	if (manager->fdstate[fd] == CLOSE_PENDING) {

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}
	if (manager->fdstate[fd] != MANAGED) {
		return;
	}

	/*
	 * Set requested bit.
	 */
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
}

/*
 * Update the state of the socketmgr when something changes.
 */
static void
select_poke(isc_socketmgr_t *manager, int fd, int msg) {
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
	return;
}

/*
 * Make a fd non-blocking.
364 */ 365 static isc_result_t 366 make_nonblock(int fd) { 367 int ret; 368 int flags; 369 370 flags = fcntl(fd, F_GETFL, 0); 371 flags |= O_NONBLOCK; 372 ret = fcntl(fd, F_SETFL, flags); 373 374 if (ret == -1) { 375 UNEXPECTED_ERROR(__FILE__, __LINE__, 376 "fcntl(%d, F_SETFL, %d): %s", fd, flags, 377 strerror(errno)); 378 return (ISC_R_UNEXPECTED); 379 } 380 381 return (ISC_R_SUCCESS); 382 } 383 384 /* 385 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE. 386 * In order to ensure as much portability as possible, we provide wrapper 387 * functions of these macros. 388 * Note that cmsg_space() could run slow on OSes that do not have 389 * CMSG_SPACE. 390 */ 391 static inline socklen_t 392 cmsg_len(socklen_t len) { 393 return (CMSG_LEN(len)); 394 } 395 396 static inline socklen_t 397 cmsg_space(socklen_t len) { 398 return (CMSG_SPACE(len)); 399 } 400 401 /* 402 * Process control messages received on a socket. 403 */ 404 static void 405 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) { 406 struct cmsghdr *cmsgp; 407 struct in6_pktinfo *pktinfop; 408 void *timevalp; 409 410 /* 411 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined. 412 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined. 413 * They are all here, outside of the CPP tests, because it is 414 * more consistent with the usual ISC coding style. 
415 */ 416 UNUSED(sock); 417 UNUSED(msg); 418 UNUSED(dev); 419 420 if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC) 421 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC; 422 423 if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC) 424 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC; 425 426 if (msg->msg_controllen == 0U || msg->msg_control == NULL) 427 return; 428 429 timevalp = NULL; 430 pktinfop = NULL; 431 432 cmsgp = CMSG_FIRSTHDR(msg); 433 while (cmsgp != NULL) { 434 socket_log(sock, NULL, TRACE, 435 "processing cmsg %p", cmsgp); 436 437 if (cmsgp->cmsg_level == IPPROTO_IPV6 438 && cmsgp->cmsg_type == IPV6_PKTINFO) { 439 440 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp); 441 memmove(&dev->pktinfo, pktinfop, 442 sizeof(struct in6_pktinfo)); 443 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; 444 socket_log(sock, NULL, TRACE, 445 "interface received on ifindex %u", 446 dev->pktinfo.ipi6_ifindex); 447 if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) 448 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST; 449 goto next; 450 } 451 452 if (cmsgp->cmsg_level == SOL_SOCKET 453 && cmsgp->cmsg_type == SCM_TIMESTAMP) { 454 struct timeval tv; 455 timevalp = CMSG_DATA(cmsgp); 456 memmove(&tv, timevalp, sizeof(tv)); 457 TIMEVAL_TO_TIMESPEC(&tv, &dev->timestamp); 458 dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP; 459 goto next; 460 } 461 462 if (cmsgp->cmsg_level == IPPROTO_IPV6 463 && cmsgp->cmsg_type == IPV6_TCLASS) { 464 dev->dscp = *(int *)CMSG_DATA(cmsgp); 465 dev->dscp >>= 2; 466 dev->attributes |= ISC_SOCKEVENTATTR_DSCP; 467 goto next; 468 } 469 470 if (cmsgp->cmsg_level == IPPROTO_IP 471 && (cmsgp->cmsg_type == IP_TOS)) { 472 dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp); 473 dev->dscp >>= 2; 474 dev->attributes |= ISC_SOCKEVENTATTR_DSCP; 475 goto next; 476 } 477 next: 478 cmsgp = CMSG_NXTHDR(msg, cmsgp); 479 } 480 481 } 482 483 /* 484 * Construct an iov array and attach it to the msghdr passed in. 
 This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc_socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
	size_t write_count;
	size_t skip_count;
	struct cmsghdr *cmsgp;

	memset(msg, 0, sizeof(*msg));

	/* Connected sockets must not pass a destination address. */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	write_count = 0;
	iovcount = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
	skip_count = dev->n;
	while (buffer != NULL) {
		if (skip_count < isc_buffer_usedlength(buffer))
			break;
		skip_count -= isc_buffer_usedlength(buffer);
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_SEND);

		isc_buffer_usedregion(buffer, &used);

		if (used.length > 0) {
			/* skip_count is only nonzero for the first buffer. */
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	INSIST(skip_count == 0U);

 config:
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;

	/* Attach IPV6_PKTINFO so replies leave on the interface we got. */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
	{
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE,
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp = (struct cmsghdr *)cmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}

	/* Request minimum-MTU fragmentation behavior when asked. */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
	{
		int use_min_mtu = 1;	/* -1, 0, 1 */

		cmsgp = (struct cmsghdr *)(cmsgbuf +
					   msg->msg_controllen);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
	}

	/* -T dscp debugging check: assert the configured DSCP is in use. */
	if (isc_dscp_check_value > -1) {
		if (sock->type == isc_sockettype_udp)
			INSIST((int)dev->dscp == isc_dscp_check_value);
		else if (sock->type == isc_sockettype_tcp)
			INSIST((int)sock->dscp == isc_dscp_check_value);
	}

	/*
	 * Set the DSCP, either per-packet via a cmsg (when the platform
	 * supports it, sock->pktdscp) or by updating the socket-wide
	 * TOS/TCLASS option when the value changed.
	 */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
	{
		int dscp = (dev->dscp << 2) & 0xff;

		INSIST(dev->dscp < 0x40);

		if (sock->pf == AF_INET && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IP;
			cmsgp->cmsg_type = IP_TOS;
			cmsgp->cmsg_len = cmsg_len(sizeof(char));
			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
			       (void *)&dscp, sizeof(int)) < 0)
			{
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IP_TOS, %.02x)"
						 " %s: %s",
						 sock->fd, dscp >> 2,
						 "failed", strerror(errno));
			} else
				sock->dscp = dscp;
		}

		if (sock->pf == AF_INET6 && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IPV6;
			cmsgp->cmsg_type = IPV6_TCLASS;
			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
				       (void *)&dscp, sizeof(int)) < 0) {
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IPV6_TCLASS, "
						 "%.02x) %s: %s",
						 sock->fd, dscp >> 2,
						 "failed", strerror(errno));
			} else
				sock->dscp = dscp;
		}

		/* Zero the unused tail of the control buffer. */
		if (msg->msg_controllen != 0 &&
		    msg->msg_controllen < SENDCMSGBUFLEN)
		{
			memset(cmsgbuf + msg->msg_controllen, 0,
			       SENDCMSGBUFLEN - msg->msg_controllen);
		}
	}

	if (write_countp != NULL)
		*write_countp = write_count;
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
static void
build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t available;
	size_t read_count;

	memset(msg, 0, sizeof(struct msghdr));

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->peer_address;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = read_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip empty buffers.
	 */
	while (buffer != NULL) {
		if (isc_buffer_availablelength(buffer) != 0)
			break;
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	iovcount = 0;
	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_RECV);

		isc_buffer_availableregion(buffer, &available);

		if (available.length > 0) {
			iov[iovcount].iov_base = (void *)(available.base);
			iov[iovcount].iov_len = available.length;
			read_count += available.length;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

 config:

	/*
	 * If needed, set up to receive that one extra byte.
	 */
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

	msg->msg_control = cmsgbuf;
	msg->msg_controllen = RECVCMSGBUFLEN;
	msg->msg_flags = 0;

	if (read_countp != NULL)
		*read_countp = read_count;
}

/*
 * Fill in dev->address: for UDP use 'address' if given (else the peer
 * address); for TCP the peer address is authoritative and 'address'
 * must be NULL.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	if (sock->type == isc_sockettype_udp) {
		if (address != NULL)
			dev->address = *address;
		else
			dev->address = sock->peer_address;
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->peer_address;
	}
}

/*
 * Event destructor installed by allocate_socketevent(): checks the
 * bufferlist has been drained, then chains to the original destructor.
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *ev = (isc_socketevent_t *)event;

	INSIST(ISC_LIST_EMPTY(ev->bufferlist));

	(ev->destroy)(event);
}

/*
 * Allocate and initialize a socket event of the given type; returns NULL
 * on allocation failure.  Wraps the event's destructor so bufferlist
 * sanity can be checked at destroy time.
 */
static isc_socketevent_t *
allocate_socketevent(void *sender,
		     isc_eventtype_t eventtype, isc_taskaction_t action,
		     void *arg)
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sender,
						     eventtype, action, arg,
						     sizeof(*ev));

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNSET;
	ISC_LINK_INIT(ev, ev_link);
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
	ev->n = 0;
	ev->offset = 0;
	ev->attributes = 0;
	ev->destroy = ev->ev_destroy;
	ev->ev_destroy = destroy_socketevent;
	ev->dscp = 0;

	return (ev);
}

/* Return codes shared by doio_recv() and doio_send(). */
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

/*
 * Perform one recvmsg() on 'sock' for request 'dev' and classify the
 * outcome as one of the DOIO_* codes above, updating dev's buffers,
 * byte count, address, and cmsg-derived attributes on success.
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t *buffer;
	int recv_errno;
	union {
		struct msghdr msghdr;
		char m[RECVCMSGBUFLEN];
	} cmsgbuf;

	memset(&cmsgbuf, 0, sizeof(cmsgbuf));

	build_msghdr_recv(sock, cmsgbuf.m, dev, &msghdr, iov, &read_count);

	cc = recvmsg(sock->fd, &msghdr, 0);
	recv_errno = errno;

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno,
				   strerror(recv_errno));
		}

/* Connected sockets report the error; unconnected ones retry. */
#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		dev->result = _isc; \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		/* HPUX 11.11 can return EADDRNOTAVAIL. */
		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/* Should never get this one but it was seen. */
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
		/*
		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
		 * errors.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		dev->result = isc__errno2result(recv_errno);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
		if (cc == 0)
			return (DOIO_EOF);
		break;
	case isc_sockettype_udp:
		break;
	default:
		INSIST(0);
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Source port 0 is almost certainly forged; drop it. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT,
		   "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0U) {
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
			isc_buffer_add(buffer,
				       isc_buffer_availablelength(buffer));
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			POST(actual_count);
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0U);
		}
	}

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
 *			ISC_R_SUCCESS.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			dev->result contains the appropriate error.
 *
 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
 *			event was sent.  The operation should be retried.
 *
 *	No other return values are possible.
987 */ 988 static int 989 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) { 990 int cc; 991 struct iovec iov[MAXSCATTERGATHER_SEND]; 992 size_t write_count; 993 struct msghdr msghdr; 994 char addrbuf[ISC_SOCKADDR_FORMATSIZE]; 995 int attempts = 0; 996 int send_errno; 997 union { 998 struct msghdr msghdr; 999 char m[SENDCMSGBUFLEN]; 1000 } cmsgbuf; 1001 1002 memset(&cmsgbuf, 0, sizeof(cmsgbuf)); 1003 1004 build_msghdr_send(sock, cmsgbuf.m, dev, &msghdr, iov, &write_count); 1005 1006 resend: 1007 cc = sendmsg(sock->fd, &msghdr, 0); 1008 send_errno = errno; 1009 1010 /* 1011 * Check for error or block condition. 1012 */ 1013 if (cc < 0) { 1014 if (send_errno == EINTR && ++attempts < NRETRIES) 1015 goto resend; 1016 1017 if (SOFT_ERROR(send_errno)) { 1018 if (errno == EWOULDBLOCK || errno == EAGAIN) 1019 dev->result = ISC_R_WOULDBLOCK; 1020 return (DOIO_SOFT); 1021 } 1022 1023 #define SOFT_OR_HARD(_system, _isc) \ 1024 if (send_errno == _system) { \ 1025 if (sock->connected) { \ 1026 dev->result = _isc; \ 1027 return (DOIO_HARD); \ 1028 } \ 1029 return (DOIO_SOFT); \ 1030 } 1031 #define ALWAYS_HARD(_system, _isc) \ 1032 if (send_errno == _system) { \ 1033 dev->result = _isc; \ 1034 return (DOIO_HARD); \ 1035 } 1036 1037 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED); 1038 ALWAYS_HARD(EACCES, ISC_R_NOPERM); 1039 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 1040 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 1041 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH); 1042 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH); 1043 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH); 1044 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES); 1045 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH); 1046 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED); 1047 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET); 1048 1049 #undef SOFT_OR_HARD 1050 #undef ALWAYS_HARD 1051 1052 /* 1053 * The other error types depend on whether or not the 1054 * socket is UDP or TCP. 
	 * If it is UDP, some errors
	 * that we expect to be fatal under TCP are merely
	 * annoying, and are really soft errors.
	 *
	 * However, these soft errors are still returned as
	 * a status.
	 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strerror(send_errno));
		dev->result = isc__errno2result(send_errno);
		return (DOIO_HARD);
	}

	if (cc == 0) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() %s 0", "returned");
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Kill.
 *
 * Caller must ensure that the socket is not locked and no external
 * references exist.
 */
static void
socketclose(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	manager->fds[fd] = NULL;
	/* Defer the actual close(2): mark it pending and poke the watcher. */
	manager->fdstate[fd] = CLOSE_PENDING;
	select_poke(manager, fd, SELECT_POKE_CLOSE);

	if (sock->active == 1) {
		sock->active = 0;
	}

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
	if (manager->maxfd == fd) {
		int i;

		/* Scan downward for the next highest MANAGED descriptor. */
		manager->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			if (manager->fdstate[i] == MANAGED) {
				manager->maxfd = i;
				break;
			}
		}
	}

}

/*
 * Tear down a socket: close its descriptor (if still open), unlink it
 * from the manager's socket list, and release its memory via
 * free_socket().
 *
 * Caller must ensure the socket has no queued I/O, no pending connect
 * event, and no external references (see the INSISTs below).
 */
static void
destroy(isc_socket_t **sockp) {
	int fd;
	isc_socket_t *sock = *sockp;
	isc_socketmgr_t *manager = sock->manager;

	socket_log(sock, NULL, CREATION, "destroying");

	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);
	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);

	if (sock->fd >= 0) {
		fd = sock->fd;
		sock->fd = -1;	/* mark closed before handing the fd off */
		socketclose(manager, sock, fd);
	}

	ISC_LIST_UNLINK(manager->socklist, sock, link);

	/* can't unlock manager as its memory context is still used */
	free_socket(sockp);
}

/*
 * Allocate and initialize a new socket structure of the given 'type'
 * for 'manager'.  The OS descriptor is left unopened (fd == -1); see
 * opensocket().  On success the new socket is returned in '*socketp'
 * with a zero reference count.
 *
 * Returns ISC_R_SUCCESS or ISC_R_NOMEMORY.
 */
static isc_result_t
allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
		isc_socket_t **socketp)
{
	isc_socket_t *sock;

	sock = malloc(sizeof(*sock));

	if (sock == NULL)
		return (ISC_R_NOMEMORY);

	sock->references = 0;

	sock->manager = manager;
	sock->type = type;
	sock->fd = -1;
	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
	sock->active = 0;

	ISC_LINK_INIT(sock, link);

	/*
	 * Set up list of readers and writers to be initially empty.
	 */
	ISC_LIST_INIT(sock->recv_list);
	ISC_LIST_INIT(sock->send_list);
	sock->connect_ev = NULL;
	sock->pending_recv = 0;
	sock->pending_send = 0;
	sock->connected = 0;
	sock->connecting = 0;
	sock->bound = 0;
	sock->pktdscp = 0;

	/*
	 * Initialize readable and writable events.
	 */
	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
		       NULL, sock, sock, NULL);
	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
		       NULL, sock, sock, NULL);

	*socketp = sock;

	return (ISC_R_SUCCESS);
}

/*
 * Release the memory of a socket structure.  The caller must ensure
 * that the reference count is zero, that no I/O is pending, and that
 * the socket is not linked into any list (see the INSISTs below).
 * The fd must already have been closed and set to -1; this routine
 * only frees memory, it does not close the descriptor.
 */
static void
free_socket(isc_socket_t **socketp) {
	isc_socket_t *sock = *socketp;

	/* No references, no pending I/O, not on any list. */
	INSIST(sock->references == 0);
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(!ISC_LINK_LINKED(sock, link));

	free(sock);

	*socketp = NULL;
}

/*
 * Request the minimum MTU on an IPv6 socket (IPV6_USE_MIN_MTU).
 * Failures are deliberately ignored.
 */
static void
use_min_mtu(isc_socket_t *sock) {
	/* use minimum MTU */
	if (sock->pf == AF_INET6) {
		int on = 1;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				 (void *)&on, sizeof(on));
	}
}

/*
 * Set the TCP maximum segment size; a no-op for non-TCP sockets.
 * Failures are deliberately ignored.
 */
static void
set_tcp_maxseg(isc_socket_t *sock, int size) {
	if (sock->type == isc_sockettype_tcp)
		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
				 (void *)&size, sizeof(size));
}

/*
 * Open the OS descriptor for 'sock' (UDP datagram or TCP stream),
 * make it non-blocking, and apply per-type socket options.
 *
 * Returns:
 *	ISC_R_SUCCESS
 *	ISC_R_NORESOURCES	-- out of descriptors or buffers
 *	ISC_R_FAMILYNOSUPPORT	-- address family not supported
 *	ISC_R_UNEXPECTED	-- anything else
 */
static isc_result_t
opensocket(isc_socket_t *sock)
{
	isc_result_t result;
	const char *err = "socket";
	int on = 1;

	switch (sock->type) {
	case isc_sockettype_udp:
		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
		break;
	case isc_sockettype_tcp:
		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
		break;
	}

	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			/* Descriptor exhaustion is logged, then handled
			 * the same way as ENOBUFS. */
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strerror(errno));
			/* fallthrough */
		case ENOBUFS:
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "%s() %s: %s", err, "failed",
					 strerror(errno));
			return (ISC_R_UNEXPECTED);
		}
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		return (result);
	}

	/*
	 * Use minimum MTU if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

	if (sock->type == isc_sockettype_udp) {

		/* Ask the kernel to timestamp incoming packets. */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
			       (void *)&on, sizeof(on)) < 0
		    && errno != ENOPROTOOPT) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
					 sock->fd, "failed", strerror(errno));
			/* Press on... */
		}

		/* RFC 3542: receive destination address/interface info. */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd, "failed",
					 strerror(errno));
		}
	}

	if (sock->active == 0) {
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}

/*
 * Create a 'type' socket managed
 * by 'manager'.  Events will be posted to 'task' and when dispatched
 * 'action' will be called with 'arg' as the arg value.  The new
 * socket is returned in 'socketp'.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp)
{
	isc_socket_t *sock = NULL;
	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
	isc_result_t result;

	REQUIRE(socketp != NULL && *socketp == NULL);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	switch (sock->type) {
	case isc_sockettype_udp:
		/* Does this platform support per-packet DSCP for this af? */
#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
		break;
	case isc_sockettype_tcp:
		break;
	default:
		INSIST(0);
	}

	sock->pf = pf;

	result = opensocket(sock);
	if (result != ISC_R_SUCCESS) {
		free_socket(&sock);
		return (result);
	}

	sock->references = 1;
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	manager->fds[sock->fd] = sock;
	manager->fdstate[sock->fd] = MANAGED;

	ISC_LIST_APPEND(manager->socklist, sock, link);
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;

	socket_log(sock, NULL, CREATION, "created");

	return (ISC_R_SUCCESS);
}

/*%
 * Create a new 'type' socket managed by 'manager'.  Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
isc_result_t
isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
		  isc_socket_t **socketp)
{
	return (socket_create(manager0, pf, type, socketp));
}

/*
 * Attach to a socket.  Caller must explicitly detach when it is done.
 */
void
isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
	isc_socket_t *sock = (isc_socket_t *)sock0;

	REQUIRE(socketp != NULL && *socketp == NULL);

	sock->references++;

	*socketp = (isc_socket_t *)sock;
}

/*
 * Dereference a socket.  If this is the last reference to it, clean things
 * up by destroying the socket.
 */
void
isc_socket_detach(isc_socket_t **socketp) {
	isc_socket_t *sock;
	isc_boolean_t kill_socket = ISC_FALSE;

	REQUIRE(socketp != NULL);
	sock = (isc_socket_t *)*socketp;

	REQUIRE(sock->references > 0);
	sock->references--;
	if (sock->references == 0)
		kill_socket = ISC_TRUE;

	if (kill_socket)
		destroy(&sock);

	*socketp = NULL;
}

/*
 * I/O is possible on a given socket.  Schedule an event to this task that
 * will call an internal function to do the I/O.  This will charge the
 * task with the I/O operation and let our select loop handler get back
 * to doing something real as fast as possible.
 *
 * The socket and manager must be locked before calling this function.
 */
/*
 * Post this socket's internal readable event (internal_recv) to the
 * task owning the first queued receive request.  A socket reference
 * is added so the socket survives until the internal event runs.
 */
static void
dispatch_recv(isc_socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_recv);

	/* Nothing queued: nothing to dispatch. */
	ev = ISC_LIST_HEAD(sock->recv_list);
	if (ev == NULL)
		return;
	socket_log(sock, NULL, EVENT,
		   "dispatch_recv: event %p -> task %p",
		   ev, ev->ev_sender);
	sender = ev->ev_sender;

	sock->pending_recv = 1;
	iev = &sock->readable_ev;

	sock->references++;	/* keep socket alive for the internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_recv;
	iev->ev_arg = sock;

	isc_task_send(sender, (isc_event_t **)&iev);
}

/*
 * As dispatch_recv(), but for the send queue (internal_send).
 */
static void
dispatch_send(isc_socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_send);

	ev = ISC_LIST_HEAD(sock->send_list);
	if (ev == NULL)
		return;
	socket_log(sock, NULL, EVENT,
		   "dispatch_send: event %p -> task %p",
		   ev, ev->ev_sender);
	sender = ev->ev_sender;

	sock->pending_send = 1;
	iev = &sock->writable_ev;

	sock->references++;	/* keep socket alive for the internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_send;
	iev->ev_arg = sock;

	isc_task_send(sender, (isc_event_t **)&iev);
}

/*
 * Post the internal writable event (internal_connect) for a pending
 * connect to the task that requested the connect.
 */
static void
dispatch_connect(isc_socket_t *sock) {
	intev_t *iev;
	isc_socket_connev_t *ev;

	iev = &sock->writable_ev;

	ev = sock->connect_ev;
	INSIST(ev != NULL); /* XXX */

	INSIST(sock->connecting);

	sock->references++;	/* keep socket around for this internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_connect;
	iev->ev_arg = sock;

	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
}

/*
 * Dequeue an item off the given socket's read queue, set the result code
 * in the done event to the one provided, and send it to the task it was
 * destined for.
 *
 * If the event to be sent is on a list, remove it before sending.  If
 * asked to, send and detach from the socket as well.
 *
 * Caller must have the socket locked if the event is attached to the socket.
 */
static void
send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
	isc_task_t *task;

	task = (*dev)->ev_sender;

	(*dev)->ev_sender = sock;

	if (ISC_LINK_LINKED(*dev, ev_link))
		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);

	/* ATTACHED events carried a task reference; release it on send. */
	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
	    == ISC_SOCKEVENTATTR_ATTACHED)
		isc_task_sendanddetach(&task, (isc_event_t **)dev);
	else
		isc_task_send(task, (isc_event_t **)dev);
}

/*
 * See comments for send_recvdone_event() above.
 *
 * Caller must have the socket locked if the event is attached to the socket.
 */
static void
send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
	isc_task_t *task;

	INSIST(dev != NULL && *dev != NULL);

	task = (*dev)->ev_sender;
	(*dev)->ev_sender = sock;

	if (ISC_LINK_LINKED(*dev, ev_link))
		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);

	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
	    == ISC_SOCKEVENTATTR_ATTACHED)
		isc_task_sendanddetach(&task, (isc_event_t **)dev);
	else
		isc_task_send(task, (isc_event_t **)dev);
}

/*
 * Handle the internal readable event: drop the reference taken by
 * dispatch_recv(), then drain as many queued receive requests as the
 * socket will satisfy, re-poking the watcher if requests remain.
 */
static void
internal_recv(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc_socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);

	sock = ev->ev_sender;

	socket_log(sock, NULL, IOEVENT,
		   "internal_recv: task %p got event %p", me, ev);

	INSIST(sock->pending_recv == 1);
	sock->pending_recv = 0;

	INSIST(sock->references > 0);
	sock->references--;	/* the internal event is done with this socket */
	if (sock->references == 0) {
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	dev = ISC_LIST_HEAD(sock->recv_list);
	while (dev != NULL) {
		switch (doio_recv(sock, dev)) {
		case DOIO_SOFT:
			goto poke;

		case DOIO_EOF:
			/*
			 * read of 0 means the remote end was closed.
			 * Run through the event queue and dispatch all
			 * the events with an EOF result code.
			 */
			do {
				dev->result = ISC_R_EOF;
				send_recvdone_event(sock, &dev);
				dev = ISC_LIST_HEAD(sock->recv_list);
			} while (dev != NULL);
			goto poke;

		case DOIO_SUCCESS:
		case DOIO_HARD:
			send_recvdone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->recv_list);
	}

 poke:
	if (!ISC_LIST_EMPTY(sock->recv_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
}

/*
 * Handle the internal writable event: drop the reference taken by
 * dispatch_send(), then flush as many queued send requests as
 * possible, re-poking the watcher if any remain.
 */
static void
internal_send(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc_socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc_socket_t *)ev->ev_sender;
	socket_log(sock, NULL, IOEVENT,
		   "internal_send: task %p got event %p", me, ev);

	INSIST(sock->pending_send == 1);
	sock->pending_send = 0;

	INSIST(sock->references > 0);
	sock->references--;	/* the internal event is done with this socket */
	if (sock->references == 0) {
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
 */
	dev = ISC_LIST_HEAD(sock->send_list);
	while (dev != NULL) {
		switch (doio_send(sock, dev)) {
		case DOIO_SOFT:
			goto poke;

		case DOIO_HARD:
		case DOIO_SUCCESS:
			send_senddone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->send_list);
	}

 poke:
	if (!ISC_LIST_EMPTY(sock->send_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
}

/*
 * Process read/writes on each fd here.  Avoid locking
 * and unlocking twice if both reads and writes are possible.
 */
static void
process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
	   isc_boolean_t writeable)
{
	isc_socket_t *sock;
	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}

	sock = manager->fds[fd];
	if (readable) {
		if (sock == NULL) {
			unwatch_read = ISC_TRUE;
			goto check_write;
		}
		if (!SOCK_DEAD(sock)) {
			dispatch_recv(sock);
		}
		unwatch_read = ISC_TRUE;
	}
 check_write:
	if (writeable) {
		if (sock == NULL) {
			unwatch_write = ISC_TRUE;
			goto unlock_fd;
		}
		if (!SOCK_DEAD(sock)) {
			/* A writable fd completes a pending connect. */
			if (sock->connecting)
				dispatch_connect(sock);
			else
				dispatch_send(sock);
		}
		unwatch_write = ISC_TRUE;
	}

 unlock_fd:
	if (unwatch_read)
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
	if (unwatch_write)
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);

}

/*
 * Walk descriptors [0, maxfd) and process those that select() reported
 * ready.  NOTE(review): the loop excludes 'maxfd' itself, so the caller
 * presumably passes highest-fd+1 -- confirm against the call site.
 */
static void
process_fds(isc_socketmgr_t *manager, int maxfd, fd_set *readfds,
	    fd_set *writefds)
{
	int i;

	REQUIRE(maxfd <= (int)manager->maxsocks);

	for (i = 0; i < maxfd; i++) {
		process_fd(manager, i, FD_ISSET(i, readfds),
			   FD_ISSET(i, writefds));
	}
}

/*
 * Create a new socket manager.
 */

/*
 * Allocate and zero the four fd_set buffers used by the select()
 * watcher.  On any allocation failure, everything allocated so far is
 * freed and ISC_R_NOMEMORY is returned.
 */
static isc_result_t
setup_watcher(isc_socketmgr_t *manager) {
	isc_result_t result;

	UNUSED(result);

	manager->fd_bufsize = sizeof(fd_set);

	manager->read_fds = NULL;
	manager->read_fds_copy = NULL;
	manager->write_fds = NULL;
	manager->write_fds_copy = NULL;

	/* Each allocation is attempted only if the previous succeeded. */
	manager->read_fds = malloc(manager->fd_bufsize);
	if (manager->read_fds != NULL)
		manager->read_fds_copy = malloc(manager->fd_bufsize);
	if (manager->read_fds_copy != NULL)
		manager->write_fds = malloc(manager->fd_bufsize);
	if (manager->write_fds != NULL) {
		manager->write_fds_copy = malloc(manager->fd_bufsize);
	}
	if (manager->write_fds_copy == NULL) {
		/* Unwind whichever of the earlier buffers were obtained. */
		if (manager->write_fds != NULL) {
			free(manager->write_fds);
		}
		if (manager->read_fds_copy != NULL) {
			free(manager->read_fds_copy);
		}
		if (manager->read_fds != NULL) {
			free(manager->read_fds);
		}
		return (ISC_R_NOMEMORY);
	}
	memset(manager->read_fds, 0, manager->fd_bufsize);
	memset(manager->write_fds, 0, manager->fd_bufsize);

	manager->maxfd = 0;

	return (ISC_R_SUCCESS);
}

/*
 * Free the watcher's fd_set buffers (any of which may be NULL).
 */
static void
cleanup_watcher(isc_socketmgr_t *manager) {

	if (manager->read_fds != NULL)
		free(manager->read_fds);
	if (manager->read_fds_copy != NULL)
		free(manager->read_fds_copy);
	if (manager->write_fds != NULL)
		free(manager->write_fds);
	if (manager->write_fds_copy != NULL)
		free(manager->write_fds_copy);
}

/*
 * Create the singleton socket manager (or attach to it if it already
 * exists) with room for 'maxsocks' descriptors; 0 means FD_SETSIZE.
 */
static isc_result_t
isc_socketmgr_create2(isc_socketmgr_t **managerp,
		      unsigned int maxsocks)
{
	isc_socketmgr_t *manager;
	isc_result_t result;

	REQUIRE(managerp != NULL && *managerp == NULL);

	if (socketmgr != NULL) {
		/*
Don't allow maxsocks to be updated */ 1816 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks) 1817 return (ISC_R_EXISTS); 1818 1819 socketmgr->refs++; 1820 *managerp = (isc_socketmgr_t *)socketmgr; 1821 return (ISC_R_SUCCESS); 1822 } 1823 1824 if (maxsocks == 0) 1825 maxsocks = FD_SETSIZE; 1826 1827 manager = malloc(sizeof(*manager)); 1828 if (manager == NULL) 1829 return (ISC_R_NOMEMORY); 1830 1831 /* zero-clear so that necessary cleanup on failure will be easy */ 1832 memset(manager, 0, sizeof(*manager)); 1833 manager->maxsocks = maxsocks; 1834 manager->fds = reallocarray(NULL, manager->maxsocks, sizeof(isc_socket_t *)); 1835 if (manager->fds == NULL) { 1836 result = ISC_R_NOMEMORY; 1837 goto free_manager; 1838 } 1839 manager->fdstate = reallocarray(NULL, manager->maxsocks, sizeof(int)); 1840 if (manager->fdstate == NULL) { 1841 result = ISC_R_NOMEMORY; 1842 goto free_manager; 1843 } 1844 1845 memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *)); 1846 ISC_LIST_INIT(manager->socklist); 1847 1848 manager->refs = 1; 1849 1850 /* 1851 * Set up initial state for the select loop 1852 */ 1853 result = setup_watcher(manager); 1854 if (result != ISC_R_SUCCESS) 1855 goto cleanup; 1856 1857 memset(manager->fdstate, 0, manager->maxsocks * sizeof(int)); 1858 1859 socketmgr = manager; 1860 *managerp = (isc_socketmgr_t *)manager; 1861 1862 return (ISC_R_SUCCESS); 1863 1864 cleanup: 1865 1866 free_manager: 1867 if (manager->fdstate != NULL) { 1868 free(manager->fdstate); 1869 } 1870 if (manager->fds != NULL) { 1871 free(manager->fds); 1872 } 1873 free(manager); 1874 1875 return (result); 1876 } 1877 1878 isc_result_t 1879 isc_socketmgr_create(isc_socketmgr_t **managerp) { 1880 return (isc_socketmgr_create2(managerp, 0)); 1881 } 1882 1883 void 1884 isc_socketmgr_destroy(isc_socketmgr_t **managerp) { 1885 isc_socketmgr_t *manager; 1886 int i; 1887 1888 /* 1889 * Destroy a socket manager. 
1890 */ 1891 1892 REQUIRE(managerp != NULL); 1893 manager = (isc_socketmgr_t *)*managerp; 1894 1895 manager->refs--; 1896 if (manager->refs > 0) { 1897 *managerp = NULL; 1898 return; 1899 } 1900 socketmgr = NULL; 1901 1902 /* 1903 * Wait for all sockets to be destroyed. 1904 */ 1905 while (!ISC_LIST_EMPTY(manager->socklist)) { 1906 isc_taskmgr_dispatch(NULL); 1907 } 1908 1909 /* 1910 * Here, poke our select/poll thread. Do this by closing the write 1911 * half of the pipe, which will send EOF to the read half. 1912 * This is currently a no-op in the non-threaded case. 1913 */ 1914 select_poke(manager, 0, SELECT_POKE_SHUTDOWN); 1915 1916 /* 1917 * Clean up. 1918 */ 1919 cleanup_watcher(manager); 1920 1921 for (i = 0; i < (int)manager->maxsocks; i++) 1922 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */ 1923 (void)close(i); 1924 1925 free(manager->fds); 1926 free(manager->fdstate); 1927 1928 free(manager); 1929 1930 *managerp = NULL; 1931 1932 socketmgr = NULL; 1933 } 1934 1935 static isc_result_t 1936 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 1937 unsigned int flags) 1938 { 1939 int io_state; 1940 isc_task_t *ntask = NULL; 1941 isc_result_t result = ISC_R_SUCCESS; 1942 1943 dev->ev_sender = task; 1944 1945 if (sock->type == isc_sockettype_udp) { 1946 io_state = doio_recv(sock, dev); 1947 } else { 1948 if (ISC_LIST_EMPTY(sock->recv_list)) 1949 io_state = doio_recv(sock, dev); 1950 else 1951 io_state = DOIO_SOFT; 1952 } 1953 1954 switch (io_state) { 1955 case DOIO_SOFT: 1956 /* 1957 * We couldn't read all or part of the request right now, so 1958 * queue it. 1959 * 1960 * Attach to socket and to task 1961 */ 1962 isc_task_attach(task, &ntask); 1963 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 1964 1965 /* 1966 * Enqueue the request. If the socket was previously not being 1967 * watched, poke the watcher to start paying attention to it. 
1968 */ 1969 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv) 1970 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 1971 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link); 1972 1973 socket_log(sock, NULL, EVENT, 1974 "socket_recv: event %p -> task %p", 1975 dev, ntask); 1976 1977 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) 1978 result = ISC_R_INPROGRESS; 1979 break; 1980 1981 case DOIO_EOF: 1982 dev->result = ISC_R_EOF; 1983 /* fallthrough */ 1984 1985 case DOIO_HARD: 1986 case DOIO_SUCCESS: 1987 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) 1988 send_recvdone_event(sock, &dev); 1989 break; 1990 } 1991 1992 return (result); 1993 } 1994 1995 isc_result_t 1996 isc_socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist, 1997 unsigned int minimum, isc_task_t *task, 1998 isc_taskaction_t action, void *arg) 1999 { 2000 isc_socket_t *sock = (isc_socket_t *)sock0; 2001 isc_socketevent_t *dev; 2002 unsigned int iocount; 2003 isc_buffer_t *buffer; 2004 2005 REQUIRE(buflist != NULL); 2006 REQUIRE(!ISC_LIST_EMPTY(*buflist)); 2007 REQUIRE(task != NULL); 2008 REQUIRE(action != NULL); 2009 2010 iocount = isc_bufferlist_availablecount(buflist); 2011 REQUIRE(iocount > 0); 2012 2013 INSIST(sock->bound); 2014 2015 dev = allocate_socketevent(sock, 2016 ISC_SOCKEVENT_RECVDONE, action, arg); 2017 if (dev == NULL) 2018 return (ISC_R_NOMEMORY); 2019 2020 /* 2021 * UDP sockets are always partial read 2022 */ 2023 if (sock->type == isc_sockettype_udp) 2024 dev->minimum = 1; 2025 else { 2026 if (minimum == 0) 2027 dev->minimum = iocount; 2028 else 2029 dev->minimum = minimum; 2030 } 2031 2032 /* 2033 * Move each buffer from the passed in list to our internal one. 
 */
	buffer = ISC_LIST_HEAD(*buflist);
	while (buffer != NULL) {
		ISC_LIST_DEQUEUE(*buflist, buffer, link);
		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
		buffer = ISC_LIST_HEAD(*buflist);
	}

	return (socket_recv(sock, dev, task, 0));
}

/*
 * Start a send for 'dev' on 'sock'.  For UDP the write is attempted
 * immediately; for TCP it is attempted only when no other send is
 * queued (to preserve ordering).  A request that cannot complete right
 * now (DOIO_SOFT) is queued unless ISC_SOCKFLAG_NORETRY is set.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_INPROGRESS when the request was
 * queued and ISC_SOCKFLAG_IMMEDIATE was requested.
 */
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags)
{
	int io_state;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address)) {
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)", pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp)
		io_state = doio_send(sock, dev);
	else {
		if (ISC_LIST_EMPTY(sock->send_list))
			io_state = doio_send(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			if (ISC_LIST_EMPTY(sock->send_list) &&
			    !sock->pending_send)
				select_poke(sock->manager, sock->fd,
					    SELECT_POKE_WRITE);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);

			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p",
				   dev, ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
				result = ISC_R_INPROGRESS;
			break;
		}

		/* FALLTHROUGH */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* Completed (or failed hard) synchronously. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_senddone_event(sock, &dev);
		break;
	}

	return (result);
}

/*
 * Gather-send without an explicit destination: convenience wrapper
 * around isc_socket_sendtov2().
 */
isc_result_t
isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		 isc_task_t *task, isc_taskaction_t action, void *arg)
{
	return (isc_socket_sendtov2(sock, buflist, task, action, arg, NULL,
				    NULL, 0));
}

/*
 * Queue a gather send of 'buflist', optionally to a specific 'address'
 * with IPv6 'pktinfo'.  Buffers are moved from the caller's list onto
 * the event; 'action' is invoked from 'task' on completion.
 */
isc_result_t
isc_socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
		    isc_task_t *task, isc_taskaction_t action, void *arg,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		    unsigned int flags)
{
	isc_socket_t *sock = (isc_socket_t *)sock0;
	isc_socketevent_t *dev;
	unsigned int iocount;
	isc_buffer_t *buffer;

	REQUIRE(buflist != NULL);
	REQUIRE(!ISC_LIST_EMPTY(*buflist));
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	iocount = isc_bufferlist_usedcount(buflist);
	REQUIRE(iocount > 0);

	dev = allocate_socketevent(sock,
				   ISC_SOCKEVENT_SENDDONE, action, arg);
	if (dev == NULL)
		return (ISC_R_NOMEMORY);

	/*
	 * Move each buffer from the passed in list to our internal one.
	 */
	buffer = ISC_LIST_HEAD(*buflist);
	while (buffer != NULL) {
		ISC_LIST_DEQUEUE(*buflist, buffer, link);
		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
		buffer = ISC_LIST_HEAD(*buflist);
	}

	return (socket_send(sock, dev, task, address, pktinfo, flags));
}

/*
 * Bind 'sock' to 'sockaddr'.  SO_REUSEADDR is set first when
 * ISC_SOCKET_REUSEADDRESS is requested and a specific port is given.
 */
isc_result_t
isc_socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
		unsigned int options) {
	isc_socket_t *sock = (isc_socket_t *)sock0;
	int on = 1;

	INSIST(!sock->bound);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
		       sizeof(on)) < 0) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d) %s", sock->fd, "failed");
		/* Press on...
 */
	}
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strerror(errno));
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE, "bound");
	sock->bound = 1;

	return (ISC_R_SUCCESS);
}

/*
 * Connect 'sock' to 'addr'.  The connect is attempted immediately; if
 * it cannot complete at once, the request is queued and 'action' is
 * invoked from 'task' when internal_connect() sees the result.
 */
isc_result_t
isc_socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, void *arg)
{
	isc_socket_t *sock = (isc_socket_t *)sock0;
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc_socketmgr_t *manager;
	int cc;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];

	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	/* NOTE(review): duplicate of the REQUIRE(addr != NULL) above. */
	REQUIRE(addr != NULL);

	if (isc_sockaddr_ismulticast(addr))
		return (ISC_R_MULTICAST);

	REQUIRE(!sock->connecting);

	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
							ISC_SOCKEVENT_CONNECT,
							action, arg,
							sizeof(*dev));
	if (dev == NULL) {
		return (ISC_R_NOMEMORY);
	}
	ISC_LINK_INIT(dev, ev_link);

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * HP-UX "fails" to connect a UDP socket and sets errno to
		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
		 * a success and let the user detect it if it's really an error
		 * at the time of sending a packet on the socket.
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			cc = 0;
			goto success;
		}
		/* Retryable errors: queue the request for the watcher. */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
			goto queue;

		switch (errno) {
#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		/* Unrecognized errno: report and free the unused event. */
		sock->connected = 0;

		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
				 addrbuf, errno, strerror(errno));

		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		/* Known failure: deliver the done event with its result. */
		sock->connected = 0;
		isc_task_send(task, ISC_EVENT_PTR(&dev));

		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
 success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_send(task, ISC_EVENT_PTR(&dev));

		return (ISC_R_SUCCESS);
	}

 queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	sock->connecting = 1;

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	if (sock->connect_ev == NULL)
		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);

	sock->connect_ev = dev;

	return (ISC_R_SUCCESS);
}

/*
 * Called when a socket with a pending connect() finishes.
 */
static void
internal_connect(isc_task_t *me, isc_event_t *ev) {
	isc_socket_t *sock;
	isc_socket_connev_t *dev;
	isc_task_t *task;
	int cc;
	socklen_t optlen;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	UNUSED(me);
	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	sock = ev->ev_sender;

	/*
	 * When the internal event was sent the reference count was bumped
	 * to keep the socket around for us.  Decrement the count here.
	 */
	INSIST(sock->references > 0);
	sock->references--;
	if (sock->references == 0) {
		destroy(&sock);
		return;
	}

	/*
	 * Has this event been canceled?
	 */
	dev = sock->connect_ev;
	if (dev == NULL) {
		INSIST(!sock->connecting);
		return;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.
	 */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
		       (void *)&cc, (void *)&optlen) < 0)
		cc = errno;
	else
		errno = cc;

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			sock->connecting = 1;
			select_poke(sock->manager, sock->fd,
				    SELECT_POKE_CONNECT);
			return;
		}

		/*
		 * Translate other errors into ISC_R_* flavors.
2402 */ 2403 switch (errno) { 2404 #define ERROR_MATCH(a, b) case a: dev->result = b; break; 2405 ERROR_MATCH(EACCES, ISC_R_NOPERM); 2406 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 2407 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 2408 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 2409 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 2410 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 2411 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 2412 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 2413 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 2414 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 2415 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT); 2416 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 2417 #undef ERROR_MATCH 2418 default: 2419 dev->result = ISC_R_UNEXPECTED; 2420 isc_sockaddr_format(&sock->peer_address, peerbuf, 2421 sizeof(peerbuf)); 2422 UNEXPECTED_ERROR(__FILE__, __LINE__, 2423 "internal_connect: connect(%s) %s", 2424 peerbuf, strerror(errno)); 2425 } 2426 } else { 2427 dev->result = ISC_R_SUCCESS; 2428 sock->connected = 1; 2429 sock->bound = 1; 2430 } 2431 2432 sock->connect_ev = NULL; 2433 2434 task = dev->ev_sender; 2435 dev->ev_sender = sock; 2436 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 2437 } 2438 2439 /* 2440 * Run through the list of events on this socket, and cancel the ones 2441 * queued for task "task" of type "how". "how" is a bitmask. 2442 */ 2443 void 2444 isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) { 2445 isc_socket_t *sock = (isc_socket_t *)sock0; 2446 2447 /* 2448 * Quick exit if there is nothing to do. Don't even bother locking 2449 * in this case. 2450 */ 2451 if (how == 0) 2452 return; 2453 2454 /* 2455 * All of these do the same thing, more or less. 2456 * Each will: 2457 * o If the internal event is marked as "posted" try to 2458 * remove it from the task's queue. If this fails, mark it 2459 * as canceled instead, and let the task clean it up later. 
2460 * o For each I/O request for that task of that type, post 2461 * its done event with status of "ISC_R_CANCELED". 2462 * o Reset any state needed. 2463 */ 2464 if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) 2465 && !ISC_LIST_EMPTY(sock->recv_list)) { 2466 isc_socketevent_t *dev; 2467 isc_socketevent_t *next; 2468 isc_task_t *current_task; 2469 2470 dev = ISC_LIST_HEAD(sock->recv_list); 2471 2472 while (dev != NULL) { 2473 current_task = dev->ev_sender; 2474 next = ISC_LIST_NEXT(dev, ev_link); 2475 2476 if ((task == NULL) || (task == current_task)) { 2477 dev->result = ISC_R_CANCELED; 2478 send_recvdone_event(sock, &dev); 2479 } 2480 dev = next; 2481 } 2482 } 2483 2484 if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) 2485 && !ISC_LIST_EMPTY(sock->send_list)) { 2486 isc_socketevent_t *dev; 2487 isc_socketevent_t *next; 2488 isc_task_t *current_task; 2489 2490 dev = ISC_LIST_HEAD(sock->send_list); 2491 2492 while (dev != NULL) { 2493 current_task = dev->ev_sender; 2494 next = ISC_LIST_NEXT(dev, ev_link); 2495 2496 if ((task == NULL) || (task == current_task)) { 2497 dev->result = ISC_R_CANCELED; 2498 send_senddone_event(sock, &dev); 2499 } 2500 dev = next; 2501 } 2502 } 2503 2504 /* 2505 * Connecting is not a list. 2506 */ 2507 if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT) 2508 && sock->connect_ev != NULL) { 2509 isc_socket_connev_t *dev; 2510 isc_task_t *current_task; 2511 2512 INSIST(sock->connecting); 2513 sock->connecting = 0; 2514 2515 dev = sock->connect_ev; 2516 current_task = dev->ev_sender; 2517 2518 if ((task == NULL) || (task == current_task)) { 2519 sock->connect_ev = NULL; 2520 2521 dev->result = ISC_R_CANCELED; 2522 dev->ev_sender = sock; 2523 isc_task_sendanddetach(¤t_task, 2524 ISC_EVENT_PTR(&dev)); 2525 } 2526 } 2527 2528 } 2529 2530 /* 2531 * In our assumed scenario, we can simply use a single static object. 2532 * XXX: this is not true if the application uses multiple threads with 2533 * 'multi-context' mode. 
Fixing this is a future TODO item. 2534 */ 2535 static isc_socketwait_t swait_private; 2536 2537 int 2538 isc_socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp, 2539 isc_socketwait_t **swaitp) 2540 { 2541 isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0; 2542 int n; 2543 2544 REQUIRE(swaitp != NULL && *swaitp == NULL); 2545 2546 if (manager == NULL) 2547 manager = socketmgr; 2548 if (manager == NULL) 2549 return (0); 2550 2551 memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize); 2552 memmove(manager->write_fds_copy, manager->write_fds, 2553 manager->fd_bufsize); 2554 2555 swait_private.readset = manager->read_fds_copy; 2556 swait_private.writeset = manager->write_fds_copy; 2557 swait_private.maxfd = manager->maxfd + 1; 2558 2559 n = select(swait_private.maxfd, swait_private.readset, 2560 swait_private.writeset, NULL, tvp); 2561 2562 *swaitp = &swait_private; 2563 return (n); 2564 } 2565 2566 isc_result_t 2567 isc_socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) { 2568 isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0; 2569 2570 REQUIRE(swait == &swait_private); 2571 2572 if (manager == NULL) 2573 manager = socketmgr; 2574 if (manager == NULL) 2575 return (ISC_R_NOTFOUND); 2576 2577 process_fds(manager, swait->maxfd, swait->readset, swait->writeset); 2578 return (ISC_R_SUCCESS); 2579 } 2580