xref: /openbsd/usr.bin/dig/lib/isc/unix/socket.c (revision 09467b48)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/task.h>
41 #include <isc/util.h>
42 
43 #include "errno2result.h"
44 
45 #include "socket_p.h"
46 #include "../task_p.h"
47 
48 struct isc_socketwait {
49 	fd_set *readset;
50 	fd_set *writeset;
51 	int nfds;
52 	int maxfd;
53 };
54 
55 /*
56  * Set by the -T dscp option on the command line. If set to a value
57  * other than -1, we check to make sure DSCP values match it, and
58  * assert if not.
59  */
60 int isc_dscp_check_value = -1;
61 
62 /*%
63  * Some systems define the socket length argument as an int, some as size_t,
64  * some as socklen_t.  This is here so it can be easily changed if needed.
65  */
66 
67 /*%
68  * Define what the possible "soft" errors can be.  These are non-fatal returns
69  * of various network related functions, like recv() and so on.
70  *
71  * For some reason, BSDI (and perhaps others) will sometimes return <0
72  * from recv() but will have errno==0.  This is broken, but we have to
73  * work around it here.
74  */
75 #define SOFT_ERROR(e)	((e) == EAGAIN || \
76 			 (e) == EWOULDBLOCK || \
77 			 (e) == EINTR || \
78 			 (e) == 0)
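
/*
 * For illustration (sketch only): the typical use of SOFT_ERROR() is to
 * turn a transient failure into a retry rather than a hard error, as
 * doio_recv() and doio_send() do below:
 *
 *	cc = recvmsg(sock->fd, &msghdr, 0);
 *	if (cc < 0 && SOFT_ERROR(errno))
 *		return (DOIO_SOFT);
 *
 * The caller then requeues the operation and waits for the descriptor
 * to become ready again.
 */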
79 
80 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
81 
82 /*!<
83  * DLVL(90)  --  Function entry/exit and other tracing.
84  * DLVL(60)  --  Socket data send/receive
85  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
86  * DLVL(20)  --  Socket creation/destruction.
87  */
88 #define TRACE_LEVEL		90
89 #define IOEVENT_LEVEL		60
90 #define EVENT_LEVEL		50
91 #define CREATION_LEVEL		20
92 
93 #define TRACE		DLVL(TRACE_LEVEL)
94 #define IOEVENT		DLVL(IOEVENT_LEVEL)
95 #define EVENT		DLVL(EVENT_LEVEL)
96 #define CREATION	DLVL(CREATION_LEVEL)
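
/*
 * For illustration (sketch only): a call such as
 *
 *	socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
 *
 * expands TRACE into ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET,
 * ISC_LOG_DEBUG(90), so the message is emitted only when debug logging
 * is enabled at level 90 or higher.
 */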
97 
98 typedef isc_event_t intev_t;
99 
100 /*!
101  * IPv6 control information.  If the socket is an IPv6 socket we want
102  * to collect the destination address and interface so the client can
103  * set them on outgoing packets.
104  */
105 
106 /*%
107  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
108  * a setsockopt() like interface to request timestamps, and if the OS
109  * doesn't do it for us, call gettimeofday() on every UDP receive?
110  */
111 
112 /*%
113  * Instead of calculating the cmsgbuf lengths every time, we take a
114  * rule-of-thumb approach: the sizes below are taken from x86_64 Linux
115  * and multiplied by 2, so everything should fit.  These sizes are not
116  * large enough to cause any concern.
117  */
118 #define CMSG_SP_IN6PKT 40
119 
120 #define CMSG_SP_TIMESTAMP 32
121 
122 #define CMSG_SP_TCTOS 24
123 
124 #define CMSG_SP_INT 24
125 
126 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
127 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
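
/*
 * For reference, with the rule-of-thumb sizes above these expand to
 * RECVCMSGBUFLEN = 2*(40+32+24)+1 = 193 bytes and
 * SENDCMSGBUFLEN = 2*(40+24+24)+1 = 177 bytes.
 */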
128 
129 /*%
130  * The number of times a send operation is repeated if the result is EINTR.
131  */
132 #define NRETRIES 10
133 
134 struct isc_socket {
135 	/* Not locked. */
136 	isc_socketmgr_t	*manager;
137 	isc_sockettype_t	type;
138 
139 	/* Locked by socket lock. */
140 	ISC_LINK(isc_socket_t)	link;
141 	unsigned int		references;
142 	int			fd;
143 	int			pf;
144 
145 	ISC_LIST(isc_socketevent_t)		send_list;
146 	ISC_LIST(isc_socketevent_t)		recv_list;
147 	isc_socket_connev_t		       *connect_ev;
148 
149 	/*
150 	 * Internal events.  Posted when a descriptor is readable or
151 	 * writable.  These are statically allocated and never freed.
152 	 * They will be set to non-purgable before use.
153 	 */
154 	intev_t			readable_ev;
155 	intev_t			writable_ev;
156 
157 	isc_sockaddr_t		peer_address;       /* remote address */
158 
159 	unsigned int		pending_recv : 1,
160 				pending_send : 1,
161 				connected : 1,
162 				connecting : 1,     /* connect pending */
163 				bound : 1,          /* bound to local addr */
164 				active : 1,         /* currently active */
165 				pktdscp : 1;	    /* per packet dscp */
166 	unsigned int		dscp;
167 };
168 
169 struct isc_socketmgr {
170 	/* Not locked. */
171 	int			fd_bufsize;
172 	unsigned int		maxsocks;
173 
174 	isc_socket_t	       **fds;
175 	int			*fdstate;
176 
177 	/* Locked by manager lock. */
178 	ISC_LIST(isc_socket_t)	socklist;
179 	fd_set			*read_fds;
180 	fd_set			*read_fds_copy;
181 	fd_set			*write_fds;
182 	fd_set			*write_fds_copy;
183 	int			maxfd;
184 	unsigned int		refs;
185 };
186 
187 static isc_socketmgr_t *socketmgr = NULL;
188 
189 #define CLOSED			0	/* this one must be zero */
190 #define MANAGED			1
191 #define CLOSE_PENDING		2
192 
193 /*
194  * send() and recv() iovec counts
195  */
196 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
197 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
198 
199 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
200 				  isc_sockettype_t type,
201 				  isc_socket_t **socketp);
202 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
203 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
204 static void free_socket(isc_socket_t **);
205 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
206 				    isc_socket_t **);
207 static void destroy(isc_socket_t **);
208 static void internal_connect(isc_task_t *, isc_event_t *);
209 static void internal_recv(isc_task_t *, isc_event_t *);
210 static void internal_send(isc_task_t *, isc_event_t *);
211 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
212 static void build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *,
213 			      struct msghdr *, struct iovec *, size_t *);
214 static void build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *,
215 			      struct msghdr *, struct iovec *, size_t *);
216 
217 #define SELECT_POKE_SHUTDOWN		(-1)
218 #define SELECT_POKE_READ		(-3)
219 #define SELECT_POKE_WRITE		(-4)
220 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
221 #define SELECT_POKE_CLOSE		(-5)
222 
223 #define SOCK_DEAD(s)			((s)->references == 0)
224 
225 /*%
226  * Shortcut index arrays to get access to statistics counters.
227  */
228 enum {
229 	STATID_OPEN = 0,
230 	STATID_OPENFAIL = 1,
231 	STATID_CLOSE = 2,
232 	STATID_BINDFAIL = 3,
233 	STATID_CONNECTFAIL = 4,
234 	STATID_CONNECT = 5,
235 	STATID_ACCEPTFAIL = 6,
236 	STATID_ACCEPT = 7,
237 	STATID_SENDFAIL = 8,
238 	STATID_RECVFAIL = 9,
239 	STATID_ACTIVE = 10
240 };
241 
242 static void
243 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
244 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
245 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
246 static void
247 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
248 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
249 	   const char *fmt, ...)
250 {
251 	char msgbuf[2048];
252 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
253 	va_list ap;
254 
255 	if (! isc_log_wouldlog(isc_lctx, level))
256 		return;
257 
258 	va_start(ap, fmt);
259 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
260 	va_end(ap);
261 
262 	if (address == NULL) {
263 		isc_log_write(isc_lctx, category, module, level,
264 			       "socket %p: %s", sock, msgbuf);
265 	} else {
266 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
267 		isc_log_write(isc_lctx, category, module, level,
268 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
269 	}
270 }
271 
272 static inline isc_result_t
273 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
274 	isc_result_t result = ISC_R_SUCCESS;
275 
276 	if (msg == SELECT_POKE_READ)
277 		FD_SET(fd, manager->read_fds);
278 	if (msg == SELECT_POKE_WRITE)
279 		FD_SET(fd, manager->write_fds);
280 
281 	return (result);
282 }
283 
284 static inline isc_result_t
285 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
286 	isc_result_t result = ISC_R_SUCCESS;
287 
288 	if (msg == SELECT_POKE_READ)
289 		FD_CLR(fd, manager->read_fds);
290 	else if (msg == SELECT_POKE_WRITE)
291 		FD_CLR(fd, manager->write_fds);
292 
293 	return (result);
294 }
295 
296 static void
297 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
298 	isc_result_t result;
299 
300 	/*
301 	 * This is a wakeup on a socket.  If the socket is not in the
302 	 * process of being closed, start watching it for either reads
303 	 * or writes.
304 	 */
305 
306 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
307 
308 	if (msg == SELECT_POKE_CLOSE) {
309 		/* No one should be updating fdstate, so no need to lock it */
310 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
311 		manager->fdstate[fd] = CLOSED;
312 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
313 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
314 		(void)close(fd);
315 		return;
316 	}
317 
318 	if (manager->fdstate[fd] == CLOSE_PENDING) {
319 
320 		/*
321 		 * We accept (and ignore) any error from unwatch_fd() as we are
322 		 * closing the socket, hoping it doesn't leave dangling state in
323 		 * the kernel.
324 		 */
325 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
326 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
327 		return;
328 	}
329 	if (manager->fdstate[fd] != MANAGED) {
330 		return;
331 	}
332 
333 	/*
334 	 * Set requested bit.
335 	 */
336 	result = watch_fd(manager, fd, msg);
337 	if (result != ISC_R_SUCCESS) {
338 		/*
339 		 * XXXJT: what should we do?  Ignoring the failure of watching
340 		 * a socket will make the application dysfunctional, but there
341 		 * seems to be no reasonable recovery process.
342 		 */
343 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
344 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
345 			      "failed to start watching FD (%d): %s",
346 			      fd, isc_result_totext(result));
347 	}
348 }
349 
350 /*
351  * Update the state of the socketmgr when something changes.
352  */
353 static void
354 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
355 	if (msg == SELECT_POKE_SHUTDOWN)
356 		return;
357 	else if (fd >= 0)
358 		wakeup_socket(manager, fd, msg);
359 	return;
360 }
361 
362 /*
363  * Make a fd non-blocking.
364  */
365 static isc_result_t
366 make_nonblock(int fd) {
367 	int ret;
368 	int flags;
369 
370 	flags = fcntl(fd, F_GETFL, 0);
371 	flags |= O_NONBLOCK;
372 	ret = fcntl(fd, F_SETFL, flags);
373 
374 	if (ret == -1) {
375 		UNEXPECTED_ERROR(__FILE__, __LINE__,
376 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
377 				 strerror(errno));
378 		return (ISC_R_UNEXPECTED);
379 	}
380 
381 	return (ISC_R_SUCCESS);
382 }
383 
384 /*
385  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
386  * In order to ensure as much portability as possible, we provide wrapper
387  * functions of these macros.
388  * Note that cmsg_space() could run slow on OSes that do not have
389  * CMSG_SPACE.
390  */
391 static inline socklen_t
392 cmsg_len(socklen_t len) {
393 	return (CMSG_LEN(len));
394 }
395 
396 static inline socklen_t
397 cmsg_space(socklen_t len) {
398 	return (CMSG_SPACE(len));
399 }
400 
401 /*
402  * Process control messages received on a socket.
403  */
404 static void
405 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
406 	struct cmsghdr *cmsgp;
407 	struct in6_pktinfo *pktinfop;
408 	void *timevalp;
409 
410 	/*
411 	 * sock, msg and dev are always used below, so the UNUSED() calls
412 	 * that follow are strictly redundant.  They are kept here, outside
413 	 * of any CPP tests, because it is more consistent with the usual
414 	 * ISC coding style.
415 	 */
416 	UNUSED(sock);
417 	UNUSED(msg);
418 	UNUSED(dev);
419 
420 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
421 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
422 
423 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
424 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
425 
426 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
427 		return;
428 
429 	timevalp = NULL;
430 	pktinfop = NULL;
431 
432 	cmsgp = CMSG_FIRSTHDR(msg);
433 	while (cmsgp != NULL) {
434 		socket_log(sock, NULL, TRACE,
435 			   "processing cmsg %p", cmsgp);
436 
437 		if (cmsgp->cmsg_level == IPPROTO_IPV6
438 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
439 
440 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
441 			memmove(&dev->pktinfo, pktinfop,
442 				sizeof(struct in6_pktinfo));
443 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
444 			socket_log(sock, NULL, TRACE,
445 				   "interface received on ifindex %u",
446 				   dev->pktinfo.ipi6_ifindex);
447 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
448 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
449 			goto next;
450 		}
451 
452 		if (cmsgp->cmsg_level == SOL_SOCKET
453 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
454 			struct timeval tv;
455 			timevalp = CMSG_DATA(cmsgp);
456 			memmove(&tv, timevalp, sizeof(tv));
457 			TIMEVAL_TO_TIMESPEC(&tv, &dev->timestamp);
458 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
459 			goto next;
460 		}
461 
462 		if (cmsgp->cmsg_level == IPPROTO_IPV6
463 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
464 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
465 			dev->dscp >>= 2;
466 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
467 			goto next;
468 		}
469 
470 		if (cmsgp->cmsg_level == IPPROTO_IP
471 		    && (cmsgp->cmsg_type == IP_TOS)) {
472 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
473 			dev->dscp >>= 2;
474 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
475 			goto next;
476 		}
477 	next:
478 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
479 	}
480 
481 }
482 
483 /*
484  * Construct an iov array and attach it to the msghdr passed in.  This is
485  * the SEND constructor, which will use the used region of the buffer
486  * (if using a buffer list) or will use the internal region (if a single
487  * buffer I/O is requested).
488  *
489  * Nothing can be NULL, and the done event must list at least one buffer
490  * on the buffer linked list for this function to be meaningful.
491  *
492  * If write_countp != NULL, *write_countp will hold the number of bytes
493  * this transaction can send.
494  */
495 static void
496 build_msghdr_send(isc_socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
497 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
498 {
499 	unsigned int iovcount;
500 	isc_buffer_t *buffer;
501 	isc_region_t used;
502 	size_t write_count;
503 	size_t skip_count;
504 	struct cmsghdr *cmsgp;
505 
506 	memset(msg, 0, sizeof(*msg));
507 
508 	if (!sock->connected) {
509 		msg->msg_name = (void *)&dev->address.type.sa;
510 		msg->msg_namelen = dev->address.length;
511 	} else {
512 		msg->msg_name = NULL;
513 		msg->msg_namelen = 0;
514 	}
515 
516 	buffer = ISC_LIST_HEAD(dev->bufferlist);
517 	write_count = 0;
518 	iovcount = 0;
519 
520 	/*
521 	 * Single buffer I/O?  Skip what we've done so far in this region.
522 	 */
523 	if (buffer == NULL) {
524 		write_count = dev->region.length - dev->n;
525 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
526 		iov[0].iov_len = write_count;
527 		iovcount = 1;
528 
529 		goto config;
530 	}
531 
532 	/*
533 	 * Multibuffer I/O.
534 	 * Skip the data in the buffer list that we have already written.
535 	 */
536 	skip_count = dev->n;
537 	while (buffer != NULL) {
538 		if (skip_count < isc_buffer_usedlength(buffer))
539 			break;
540 		skip_count -= isc_buffer_usedlength(buffer);
541 		buffer = ISC_LIST_NEXT(buffer, link);
542 	}
543 
544 	while (buffer != NULL) {
545 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
546 
547 		isc_buffer_usedregion(buffer, &used);
548 
549 		if (used.length > 0) {
550 			iov[iovcount].iov_base = (void *)(used.base
551 							  + skip_count);
552 			iov[iovcount].iov_len = used.length - skip_count;
553 			write_count += (used.length - skip_count);
554 			skip_count = 0;
555 			iovcount++;
556 		}
557 		buffer = ISC_LIST_NEXT(buffer, link);
558 	}
559 
560 	INSIST(skip_count == 0U);
561 
562  config:
563 	msg->msg_iov = iov;
564 	msg->msg_iovlen = iovcount;
565 
566 	msg->msg_control = NULL;
567 	msg->msg_controllen = 0;
568 	msg->msg_flags = 0;
569 
570 	if ((sock->type == isc_sockettype_udp) &&
571 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
572 	{
573 		struct in6_pktinfo *pktinfop;
574 
575 		socket_log(sock, NULL, TRACE,
576 			   "sendto pktinfo data, ifindex %u",
577 			   dev->pktinfo.ipi6_ifindex);
578 
579 		msg->msg_control = (void *)cmsgbuf;
580 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
581 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
582 
583 		cmsgp = (struct cmsghdr *)cmsgbuf;
584 		cmsgp->cmsg_level = IPPROTO_IPV6;
585 		cmsgp->cmsg_type = IPV6_PKTINFO;
586 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
587 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
588 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
589 	}
590 
591 	if ((sock->type == isc_sockettype_udp) &&
592 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
593 	{
594 		int use_min_mtu = 1;	/* -1, 0, 1 */
595 
596 		cmsgp = (struct cmsghdr *)(cmsgbuf +
597 					   msg->msg_controllen);
598 
599 		msg->msg_control = (void *)cmsgbuf;
600 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
601 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
602 
603 		cmsgp->cmsg_level = IPPROTO_IPV6;
604 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
605 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
606 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
607 	}
608 
609 	if (isc_dscp_check_value > -1) {
610 		if (sock->type == isc_sockettype_udp)
611 			INSIST((int)dev->dscp == isc_dscp_check_value);
612 		else if (sock->type == isc_sockettype_tcp)
613 			INSIST((int)sock->dscp == isc_dscp_check_value);
614 	}
615 
616 	if ((sock->type == isc_sockettype_udp) &&
617 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
618 	{
619 		int dscp = (dev->dscp << 2) & 0xff;
620 
621 		INSIST(dev->dscp < 0x40);
622 
623 		if (sock->pf == AF_INET && sock->pktdscp) {
624 			cmsgp = (struct cmsghdr *)(cmsgbuf +
625 						   msg->msg_controllen);
626 			msg->msg_control = (void *)cmsgbuf;
627 			msg->msg_controllen += cmsg_space(sizeof(dscp));
628 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
629 
630 			cmsgp->cmsg_level = IPPROTO_IP;
631 			cmsgp->cmsg_type = IP_TOS;
632 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
633 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
634 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
635 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
636 			       (void *)&dscp, sizeof(int)) < 0)
637 			{
638 				UNEXPECTED_ERROR(__FILE__, __LINE__,
639 						 "setsockopt(%d, IP_TOS, %.02x)"
640 						 " %s: %s",
641 						 sock->fd, dscp >> 2,
642 						 "failed", strerror(errno));
643 			} else
644 				sock->dscp = dscp;
645 		}
646 
647 		if (sock->pf == AF_INET6 && sock->pktdscp) {
648 			cmsgp = (struct cmsghdr *)(cmsgbuf +
649 						   msg->msg_controllen);
650 			msg->msg_control = (void *)cmsgbuf;
651 			msg->msg_controllen += cmsg_space(sizeof(dscp));
652 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
653 
654 			cmsgp->cmsg_level = IPPROTO_IPV6;
655 			cmsgp->cmsg_type = IPV6_TCLASS;
656 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
657 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
658 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
659 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
660 				       (void *)&dscp, sizeof(int)) < 0) {
661 				UNEXPECTED_ERROR(__FILE__, __LINE__,
662 						 "setsockopt(%d, IPV6_TCLASS, "
663 						 "%.02x) %s: %s",
664 						 sock->fd, dscp >> 2,
665 						 "failed", strerror(errno));
666 			} else
667 				sock->dscp = dscp;
668 		}
669 
670 		if (msg->msg_controllen != 0 &&
671 		    msg->msg_controllen < SENDCMSGBUFLEN)
672 		{
673 			memset(cmsgbuf + msg->msg_controllen, 0,
674 			       SENDCMSGBUFLEN - msg->msg_controllen);
675 		}
676 	}
677 
678 	if (write_countp != NULL)
679 		*write_countp = write_count;
680 }
681 
682 /*
683  * Construct an iov array and attach it to the msghdr passed in.  This is
684  * the RECV constructor, which will use the available region of the buffer
685  * (if using a buffer list) or will use the internal region (if a single
686  * buffer I/O is requested).
687  *
688  * Nothing can be NULL, and the done event must list at least one buffer
689  * on the buffer linked list for this function to be meaningful.
690  *
691  * If read_countp != NULL, *read_countp will hold the number of bytes
692  * this transaction can receive.
693  */
694 static void
695 build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
696 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
697 {
698 	unsigned int iovcount;
699 	isc_buffer_t *buffer;
700 	isc_region_t available;
701 	size_t read_count;
702 
703 	memset(msg, 0, sizeof(struct msghdr));
704 
705 	if (sock->type == isc_sockettype_udp) {
706 		memset(&dev->address, 0, sizeof(dev->address));
707 		msg->msg_name = (void *)&dev->address.type.sa;
708 		msg->msg_namelen = sizeof(dev->address.type);
709 	} else { /* TCP */
710 		msg->msg_name = NULL;
711 		msg->msg_namelen = 0;
712 		dev->address = sock->peer_address;
713 	}
714 
715 	buffer = ISC_LIST_HEAD(dev->bufferlist);
716 	read_count = 0;
717 
718 	/*
719 	 * Single buffer I/O?  Skip what we've done so far in this region.
720 	 */
721 	if (buffer == NULL) {
722 		read_count = dev->region.length - dev->n;
723 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
724 		iov[0].iov_len = read_count;
725 		iovcount = 1;
726 
727 		goto config;
728 	}
729 
730 	/*
731 	 * Multibuffer I/O.
732 	 * Skip empty buffers.
733 	 */
734 	while (buffer != NULL) {
735 		if (isc_buffer_availablelength(buffer) != 0)
736 			break;
737 		buffer = ISC_LIST_NEXT(buffer, link);
738 	}
739 
740 	iovcount = 0;
741 	while (buffer != NULL) {
742 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
743 
744 		isc_buffer_availableregion(buffer, &available);
745 
746 		if (available.length > 0) {
747 			iov[iovcount].iov_base = (void *)(available.base);
748 			iov[iovcount].iov_len = available.length;
749 			read_count += available.length;
750 			iovcount++;
751 		}
752 		buffer = ISC_LIST_NEXT(buffer, link);
753 	}
754 
755  config:
756 
757 	/*
758 	 * If needed, set up to receive that one extra byte.
759 	 */
760 	msg->msg_iov = iov;
761 	msg->msg_iovlen = iovcount;
762 
763 	msg->msg_control = cmsgbuf;
764 	msg->msg_controllen = RECVCMSGBUFLEN;
765 	msg->msg_flags = 0;
766 
767 	if (read_countp != NULL)
768 		*read_countp = read_count;
769 }
770 
771 static void
772 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
773 		isc_socketevent_t *dev)
774 {
775 	if (sock->type == isc_sockettype_udp) {
776 		if (address != NULL)
777 			dev->address = *address;
778 		else
779 			dev->address = sock->peer_address;
780 	} else if (sock->type == isc_sockettype_tcp) {
781 		INSIST(address == NULL);
782 		dev->address = sock->peer_address;
783 	}
784 }
785 
786 static void
787 destroy_socketevent(isc_event_t *event) {
788 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
789 
790 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
791 
792 	(ev->destroy)(event);
793 }
794 
795 static isc_socketevent_t *
796 allocate_socketevent(void *sender,
797 		     isc_eventtype_t eventtype, isc_taskaction_t action,
798 		     void *arg)
799 {
800 	isc_socketevent_t *ev;
801 
802 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
803 						     eventtype, action, arg,
804 						     sizeof(*ev));
805 
806 	if (ev == NULL)
807 		return (NULL);
808 
809 	ev->result = ISC_R_UNSET;
810 	ISC_LINK_INIT(ev, ev_link);
811 	ISC_LIST_INIT(ev->bufferlist);
812 	ev->region.base = NULL;
813 	ev->n = 0;
814 	ev->offset = 0;
815 	ev->attributes = 0;
816 	ev->destroy = ev->ev_destroy;
817 	ev->ev_destroy = destroy_socketevent;
818 	ev->dscp = 0;
819 
820 	return (ev);
821 }
822 
823 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
824 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
825 #define DOIO_HARD		2	/* i/o error, event sent */
826 #define DOIO_EOF		3	/* EOF, no event sent */
827 
828 static int
829 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
830 	int cc;
831 	struct iovec iov[MAXSCATTERGATHER_RECV];
832 	size_t read_count;
833 	size_t actual_count;
834 	struct msghdr msghdr;
835 	isc_buffer_t *buffer;
836 	int recv_errno;
837 	union {
838 		struct msghdr msghdr;
839 		char m[RECVCMSGBUFLEN];
840 	} cmsgbuf;
841 
842 	memset(&cmsgbuf, 0, sizeof(cmsgbuf));
843 
844 	build_msghdr_recv(sock, cmsgbuf.m, dev, &msghdr, iov, &read_count);
845 
846 	cc = recvmsg(sock->fd, &msghdr, 0);
847 	recv_errno = errno;
848 
849 	if (cc < 0) {
850 		if (SOFT_ERROR(recv_errno))
851 			return (DOIO_SOFT);
852 
853 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
854 			socket_log(sock, NULL, IOEVENT,
855 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
856 				   sock->fd, cc, recv_errno,
857 				   strerror(recv_errno));
858 		}
859 
860 #define SOFT_OR_HARD(_system, _isc) \
861 	if (recv_errno == _system) { \
862 		if (sock->connected) { \
863 			dev->result = _isc; \
864 			return (DOIO_HARD); \
865 		} \
866 		return (DOIO_SOFT); \
867 	}
868 #define ALWAYS_HARD(_system, _isc) \
869 	if (recv_errno == _system) { \
870 		dev->result = _isc; \
871 		return (DOIO_HARD); \
872 	}
873 
874 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
875 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
876 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
877 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
878 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
879 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
880 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
881 		/* Should never get this one but it was seen. */
882 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
883 		/*
884 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
885 		 * errors.
886 		 */
887 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
888 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
889 
890 #undef SOFT_OR_HARD
891 #undef ALWAYS_HARD
892 
893 		dev->result = isc__errno2result(recv_errno);
894 		return (DOIO_HARD);
895 	}
896 
897 	/*
898 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
899 	 * while on UDP sockets, zero length reads are perfectly valid,
900 	 * although strange.
901 	 */
902 	switch (sock->type) {
903 	case isc_sockettype_tcp:
904 		if (cc == 0)
905 			return (DOIO_EOF);
906 		break;
907 	case isc_sockettype_udp:
908 		break;
909 	default:
910 		INSIST(0);
911 	}
912 
913 	if (sock->type == isc_sockettype_udp) {
914 		dev->address.length = msghdr.msg_namelen;
915 		if (isc_sockaddr_getport(&dev->address) == 0) {
916 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
917 				socket_log(sock, &dev->address, IOEVENT,
918 					   "dropping source port zero packet");
919 			}
920 			return (DOIO_SOFT);
921 		}
922 	}
923 
924 	socket_log(sock, &dev->address, IOEVENT,
925 		   "packet received correctly");
926 
927 	/*
928 	 * Overflow bit detection.  If we received MORE bytes than we should,
929 	 * this indicates an overflow situation.  Set the flag in the
930 	 * dev entry and adjust how much we read by one.
931 	 */
932 	/*
933 	 * If there are control messages attached, run through them and pull
934 	 * out the interesting bits.
935 	 */
936 	process_cmsg(sock, &msghdr, dev);
937 
938 	/*
939 	 * update the buffers (if any) and the i/o count
940 	 */
941 	dev->n += cc;
942 	actual_count = cc;
943 	buffer = ISC_LIST_HEAD(dev->bufferlist);
944 	while (buffer != NULL && actual_count > 0U) {
945 		if (isc_buffer_availablelength(buffer) <= actual_count) {
946 			actual_count -= isc_buffer_availablelength(buffer);
947 			isc_buffer_add(buffer,
948 				       isc_buffer_availablelength(buffer));
949 		} else {
950 			isc_buffer_add(buffer, actual_count);
951 			actual_count = 0;
952 			POST(actual_count);
953 			break;
954 		}
955 		buffer = ISC_LIST_NEXT(buffer, link);
956 		if (buffer == NULL) {
957 			INSIST(actual_count == 0U);
958 		}
959 	}
960 
961 	/*
962 	 * If we read less than we expected, update counters,
963 	 * and let the upper layer poke the descriptor.
964 	 */
965 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
966 		return (DOIO_SOFT);
967 
968 	/*
969 	 * Full reads are posted, or partials if partials are ok.
970 	 */
971 	dev->result = ISC_R_SUCCESS;
972 	return (DOIO_SUCCESS);
973 }
974 
975 /*
976  * Returns:
977  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
978  *			ISC_R_SUCCESS.
979  *
980  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
981  *			dev->result contains the appropriate error.
982  *
983  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
984  *			event was sent.  The operation should be retried.
985  *
986  *	No other return values are possible.
987  */
988 static int
989 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
990 	int cc;
991 	struct iovec iov[MAXSCATTERGATHER_SEND];
992 	size_t write_count;
993 	struct msghdr msghdr;
994 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
995 	int attempts = 0;
996 	int send_errno;
997 	union {
998 		struct msghdr msghdr;
999 		char m[SENDCMSGBUFLEN];
1000 	} cmsgbuf;
1001 
1002 	memset(&cmsgbuf, 0, sizeof(cmsgbuf));
1003 
1004 	build_msghdr_send(sock, cmsgbuf.m, dev, &msghdr, iov, &write_count);
1005 
1006  resend:
1007 	cc = sendmsg(sock->fd, &msghdr, 0);
1008 	send_errno = errno;
1009 
1010 	/*
1011 	 * Check for error or block condition.
1012 	 */
1013 	if (cc < 0) {
1014 		if (send_errno == EINTR && ++attempts < NRETRIES)
1015 			goto resend;
1016 
1017 		if (SOFT_ERROR(send_errno)) {
1018 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1019 				dev->result = ISC_R_WOULDBLOCK;
1020 			return (DOIO_SOFT);
1021 		}
1022 
1023 #define SOFT_OR_HARD(_system, _isc) \
1024 	if (send_errno == _system) { \
1025 		if (sock->connected) { \
1026 			dev->result = _isc; \
1027 			return (DOIO_HARD); \
1028 		} \
1029 		return (DOIO_SOFT); \
1030 	}
1031 #define ALWAYS_HARD(_system, _isc) \
1032 	if (send_errno == _system) { \
1033 		dev->result = _isc; \
1034 		return (DOIO_HARD); \
1035 	}
1036 
1037 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1038 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1039 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1040 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1041 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1042 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1043 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1044 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1045 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1046 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1047 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1048 
1049 #undef SOFT_OR_HARD
1050 #undef ALWAYS_HARD
1051 
1052 		/*
1053 		 * The other error types depend on whether or not the
1054 		 * socket is UDP or TCP.  If it is UDP, some errors
1055 		 * that we expect to be fatal under TCP are merely
1056 		 * annoying, and are really soft errors.
1057 		 *
1058 		 * However, these soft errors are still returned as
1059 		 * a status.
1060 		 */
1061 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1062 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1063 				 addrbuf, strerror(send_errno));
1064 		dev->result = isc__errno2result(send_errno);
1065 		return (DOIO_HARD);
1066 	}
1067 
1068 	if (cc == 0) {
1069 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1070 				 "doio_send: send() %s 0", "returned");
1071 	}
1072 
1073 	/*
1074 	 * If we write less than we expected, update counters, poke.
1075 	 */
1076 	dev->n += cc;
1077 	if ((size_t)cc != write_count)
1078 		return (DOIO_SOFT);
1079 
1080 	/*
1081 	 * Exactly what we wanted to write.  We're done with this
1082 	 * entry.  Post its completion event.
1083 	 */
1084 	dev->result = ISC_R_SUCCESS;
1085 	return (DOIO_SUCCESS);
1086 }
1087 
1088 /*
1089  * Kill.
1090  *
1091  * Caller must ensure that the socket is not locked and no external
1092  * references exist.
1093  */
1094 static void
1095 socketclose(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1096 	/*
1097 	 * No one has this socket open, so the watcher doesn't have to be
1098 	 * poked, and the socket doesn't have to be locked.
1099 	 */
1100 	manager->fds[fd] = NULL;
1101 	manager->fdstate[fd] = CLOSE_PENDING;
1102 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1103 
1104 	if (sock->active == 1) {
1105 		sock->active = 0;
1106 	}
1107 
1108 	/*
1109 	 * update manager->maxfd here (XXX: this should be implemented more
1110 	 * efficiently)
1111 	 */
1112 	if (manager->maxfd == fd) {
1113 		int i;
1114 
1115 		manager->maxfd = 0;
1116 		for (i = fd - 1; i >= 0; i--) {
1117 			if (manager->fdstate[i] == MANAGED) {
1118 				manager->maxfd = i;
1119 				break;
1120 			}
1121 		}
1122 	}
1123 
1124 }
1125 
1126 static void
1127 destroy(isc_socket_t **sockp) {
1128 	int fd;
1129 	isc_socket_t *sock = *sockp;
1130 	isc_socketmgr_t *manager = sock->manager;
1131 
1132 	socket_log(sock, NULL, CREATION, "destroying");
1133 
1134 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1135 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1136 	INSIST(sock->connect_ev == NULL);
1137 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1138 
1139 	if (sock->fd >= 0) {
1140 		fd = sock->fd;
1141 		sock->fd = -1;
1142 		socketclose(manager, sock, fd);
1143 	}
1144 
1145 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1146 
1147 	/* can't unlock manager as its memory context is still used */
1148 	free_socket(sockp);
1149 }
1150 
1151 static isc_result_t
1152 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1153 		isc_socket_t **socketp)
1154 {
1155 	isc_socket_t *sock;
1156 
1157 	sock = malloc(sizeof(*sock));
1158 
1159 	if (sock == NULL)
1160 		return (ISC_R_NOMEMORY);
1161 
1162 	sock->references = 0;
1163 
1164 	sock->manager = manager;
1165 	sock->type = type;
1166 	sock->fd = -1;
1167 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1168 	sock->active = 0;
1169 
1170 	ISC_LINK_INIT(sock, link);
1171 
1172 	/*
1173 	 * Set up list of readers and writers to be initially empty.
1174 	 */
1175 	ISC_LIST_INIT(sock->recv_list);
1176 	ISC_LIST_INIT(sock->send_list);
1177 	sock->connect_ev = NULL;
1178 	sock->pending_recv = 0;
1179 	sock->pending_send = 0;
1180 	sock->connected = 0;
1181 	sock->connecting = 0;
1182 	sock->bound = 0;
1183 	sock->pktdscp = 0;
1184 
1185 	/*
1186 	 * Initialize readable and writable events.
1187 	 */
1188 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1189 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1190 		       NULL, sock, sock, NULL);
1191 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1192 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1193 		       NULL, sock, sock, NULL);
1194 
1195 	*socketp = sock;
1196 
1197 	return (ISC_R_SUCCESS);
1198 }
1199 
1200 /*
1201  * This routine requires that the various lists be empty and that the
1202  * reference count be zero.  The other socket bits,
1203  * like the lock, must be initialized as well.  The fd associated with the
1204  * socket must already have been marked as closed, by setting it to -1
1205  * on close, before this routine is called.
1206  */
1207 static void
1208 free_socket(isc_socket_t **socketp) {
1209 	isc_socket_t *sock = *socketp;
1210 
1211 	INSIST(sock->references == 0);
1212 	INSIST(!sock->connecting);
1213 	INSIST(!sock->pending_recv);
1214 	INSIST(!sock->pending_send);
1215 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1216 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1217 	INSIST(!ISC_LINK_LINKED(sock, link));
1218 
1219 	free(sock);
1220 
1221 	*socketp = NULL;
1222 }
1223 
1224 static void
1225 use_min_mtu(isc_socket_t *sock) {
1226 	/* use minimum MTU */
1227 	if (sock->pf == AF_INET6) {
1228 		int on = 1;
1229 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1230 				(void *)&on, sizeof(on));
1231 	}
1232 }
1233 
1234 static void
1235 set_tcp_maxseg(isc_socket_t *sock, int size) {
1236 	if (sock->type == isc_sockettype_tcp)
1237 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1238 				(void *)&size, sizeof(size));
1239 }
1240 
1241 static isc_result_t
1242 opensocket(isc_socket_t *sock)
1243 {
1244 	isc_result_t result;
1245 	const char *err = "socket";
1246 	int on = 1;
1247 
1248 	switch (sock->type) {
1249 	case isc_sockettype_udp:
1250 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1251 		break;
1252 	case isc_sockettype_tcp:
1253 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1254 		break;
1255 	}
1256 
1257 	if (sock->fd < 0) {
1258 		switch (errno) {
1259 		case EMFILE:
1260 		case ENFILE:
1261 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1262 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1263 				       "%s: %s", err, strerror(errno));
1264 			/* fallthrough */
1265 		case ENOBUFS:
1266 			return (ISC_R_NORESOURCES);
1267 
1268 		case EPROTONOSUPPORT:
1269 		case EPFNOSUPPORT:
1270 		case EAFNOSUPPORT:
1271 		/*
1272 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1273 		 * EAFNOSUPPORT.
1274 		 */
1275 		case EINVAL:
1276 			return (ISC_R_FAMILYNOSUPPORT);
1277 
1278 		default:
1279 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1280 					 "%s() %s: %s", err, "failed",
1281 					 strerror(errno));
1282 			return (ISC_R_UNEXPECTED);
1283 		}
1284 	}
1285 
1286 	result = make_nonblock(sock->fd);
1287 	if (result != ISC_R_SUCCESS) {
1288 		(void)close(sock->fd);
1289 		return (result);
1290 	}
1291 
1292 	/*
1293 	 * Use minimum mtu if possible.
1294 	 */
1295 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1296 		use_min_mtu(sock);
1297 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1298 	}
1299 
1300 	if (sock->type == isc_sockettype_udp) {
1301 
1302 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1303 			       (void *)&on, sizeof(on)) < 0
1304 		    && errno != ENOPROTOOPT) {
1305 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1306 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1307 					 sock->fd, "failed", strerror(errno));
1308 			/* Press on... */
1309 		}
1310 
1311 		/* RFC 3542 */
1312 		if ((sock->pf == AF_INET6)
1313 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1314 				   (void *)&on, sizeof(on)) < 0)) {
1315 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1316 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1317 					 "%s: %s", sock->fd, "failed",
1318 					 strerror(errno));
1319 		}
1320 	}
1321 
1322 	if (sock->active == 0) {
1323 		sock->active = 1;
1324 	}
1325 
1326 	return (ISC_R_SUCCESS);
1327 }
1328 
1329 /*
1330  * Create a 'type' socket for protocol family 'pf', managed by
1331  * 'manager'.  The underlying descriptor is opened, made non-blocking,
1332  * and registered with the manager.  The new socket is returned
1333  * in 'socketp'.
1334  */
1335 static isc_result_t
1336 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1337 	      isc_socket_t **socketp)
1338 {
1339 	isc_socket_t *sock = NULL;
1340 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
1341 	isc_result_t result;
1342 
1343 	REQUIRE(socketp != NULL && *socketp == NULL);
1344 
1345 	result = allocate_socket(manager, type, &sock);
1346 	if (result != ISC_R_SUCCESS)
1347 		return (result);
1348 
1349 	switch (sock->type) {
1350 	case isc_sockettype_udp:
1351 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1352 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1353 		break;
1354 	case isc_sockettype_tcp:
1355 		break;
1356 	default:
1357 		INSIST(0);
1358 	}
1359 
1360 	sock->pf = pf;
1361 
1362 	result = opensocket(sock);
1363 	if (result != ISC_R_SUCCESS) {
1364 		free_socket(&sock);
1365 		return (result);
1366 	}
1367 
1368 	sock->references = 1;
1369 	*socketp = (isc_socket_t *)sock;
1370 
1371 	/*
1372 	 * Note we don't have to lock the socket like we normally would because
1373 	 * there are no external references to it yet.
1374 	 */
1375 
1376 	manager->fds[sock->fd] = sock;
1377 	manager->fdstate[sock->fd] = MANAGED;
1378 
1379 	ISC_LIST_APPEND(manager->socklist, sock, link);
1380 	if (manager->maxfd < sock->fd)
1381 		manager->maxfd = sock->fd;
1382 
1383 	socket_log(sock, NULL, CREATION, "created");
1384 
1385 	return (ISC_R_SUCCESS);
1386 }
1387 
1388 /*%
1389  * Create a new 'type' socket for protocol family 'pf', managed by
1390  * 'manager'.  This public entry point is a thin wrapper around
1391  * socket_create() above.  The new socket is returned in 'socketp'.
1392  *
1393  */
1394 isc_result_t
1395 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1396 		   isc_socket_t **socketp)
1397 {
1398 	return (socket_create(manager0, pf, type, socketp));
1399 }
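
/*
 * For illustration (sketch only, error handling elided): a typical
 * caller creates and later releases a socket like this:
 *
 *	isc_socket_t *sock = NULL;
 *	result = isc_socket_create(manager, AF_INET, isc_sockettype_udp,
 *				   &sock);
 *	if (result == ISC_R_SUCCESS) {
 *		... use the socket ...
 *		isc_socket_detach(&sock);
 *	}
 */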
1400 
1401 /*
1402  * Attach to a socket.  Caller must explicitly detach when it is done.
1403  */
1404 void
1405 isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1406 	isc_socket_t *sock = (isc_socket_t *)sock0;
1407 
1408 	REQUIRE(socketp != NULL && *socketp == NULL);
1409 
1410 	sock->references++;
1411 
1412 	*socketp = (isc_socket_t *)sock;
1413 }
1414 
1415 /*
1416  * Dereference a socket.  If this is the last reference to it, clean things
1417  * up by destroying the socket.
1418  */
1419 void
1420 isc_socket_detach(isc_socket_t **socketp) {
1421 	isc_socket_t *sock;
1422 	isc_boolean_t kill_socket = ISC_FALSE;
1423 
1424 	REQUIRE(socketp != NULL);
1425 	sock = (isc_socket_t *)*socketp;
1426 
1427 	REQUIRE(sock->references > 0);
1428 	sock->references--;
1429 	if (sock->references == 0)
1430 		kill_socket = ISC_TRUE;
1431 
1432 	if (kill_socket)
1433 		destroy(&sock);
1434 
1435 	*socketp = NULL;
1436 }
1437 
1438 /*
1439  * I/O is possible on a given socket.  Schedule an event to this task that
1440  * will call an internal function to do the I/O.  This will charge the
1441  * task with the I/O operation and let our select loop handler get back
1442  * to doing something real as fast as possible.
1443  *
1444  * The socket and manager must be locked before calling this function.
1445  */
1446 static void
1447 dispatch_recv(isc_socket_t *sock) {
1448 	intev_t *iev;
1449 	isc_socketevent_t *ev;
1450 	isc_task_t *sender;
1451 
1452 	INSIST(!sock->pending_recv);
1453 
1454 	ev = ISC_LIST_HEAD(sock->recv_list);
1455 	if (ev == NULL)
1456 		return;
1457 	socket_log(sock, NULL, EVENT,
1458 		   "dispatch_recv:  event %p -> task %p",
1459 		   ev, ev->ev_sender);
1460 	sender = ev->ev_sender;
1461 
1462 	sock->pending_recv = 1;
1463 	iev = &sock->readable_ev;
1464 
1465 	sock->references++;
1466 	iev->ev_sender = sock;
1467 	iev->ev_action = internal_recv;
1468 	iev->ev_arg = sock;
1469 
1470 	isc_task_send(sender, (isc_event_t **)&iev);
1471 }
1472 
1473 static void
1474 dispatch_send(isc_socket_t *sock) {
1475 	intev_t *iev;
1476 	isc_socketevent_t *ev;
1477 	isc_task_t *sender;
1478 
1479 	INSIST(!sock->pending_send);
1480 
1481 	ev = ISC_LIST_HEAD(sock->send_list);
1482 	if (ev == NULL)
1483 		return;
1484 	socket_log(sock, NULL, EVENT,
1485 		   "dispatch_send:  event %p -> task %p",
1486 		   ev, ev->ev_sender);
1487 	sender = ev->ev_sender;
1488 
1489 	sock->pending_send = 1;
1490 	iev = &sock->writable_ev;
1491 
1492 	sock->references++;
1493 	iev->ev_sender = sock;
1494 	iev->ev_action = internal_send;
1495 	iev->ev_arg = sock;
1496 
1497 	isc_task_send(sender, (isc_event_t **)&iev);
1498 }
1499 
1500 static void
1501 dispatch_connect(isc_socket_t *sock) {
1502 	intev_t *iev;
1503 	isc_socket_connev_t *ev;
1504 
1505 	iev = &sock->writable_ev;
1506 
1507 	ev = sock->connect_ev;
1508 	INSIST(ev != NULL); /* XXX */
1509 
1510 	INSIST(sock->connecting);
1511 
1512 	sock->references++;  /* keep socket around for this internal event */
1513 	iev->ev_sender = sock;
1514 	iev->ev_action = internal_connect;
1515 	iev->ev_arg = sock;
1516 
1517 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1518 }
1519 
1520 /*
1521  * Dequeue an item off the given socket's read queue, set the result code
1522  * in the done event to the one provided, and send it to the task it was
1523  * destined for.
1524  *
1525  * If the event to be sent is on a list, remove it before sending.  If
1526  * asked to, send and detach from the socket as well.
1527  *
1528  * Caller must have the socket locked if the event is attached to the socket.
1529  */
1530 static void
1531 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1532 	isc_task_t *task;
1533 
1534 	task = (*dev)->ev_sender;
1535 
1536 	(*dev)->ev_sender = sock;
1537 
1538 	if (ISC_LINK_LINKED(*dev, ev_link))
1539 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1540 
1541 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1542 	    == ISC_SOCKEVENTATTR_ATTACHED)
1543 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1544 	else
1545 		isc_task_send(task, (isc_event_t **)dev);
1546 }
1547 
1548 /*
1549  * See comments for send_recvdone_event() above.
1550  *
1551  * Caller must have the socket locked if the event is attached to the socket.
1552  */
1553 static void
1554 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1555 	isc_task_t *task;
1556 
1557 	INSIST(dev != NULL && *dev != NULL);
1558 
1559 	task = (*dev)->ev_sender;
1560 	(*dev)->ev_sender = sock;
1561 
1562 	if (ISC_LINK_LINKED(*dev, ev_link))
1563 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1564 
1565 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1566 	    == ISC_SOCKEVENTATTR_ATTACHED)
1567 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1568 	else
1569 		isc_task_send(task, (isc_event_t **)dev);
1570 }
1571 
1572 static void
1573 internal_recv(isc_task_t *me, isc_event_t *ev) {
1574 	isc_socketevent_t *dev;
1575 	isc_socket_t *sock;
1576 
1577 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1578 
1579 	sock = ev->ev_sender;
1580 
1581 	socket_log(sock, NULL, IOEVENT,
1582 		   "internal_recv: task %p got event %p", me, ev);
1583 
1584 	INSIST(sock->pending_recv == 1);
1585 	sock->pending_recv = 0;
1586 
1587 	INSIST(sock->references > 0);
1588 	sock->references--;  /* the internal event is done with this socket */
1589 	if (sock->references == 0) {
1590 		destroy(&sock);
1591 		return;
1592 	}
1593 
1594 	/*
1595 	 * Try to do as much I/O as possible on this socket.  There are no
1596 	 * limits here, currently.
1597 	 */
1598 	dev = ISC_LIST_HEAD(sock->recv_list);
1599 	while (dev != NULL) {
1600 		switch (doio_recv(sock, dev)) {
1601 		case DOIO_SOFT:
1602 			goto poke;
1603 
1604 		case DOIO_EOF:
1605 			/*
1606 			 * read of 0 means the remote end was closed.
1607 			 * Run through the event queue and dispatch all
1608 			 * the events with an EOF result code.
1609 			 */
1610 			do {
1611 				dev->result = ISC_R_EOF;
1612 				send_recvdone_event(sock, &dev);
1613 				dev = ISC_LIST_HEAD(sock->recv_list);
1614 			} while (dev != NULL);
1615 			goto poke;
1616 
1617 		case DOIO_SUCCESS:
1618 		case DOIO_HARD:
1619 			send_recvdone_event(sock, &dev);
1620 			break;
1621 		}
1622 
1623 		dev = ISC_LIST_HEAD(sock->recv_list);
1624 	}
1625 
1626  poke:
1627 	if (!ISC_LIST_EMPTY(sock->recv_list))
1628 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1629 }
1630 
1631 static void
1632 internal_send(isc_task_t *me, isc_event_t *ev) {
1633 	isc_socketevent_t *dev;
1634 	isc_socket_t *sock;
1635 
1636 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1637 
1638 	/*
1639 	 * Find out what socket this is and lock it.
1640 	 */
1641 	sock = (isc_socket_t *)ev->ev_sender;
1642 	socket_log(sock, NULL, IOEVENT,
1643 		   "internal_send: task %p got event %p", me, ev);
1644 
1645 	INSIST(sock->pending_send == 1);
1646 	sock->pending_send = 0;
1647 
1648 	INSIST(sock->references > 0);
1649 	sock->references--;  /* the internal event is done with this socket */
1650 	if (sock->references == 0) {
1651 		destroy(&sock);
1652 		return;
1653 	}
1654 
1655 	/*
1656 	 * Try to do as much I/O as possible on this socket.  There are no
1657 	 * limits here, currently.
1658 	 */
1659 	dev = ISC_LIST_HEAD(sock->send_list);
1660 	while (dev != NULL) {
1661 		switch (doio_send(sock, dev)) {
1662 		case DOIO_SOFT:
1663 			goto poke;
1664 
1665 		case DOIO_HARD:
1666 		case DOIO_SUCCESS:
1667 			send_senddone_event(sock, &dev);
1668 			break;
1669 		}
1670 
1671 		dev = ISC_LIST_HEAD(sock->send_list);
1672 	}
1673 
1674  poke:
1675 	if (!ISC_LIST_EMPTY(sock->send_list))
1676 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1677 }
1678 
1679 /*
1680  * Process read/writes on each fd here.  Avoid locking
1681  * and unlocking twice if both reads and writes are possible.
1682  */
1683 static void
1684 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
1685 	   isc_boolean_t writeable)
1686 {
1687 	isc_socket_t *sock;
1688 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1689 
1690 	/*
1691 	 * If the socket is going to be closed, don't do more I/O.
1692 	 */
1693 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1694 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1695 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1696 		return;
1697 	}
1698 
1699 	sock = manager->fds[fd];
1700 	if (readable) {
1701 		if (sock == NULL) {
1702 			unwatch_read = ISC_TRUE;
1703 			goto check_write;
1704 		}
1705 		if (!SOCK_DEAD(sock)) {
1706 			dispatch_recv(sock);
1707 		}
1708 		unwatch_read = ISC_TRUE;
1709 	}
1710 check_write:
1711 	if (writeable) {
1712 		if (sock == NULL) {
1713 			unwatch_write = ISC_TRUE;
1714 			goto unlock_fd;
1715 		}
1716 		if (!SOCK_DEAD(sock)) {
1717 			if (sock->connecting)
1718 				dispatch_connect(sock);
1719 			else
1720 				dispatch_send(sock);
1721 		}
1722 		unwatch_write = ISC_TRUE;
1723 	}
1724 
1725  unlock_fd:
1726 	if (unwatch_read)
1727 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1728 	if (unwatch_write)
1729 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1730 
1731 }
1732 
1733 static void
1734 process_fds(isc_socketmgr_t *manager, int maxfd, fd_set *readfds,
1735 	    fd_set *writefds)
1736 {
1737 	int i;
1738 
1739 	REQUIRE(maxfd <= (int)manager->maxsocks);
1740 
1741 	for (i = 0; i < maxfd; i++) {
1742 		process_fd(manager, i, FD_ISSET(i, readfds),
1743 			   FD_ISSET(i, writefds));
1744 	}
1745 }
1746 
1747 /*
1748  * Create a new socket manager.
1749  */
1750 
1751 static isc_result_t
1752 setup_watcher(isc_socketmgr_t *manager) {
1753 	isc_result_t result;
1754 
1755 	UNUSED(result);
1756 
1757 	manager->fd_bufsize = sizeof(fd_set);
1758 
1759 	manager->read_fds = NULL;
1760 	manager->read_fds_copy = NULL;
1761 	manager->write_fds = NULL;
1762 	manager->write_fds_copy = NULL;
1763 
1764 	manager->read_fds = malloc(manager->fd_bufsize);
1765 	if (manager->read_fds != NULL)
1766 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1767 	if (manager->read_fds_copy != NULL)
1768 		manager->write_fds = malloc(manager->fd_bufsize);
1769 	if (manager->write_fds != NULL) {
1770 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1771 	}
1772 	if (manager->write_fds_copy == NULL) {
1773 		if (manager->write_fds != NULL) {
1774 			free(manager->write_fds);
1775 		}
1776 		if (manager->read_fds_copy != NULL) {
1777 			free(manager->read_fds_copy);
1778 		}
1779 		if (manager->read_fds != NULL) {
1780 			free(manager->read_fds);
1781 		}
1782 		return (ISC_R_NOMEMORY);
1783 	}
1784 	memset(manager->read_fds, 0, manager->fd_bufsize);
1785 	memset(manager->write_fds, 0, manager->fd_bufsize);
1786 
1787 	manager->maxfd = 0;
1788 
1789 	return (ISC_R_SUCCESS);
1790 }
1791 
1792 static void
1793 cleanup_watcher(isc_socketmgr_t *manager) {
1794 
1795 	if (manager->read_fds != NULL)
1796 		free(manager->read_fds);
1797 	if (manager->read_fds_copy != NULL)
1798 		free(manager->read_fds_copy);
1799 	if (manager->write_fds != NULL)
1800 		free(manager->write_fds);
1801 	if (manager->write_fds_copy != NULL)
1802 		free(manager->write_fds_copy);
1803 }
1804 
1805 static isc_result_t
1806 isc_socketmgr_create2(isc_socketmgr_t **managerp,
1807 		       unsigned int maxsocks)
1808 {
1809 	isc_socketmgr_t *manager;
1810 	isc_result_t result;
1811 
1812 	REQUIRE(managerp != NULL && *managerp == NULL);
1813 
1814 	if (socketmgr != NULL) {
1815 		/* Don't allow maxsocks to be updated */
1816 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1817 			return (ISC_R_EXISTS);
1818 
1819 		socketmgr->refs++;
1820 		*managerp = (isc_socketmgr_t *)socketmgr;
1821 		return (ISC_R_SUCCESS);
1822 	}
1823 
1824 	if (maxsocks == 0)
1825 		maxsocks = FD_SETSIZE;
1826 
1827 	manager = malloc(sizeof(*manager));
1828 	if (manager == NULL)
1829 		return (ISC_R_NOMEMORY);
1830 
1831 	/* zero-clear so that necessary cleanup on failure will be easy */
1832 	memset(manager, 0, sizeof(*manager));
1833 	manager->maxsocks = maxsocks;
1834 	manager->fds = reallocarray(NULL, manager->maxsocks, sizeof(isc_socket_t *));
1835 	if (manager->fds == NULL) {
1836 		result = ISC_R_NOMEMORY;
1837 		goto free_manager;
1838 	}
1839 	manager->fdstate = reallocarray(NULL, manager->maxsocks, sizeof(int));
1840 	if (manager->fdstate == NULL) {
1841 		result = ISC_R_NOMEMORY;
1842 		goto free_manager;
1843 	}
1844 
1845 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
1846 	ISC_LIST_INIT(manager->socklist);
1847 
1848 	manager->refs = 1;
1849 
1850 	/*
1851 	 * Set up initial state for the select loop
1852 	 */
1853 	result = setup_watcher(manager);
1854 	if (result != ISC_R_SUCCESS)
1855 		goto cleanup;
1856 
1857 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1858 
1859 	socketmgr = manager;
1860 	*managerp = (isc_socketmgr_t *)manager;
1861 
1862 	return (ISC_R_SUCCESS);
1863 
1864 cleanup:
1865 
1866 free_manager:
1867 	if (manager->fdstate != NULL) {
1868 		free(manager->fdstate);
1869 	}
1870 	if (manager->fds != NULL) {
1871 		free(manager->fds);
1872 	}
1873 	free(manager);
1874 
1875 	return (result);
1876 }
1877 
1878 isc_result_t
1879 isc_socketmgr_create(isc_socketmgr_t **managerp) {
1880 	return (isc_socketmgr_create2(managerp, 0));
1881 }
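
/*
 * For illustration (sketch only): a typical caller sets up and tears
 * down the manager like this:
 *
 *	isc_socketmgr_t *mgr = NULL;
 *	if (isc_socketmgr_create(&mgr) == ISC_R_SUCCESS) {
 *		... create sockets, dispatch tasks ...
 *		isc_socketmgr_destroy(&mgr);
 *	}
 */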
1882 
1883 void
1884 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
1885 	isc_socketmgr_t *manager;
1886 	int i;
1887 
1888 	/*
1889 	 * Destroy a socket manager.
1890 	 */
1891 
1892 	REQUIRE(managerp != NULL);
1893 	manager = (isc_socketmgr_t *)*managerp;
1894 
1895 	manager->refs--;
1896 	if (manager->refs > 0) {
1897 		*managerp = NULL;
1898 		return;
1899 	}
1900 	socketmgr = NULL;
1901 
1902 	/*
1903 	 * Wait for all sockets to be destroyed.
1904 	 */
1905 	while (!ISC_LIST_EMPTY(manager->socklist)) {
1906 		isc_taskmgr_dispatch(NULL);
1907 	}
1908 
1909 	/*
1910 	 * Here, poke our select/poll thread.  Do this by closing the write
1911 	 * half of the pipe, which will send EOF to the read half.
1912 	 * This is currently a no-op in the non-threaded case.
1913 	 */
1914 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
1915 
1916 	/*
1917 	 * Clean up.
1918 	 */
1919 	cleanup_watcher(manager);
1920 
1921 	for (i = 0; i < (int)manager->maxsocks; i++)
1922 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
1923 			(void)close(i);
1924 
1925 	free(manager->fds);
1926 	free(manager->fdstate);
1927 
1928 	free(manager);
1929 
1930 	*managerp = NULL;
1931 
1932 	socketmgr = NULL;
1933 }
1934 
1935 static isc_result_t
1936 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
1937 	    unsigned int flags)
1938 {
1939 	int io_state;
1940 	isc_task_t *ntask = NULL;
1941 	isc_result_t result = ISC_R_SUCCESS;
1942 
1943 	dev->ev_sender = task;
1944 
1945 	if (sock->type == isc_sockettype_udp) {
1946 		io_state = doio_recv(sock, dev);
1947 	} else {
1948 		if (ISC_LIST_EMPTY(sock->recv_list))
1949 			io_state = doio_recv(sock, dev);
1950 		else
1951 			io_state = DOIO_SOFT;
1952 	}
1953 
1954 	switch (io_state) {
1955 	case DOIO_SOFT:
1956 		/*
1957 		 * We could not complete the request right now, so queue it.
1958 		 *
1959 		 * Attach to the task so the completion event can be
1960 		 * delivered to it later.
1961 		 */
1962 		isc_task_attach(task, &ntask);
1963 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
1964 
1965 		/*
1966 		 * Enqueue the request.  If the socket was previously not being
1967 		 * watched, poke the watcher to start paying attention to it.
1968 		 */
1969 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
1970 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1971 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
1972 
1973 		socket_log(sock, NULL, EVENT,
1974 			   "socket_recv: event %p -> task %p",
1975 			   dev, ntask);
1976 
1977 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
1978 			result = ISC_R_INPROGRESS;
1979 		break;
1980 
1981 	case DOIO_EOF:
1982 		dev->result = ISC_R_EOF;
1983 		/* fallthrough */
1984 
1985 	case DOIO_HARD:
1986 	case DOIO_SUCCESS:
1987 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
1988 			send_recvdone_event(sock, &dev);
1989 		break;
1990 	}
1991 
1992 	return (result);
1993 }
1994 
1995 isc_result_t
1996 isc_socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
1997 		  unsigned int minimum, isc_task_t *task,
1998 		  isc_taskaction_t action, void *arg)
1999 {
2000 	isc_socket_t *sock = (isc_socket_t *)sock0;
2001 	isc_socketevent_t *dev;
2002 	unsigned int iocount;
2003 	isc_buffer_t *buffer;
2004 
2005 	REQUIRE(buflist != NULL);
2006 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2007 	REQUIRE(task != NULL);
2008 	REQUIRE(action != NULL);
2009 
2010 	iocount = isc_bufferlist_availablecount(buflist);
2011 	REQUIRE(iocount > 0);
2012 
2013 	INSIST(sock->bound);
2014 
2015 	dev = allocate_socketevent(sock,
2016 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2017 	if (dev == NULL)
2018 		return (ISC_R_NOMEMORY);
2019 
2020 	/*
2021 	 * UDP sockets always use partial reads: one datagram completes the request.
2022 	 */
2023 	if (sock->type == isc_sockettype_udp)
2024 		dev->minimum = 1;
2025 	else {
2026 		if (minimum == 0)
2027 			dev->minimum = iocount;
2028 		else
2029 			dev->minimum = minimum;
2030 	}
2031 
2032 	/*
2033 	 * Move each buffer from the passed in list to our internal one.
2034 	 */
2035 	buffer = ISC_LIST_HEAD(*buflist);
2036 	while (buffer != NULL) {
2037 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2038 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2039 		buffer = ISC_LIST_HEAD(*buflist);
2040 	}
2041 
2042 	return (socket_recv(sock, dev, task, 0));
2043 }
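
/*%
 * Illustrative caller-side sketch (not part of this file), assuming
 * the caller owns an initialized isc_buffer_t 'b' with free space and
 * a task 'task'; 'recv_done', 'process_reply' and 'arg' are
 * hypothetical:
 *
 *	static void
 *	recv_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socketevent_t *sev = (isc_socketevent_t *)event;
 *
 *		UNUSED(task);
 *		if (sev->result == ISC_R_SUCCESS)
 *			process_reply(&sev->bufferlist, sev->n);
 *		isc_event_free(&event);
 *	}
 *
 *	isc_bufferlist_t bl;
 *	ISC_LIST_INIT(bl);
 *	ISC_LINK_INIT(b, link);
 *	ISC_LIST_ENQUEUE(bl, b, link);
 *	result = isc_socket_recvv(sock, &bl, 1, task, recv_done, arg);
 *
 * The buffers are moved onto the event's internal list and handed
 * back to 'recv_done' in the ISC_SOCKEVENT_RECVDONE event.
 */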
2044 
2045 static isc_result_t
2046 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2047 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2048 	    unsigned int flags)
2049 {
2050 	int io_state;
2051 	isc_task_t *ntask = NULL;
2052 	isc_result_t result = ISC_R_SUCCESS;
2053 
2054 	dev->ev_sender = task;
2055 
2056 	set_dev_address(address, sock, dev);
2057 	if (pktinfo != NULL) {
2058 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2059 		dev->pktinfo = *pktinfo;
2060 
2061 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2062 		    !isc_sockaddr_islinklocal(&dev->address)) {
2063 			socket_log(sock, NULL, TRACE,
2064 				   "pktinfo structure provided, ifindex %u "
2065 				   "(set to 0)", pktinfo->ipi6_ifindex);
2066 
2067 			/*
2068 			 * Set the pktinfo index to 0 here, to let the
2069 			 * kernel decide what interface it should send on.
2070 			 */
2071 			dev->pktinfo.ipi6_ifindex = 0;
2072 		}
2073 	}
2074 
2075 	if (sock->type == isc_sockettype_udp)
2076 		io_state = doio_send(sock, dev);
2077 	else {
2078 		if (ISC_LIST_EMPTY(sock->send_list))
2079 			io_state = doio_send(sock, dev);
2080 		else
2081 			io_state = DOIO_SOFT;
2082 	}
2083 
2084 	switch (io_state) {
2085 	case DOIO_SOFT:
2086 		/*
2087 		 * We could not complete the send right now, so queue it
2088 		 * unless ISC_SOCKFLAG_NORETRY is set.
2089 		 */
2090 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2091 			isc_task_attach(task, &ntask);
2092 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2093 
2094 			/*
2095 			 * Enqueue the request.  If the socket was previously
2096 			 * not being watched, poke the watcher to start
2097 			 * paying attention to it.
2098 			 */
2099 			if (ISC_LIST_EMPTY(sock->send_list) &&
2100 			    !sock->pending_send)
2101 				select_poke(sock->manager, sock->fd,
2102 					    SELECT_POKE_WRITE);
2103 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2104 
2105 			socket_log(sock, NULL, EVENT,
2106 				   "socket_send: event %p -> task %p",
2107 				   dev, ntask);
2108 
2109 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2110 				result = ISC_R_INPROGRESS;
2111 			break;
2112 		}
2113 
2114 		/* FALLTHROUGH */
2115 
2116 	case DOIO_HARD:
2117 	case DOIO_SUCCESS:
2118 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2119 			send_senddone_event(sock, &dev);
2120 		break;
2121 	}
2122 
2123 	return (result);
2124 }
2125 
2126 isc_result_t
2127 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2128 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2129 {
2130 	return (isc_socket_sendtov2(sock, buflist, task, action, arg, NULL,
2131 				     NULL, 0));
2132 }
2133 
2134 isc_result_t
2135 isc_socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2136 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2137 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2138 		     unsigned int flags)
2139 {
2140 	isc_socket_t *sock = (isc_socket_t *)sock0;
2141 	isc_socketevent_t *dev;
2142 	unsigned int iocount;
2143 	isc_buffer_t *buffer;
2144 
2145 	REQUIRE(buflist != NULL);
2146 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2147 	REQUIRE(task != NULL);
2148 	REQUIRE(action != NULL);
2149 
2150 	iocount = isc_bufferlist_usedcount(buflist);
2151 	REQUIRE(iocount > 0);
2152 
2153 	dev = allocate_socketevent(sock,
2154 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2155 	if (dev == NULL)
2156 		return (ISC_R_NOMEMORY);
2157 
2158 	/*
2159 	 * Move each buffer from the passed in list to our internal one.
2160 	 */
2161 	buffer = ISC_LIST_HEAD(*buflist);
2162 	while (buffer != NULL) {
2163 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2164 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2165 		buffer = ISC_LIST_HEAD(*buflist);
2166 	}
2167 
2168 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2169 }
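
/*%
 * Illustrative sketch (not part of this file): sending the used
 * region of a buffer 'b' that the caller has already filled, with a
 * hypothetical completion action 'send_done':
 *
 *	isc_bufferlist_t bl;
 *	ISC_LIST_INIT(bl);
 *	ISC_LINK_INIT(b, link);
 *	ISC_LIST_ENQUEUE(bl, b, link);
 *	result = isc_socket_sendv(sock, &bl, task, send_done, arg);
 *
 * isc_socket_sendtov2() additionally lets the caller supply a peer
 * address (for unconnected UDP sockets) and IPv6 pktinfo, plus the
 * ISC_SOCKFLAG_* flags interpreted by socket_send() above.
 */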
2170 
2171 isc_result_t
2172 isc_socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2173 		 unsigned int options) {
2174 	isc_socket_t *sock = (isc_socket_t *)sock0;
2175 	int on = 1;
2176 
2177 	INSIST(!sock->bound);
2178 
2179 	if (sock->pf != sockaddr->type.sa.sa_family) {
2180 		return (ISC_R_FAMILYMISMATCH);
2181 	}
2182 
2183 	/*
2184 	 * Only set SO_REUSEADDR when we want a specific port.
2185 	 */
2186 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2187 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2188 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2189 		       sizeof(on)) < 0) {
2190 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2191 				 "setsockopt(%d) %s", sock->fd, "failed");
2192 		/* Press on... */
2193 	}
2194 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2195 		switch (errno) {
2196 		case EACCES:
2197 			return (ISC_R_NOPERM);
2198 		case EADDRNOTAVAIL:
2199 			return (ISC_R_ADDRNOTAVAIL);
2200 		case EADDRINUSE:
2201 			return (ISC_R_ADDRINUSE);
2202 		case EINVAL:
2203 			return (ISC_R_BOUND);
2204 		default:
2205 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2206 					 strerror(errno));
2207 			return (ISC_R_UNEXPECTED);
2208 		}
2209 	}
2210 
2211 	socket_log(sock, sockaddr, TRACE, "bound");
2212 	sock->bound = 1;
2213 
2214 	return (ISC_R_SUCCESS);
2215 }
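
/*%
 * Illustrative sketch (not part of this file), assuming the usual
 * isc_sockaddr helpers are available: binding to the IPv4 wildcard
 * address on a caller-chosen port, with SO_REUSEADDR requested:
 *
 *	struct in_addr ina;
 *	isc_sockaddr_t local;
 *
 *	ina.s_addr = INADDR_ANY;
 *	isc_sockaddr_fromin(&local, &ina, 53053);
 *	result = isc_socket_bind(sock, &local, ISC_SOCKET_REUSEADDRESS);
 *
 * Binding to port 0 skips SO_REUSEADDR, per the check above.
 */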
2216 
2217 isc_result_t
2218 isc_socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2219 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2220 {
2221 	isc_socket_t *sock = (isc_socket_t *)sock0;
2222 	isc_socket_connev_t *dev;
2223 	isc_task_t *ntask = NULL;
2224 	isc_socketmgr_t *manager;
2225 	int cc;
2226 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2227 
2228 	REQUIRE(addr != NULL);
2229 	REQUIRE(task != NULL);
2230 	REQUIRE(action != NULL);
2231 
2232 	manager = sock->manager;
2234 
2235 	if (isc_sockaddr_ismulticast(addr))
2236 		return (ISC_R_MULTICAST);
2237 
2238 	REQUIRE(!sock->connecting);
2239 
2240 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2241 							ISC_SOCKEVENT_CONNECT,
2242 							action,	arg,
2243 							sizeof(*dev));
2244 	if (dev == NULL) {
2245 		return (ISC_R_NOMEMORY);
2246 	}
2247 	ISC_LINK_INIT(dev, ev_link);
2248 
2249 	/*
2250 	 * Try to do the connect right away, as there can be only one
2251 	 * outstanding, and it might happen to complete.
2252 	 */
2253 	sock->peer_address = *addr;
2254 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2255 	if (cc < 0) {
2256 		/*
2257 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2258 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2259 		 * a success and let the user detect it if it's really an error
2260 		 * at the time of sending a packet on the socket.
2261 		 */
2262 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2263 			cc = 0;
2264 			goto success;
2265 		}
2266 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2267 			goto queue;
2268 
2269 		switch (errno) {
2270 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2271 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2272 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2273 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2274 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2275 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2276 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2277 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2278 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2279 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2280 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2281 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2282 #undef ERROR_MATCH
2283 		}
2284 
2285 		sock->connected = 0;
2286 
2287 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2288 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2289 				 addrbuf, errno, strerror(errno));
2290 
2291 		isc_event_free(ISC_EVENT_PTR(&dev));
2292 		return (ISC_R_UNEXPECTED);
2293 
2294 	err_exit:
2295 		sock->connected = 0;
2296 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2297 
2298 		return (ISC_R_SUCCESS);
2299 	}
2300 
2301 	/*
2302 	 * If connect completed, fire off the done event.
2303 	 */
2304  success:
2305 	if (cc == 0) {
2306 		sock->connected = 1;
2307 		sock->bound = 1;
2308 		dev->result = ISC_R_SUCCESS;
2309 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2310 
2311 		return (ISC_R_SUCCESS);
2312 	}
2313 
2314  queue:
2315 
2316 	/*
2317 	 * Attach to task.
2318 	 */
2319 	isc_task_attach(task, &ntask);
2320 
2321 	sock->connecting = 1;
2322 
2323 	dev->ev_sender = ntask;
2324 
2325 	/*
2326 	 * Poke the watcher here.  We still have the socket locked, so
2327 	 * there is no race condition.  We hold the lock for such a short
2328 	 * time that waking it up now or later won't matter all that much.
2329 	 */
2330 	if (sock->connect_ev == NULL)
2331 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2332 
2333 	sock->connect_ev = dev;
2334 
2335 	return (ISC_R_SUCCESS);
2336 }
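
/*%
 * Illustrative sketch (not part of this file): starting a TCP connect
 * and handling the ISC_SOCKEVENT_CONNECT completion; 'connect_done'
 * and 'start_query' are hypothetical:
 *
 *	static void
 *	connect_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socket_connev_t *cev = (isc_socket_connev_t *)event;
 *
 *		UNUSED(task);
 *		if (cev->result == ISC_R_SUCCESS)
 *			start_query(event->ev_arg);
 *		isc_event_free(&event);
 *	}
 *
 *	result = isc_socket_connect(sock, &peeraddr, task,
 *				    connect_done, arg);
 *
 * A UDP "connect" may complete immediately (the success path above),
 * while a TCP connect normally queues and is finished later by
 * internal_connect() below.
 */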
2337 
2338 /*
2339  * Called when a socket with a pending connect() finishes.
2340  */
2341 static void
2342 internal_connect(isc_task_t *me, isc_event_t *ev) {
2343 	isc_socket_t *sock;
2344 	isc_socket_connev_t *dev;
2345 	isc_task_t *task;
2346 	int cc;
2347 	socklen_t optlen;
2348 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2349 
2350 	UNUSED(me);
2351 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2352 
2353 	sock = ev->ev_sender;
2354 
2355 	/*
2356 	 * When the internal event was sent the reference count was bumped
2357 	 * to keep the socket around for us.  Decrement the count here.
2358 	 */
2359 	INSIST(sock->references > 0);
2360 	sock->references--;
2361 	if (sock->references == 0) {
2362 		destroy(&sock);
2363 		return;
2364 	}
2365 
2366 	/*
2367 	 * Has this event been canceled?
2368 	 */
2369 	dev = sock->connect_ev;
2370 	if (dev == NULL) {
2371 		INSIST(!sock->connecting);
2372 		return;
2373 	}
2374 
2375 	INSIST(sock->connecting);
2376 	sock->connecting = 0;
2377 
2378 	/*
2379 	 * Retrieve any deferred connect(2) error status via SO_ERROR.
2380 	 */
2381 	optlen = sizeof(cc);
2382 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2383 		       (void *)&cc, (void *)&optlen) < 0)
2384 		cc = errno;
2385 	else
2386 		errno = cc;
2387 
2388 	if (errno != 0) {
2389 		/*
2390 		 * If the error is EAGAIN, just re-select on this
2391 		 * fd and pretend nothing strange happened.
2392 		 */
2393 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2394 			sock->connecting = 1;
2395 			select_poke(sock->manager, sock->fd,
2396 				    SELECT_POKE_CONNECT);
2397 			return;
2398 		}
2399 
2400 		/*
2401 		 * Translate other errors into ISC_R_* flavors.
2402 		 */
2403 		switch (errno) {
2404 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2405 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2406 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2407 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2408 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2409 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2410 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2411 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2412 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2413 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2414 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2415 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2416 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2417 #undef ERROR_MATCH
2418 		default:
2419 			dev->result = ISC_R_UNEXPECTED;
2420 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2421 					    sizeof(peerbuf));
2422 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2423 					 "internal_connect: connect(%s) %s",
2424 					 peerbuf, strerror(errno));
2425 		}
2426 	} else {
2427 		dev->result = ISC_R_SUCCESS;
2428 		sock->connected = 1;
2429 		sock->bound = 1;
2430 	}
2431 
2432 	sock->connect_ev = NULL;
2433 
2434 	task = dev->ev_sender;
2435 	dev->ev_sender = sock;
2436 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2437 }
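
/*%
 * The SO_ERROR dance above is the standard non-blocking connect(2)
 * pattern.  A minimal standalone sketch of the same idea in plain
 * sockets code (illustrative only; 'fd', 'connected', 'still_waiting'
 * and 'failed' are hypothetical):
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0)
 *		err = errno;
 *	if (err == 0)
 *		connected = 1;
 *	else if (err == EINPROGRESS || err == EINTR ||
 *		 err == EAGAIN || err == EWOULDBLOCK)
 *		still_waiting = 1;
 *	else
 *		failed = err;
 */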
2438 
2439 /*
2440  * Run through the list of events on this socket, and cancel the ones
2441  * queued for task "task" of type "how".  "how" is a bitmask.
2442  */
2443 void
2444 isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2445 	isc_socket_t *sock = (isc_socket_t *)sock0;
2446 
2447 	/*
2448 	 * Quick exit if there is nothing to do.  Don't even bother locking
2449 	 * in this case.
2450 	 */
2451 	if (how == 0)
2452 		return;
2453 
2454 	/*
2455 	 * All of these do the same thing, more or less.
2456 	 * Each will:
2457 	 *	o If the internal event is marked as "posted" try to
2458 	 *	  remove it from the task's queue.  If this fails, mark it
2459 	 *	  as canceled instead, and let the task clean it up later.
2460 	 *	o For each I/O request for that task of that type, post
2461 	 *	  its done event with status of "ISC_R_CANCELED".
2462 	 *	o Reset any state needed.
2463 	 */
2464 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2465 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2466 		isc_socketevent_t      *dev;
2467 		isc_socketevent_t      *next;
2468 		isc_task_t	       *current_task;
2469 
2470 		dev = ISC_LIST_HEAD(sock->recv_list);
2471 
2472 		while (dev != NULL) {
2473 			current_task = dev->ev_sender;
2474 			next = ISC_LIST_NEXT(dev, ev_link);
2475 
2476 			if ((task == NULL) || (task == current_task)) {
2477 				dev->result = ISC_R_CANCELED;
2478 				send_recvdone_event(sock, &dev);
2479 			}
2480 			dev = next;
2481 		}
2482 	}
2483 
2484 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2485 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2486 		isc_socketevent_t      *dev;
2487 		isc_socketevent_t      *next;
2488 		isc_task_t	       *current_task;
2489 
2490 		dev = ISC_LIST_HEAD(sock->send_list);
2491 
2492 		while (dev != NULL) {
2493 			current_task = dev->ev_sender;
2494 			next = ISC_LIST_NEXT(dev, ev_link);
2495 
2496 			if ((task == NULL) || (task == current_task)) {
2497 				dev->result = ISC_R_CANCELED;
2498 				send_senddone_event(sock, &dev);
2499 			}
2500 			dev = next;
2501 		}
2502 	}
2503 
2504 	/*
2505 	 * Connect requests are not kept on a list: at most one can be pending.
2506 	 */
2507 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2508 	    && sock->connect_ev != NULL) {
2509 		isc_socket_connev_t    *dev;
2510 		isc_task_t	       *current_task;
2511 
2512 		INSIST(sock->connecting);
2513 		sock->connecting = 0;
2514 
2515 		dev = sock->connect_ev;
2516 		current_task = dev->ev_sender;
2517 
2518 		if ((task == NULL) || (task == current_task)) {
2519 			sock->connect_ev = NULL;
2520 
2521 			dev->result = ISC_R_CANCELED;
2522 			dev->ev_sender = sock;
2523 			isc_task_sendanddetach(&current_task,
2524 					       ISC_EVENT_PTR(&dev));
2525 		}
2526 	}
2527 
2528 }
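
/*%
 * Illustrative sketch (not part of this file): before detaching a
 * socket at shutdown, a caller can flush every pending request so the
 * owning tasks see ISC_R_CANCELED completion events:
 *
 *	isc_socket_cancel(sock, NULL, ISC_SOCKCANCEL_RECV |
 *			  ISC_SOCKCANCEL_SEND | ISC_SOCKCANCEL_CONNECT);
 *
 * Passing NULL for 'task' cancels matching requests for every task;
 * a non-NULL task restricts cancellation to that task's requests.
 */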
2529 
2530 /*
2531  * In our assumed scenario, we can simply use a single static object.
2532  * XXX: this is not true if the application uses multiple threads with
2533  *      'multi-context' mode.  Fixing this is a future TODO item.
2534  */
2535 static isc_socketwait_t swait_private;
2536 
2537 int
2538 isc_socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2539 			  isc_socketwait_t **swaitp)
2540 {
2541 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2542 	int n;
2543 
2544 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2545 
2546 	if (manager == NULL)
2547 		manager = socketmgr;
2548 	if (manager == NULL)
2549 		return (0);
2550 
2551 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2552 	memmove(manager->write_fds_copy, manager->write_fds,
2553 		manager->fd_bufsize);
2554 
2555 	swait_private.readset = manager->read_fds_copy;
2556 	swait_private.writeset = manager->write_fds_copy;
2557 	swait_private.maxfd = manager->maxfd + 1;
2558 
2559 	n = select(swait_private.maxfd, swait_private.readset,
2560 		   swait_private.writeset, NULL, tvp);
2561 
2562 	*swaitp = &swait_private;
2563 	return (n);
2564 }
2565 
2566 isc_result_t
2567 isc_socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2568 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2569 
2570 	REQUIRE(swait == &swait_private);
2571 
2572 	if (manager == NULL)
2573 		manager = socketmgr;
2574 	if (manager == NULL)
2575 		return (ISC_R_NOTFOUND);
2576 
2577 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2578 	return (ISC_R_SUCCESS);
2579 }
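
/*%
 * Illustrative sketch (not part of this file) of the single-threaded
 * event loop these two routines are designed for; 'done' is a
 * hypothetical termination flag:
 *
 *	for (;;) {
 *		isc_socketwait_t *swait = NULL;
 *		struct timeval tv = { 1, 0 };
 *		int n;
 *
 *		if (done)
 *			break;
 *		n = isc_socketmgr_waitevents(NULL, &tv, &swait);
 *		if (n > 0)
 *			(void)isc_socketmgr_dispatch(NULL, swait);
 *		(void)isc_taskmgr_dispatch(NULL);
 *	}
 *
 * Passing NULL for the manager uses the shared 'socketmgr' singleton,
 * as both functions above do.
 */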
2580