xref: /openbsd/usr.bin/dig/lib/isc/unix/socket.c (revision b1a294b5)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/region.h>
38 #include <isc/socket.h>
39 #include <isc/task.h>
40 #include <isc/util.h>
41 
42 #include "errno2result.h"
43 
44 #include "socket_p.h"
45 #include "../task_p.h"
46 
/*
 * Descriptor sets and counters bundled together for one wait cycle.
 * NOTE(review): presumably filled in by the select() wrapper and consumed
 * by the dispatch step; neither is visible in this part of the file.
 */
struct isc_socketwait {
	fd_set	*readset;	/* descriptors to examine for readability */
	fd_set	*writeset;	/* descriptors to examine for writability */
	int	 nfds;		/* presumably the select() result - confirm */
	int	 maxfd;		/* presumably the highest descriptor in the sets */
};
53 
54 /*
55  * Set by the -T dscp option on the command line. If set to a value
56  * other than -1, we check to make sure DSCP values match it, and
57  * assert if not.
58  */
59 int isc_dscp_check_value = -1;
60 
61 /*%
62  * Some systems define the socket length argument as an int, some as size_t,
63  * some as socklen_t.  This is here so it can be easily changed if needed.
64  */
65 
/*%
 * "Soft" errors: non-fatal errno values returned by network calls such as
 * recvmsg() and sendmsg(), after which the operation is simply retried later.
 *
 * Zero is included on purpose: BSDI (and perhaps other systems) has been
 * seen returning <0 from recv() while leaving errno == 0.  That is broken,
 * but it has to be tolerated here.
 *
 * The argument is evaluated more than once; pass a plain value.
 */
#define SOFT_ERROR(e) \
	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == EINTR || (e) == 0)
78 
79 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
80 
81 /*!<
82  * DLVL(90)  --  Function entry/exit and other tracing.
83  * DLVL(60)  --  Socket data send/receive
84  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
85  * DLVL(20)  --  Socket creation/destruction.
86  */
87 #define TRACE_LEVEL		90
88 #define IOEVENT_LEVEL		60
89 #define EVENT_LEVEL		50
90 #define CREATION_LEVEL		20
91 
92 #define TRACE		DLVL(TRACE_LEVEL)
93 #define IOEVENT		DLVL(IOEVENT_LEVEL)
94 #define EVENT		DLVL(EVENT_LEVEL)
95 #define CREATION	DLVL(CREATION_LEVEL)
96 
97 typedef isc_event_t intev_t;
98 
99 /*!
100  * IPv6 control information.  If the socket is an IPv6 socket we want
101  * to collect the destination address and interface so the client can
102  * set them on outgoing packets.
103  */
104 
105 /*%
106  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
107  * a setsockopt() like interface to request timestamps, and if the OS
108  * doesn't do it for us, call gettimeofday() on every UDP receive?
109  */
110 
111 /*%
112  * Instead of computing the control-message buffer lengths on every call,
113  * take a rule-of-thumb approach: the per-item sizes are taken from x86_64
114  * Linux and their sum is doubled (plus one byte), so everything should fit.
115  * The resulting buffers are small enough not to cause any concern.
116  */
/* Allowance for an IPV6_PKTINFO item (presumably CMSG_SPACE of in6_pktinfo - TODO confirm). */
117 #define CMSG_SP_IN6PKT 40
118 
/* Allowance for an SCM_TIMESTAMP item (presumably CMSG_SPACE of a timeval - TODO confirm). */
119 #define CMSG_SP_TIMESTAMP 32
120 
/* Allowance for an IPV6_TCLASS / IP_TOS item. */
121 #define CMSG_SP_TCTOS 24
122 
/* Allowance for a single int payload (e.g. IPV6_USE_MIN_MTU). */
123 #define CMSG_SP_INT 24
124 
/* Size of the ancillary-data buffer handed to recvmsg() by doio_recv(). */
125 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
/* Size of the ancillary-data buffer for sendmsg(); build_msghdr_send() INSISTs it is never exceeded. */
126 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
127 
128 /*%
129  * The number of times a send operation is repeated if the result is EINTR.
130  */
131 #define NRETRIES 10
132 
133 struct isc_socket {
134 	/* Not locked. */
135 	isc_socketmgr_t	*manager;
136 	isc_sockettype_t	type;
137 
138 	/* Locked by socket lock. */
139 	ISC_LINK(isc_socket_t)	link;
140 	unsigned int		references;
141 	int			fd;
142 	int			pf;
143 
144 	ISC_LIST(isc_socketevent_t)		send_list;
145 	ISC_LIST(isc_socketevent_t)		recv_list;
146 	isc_socket_connev_t		       *connect_ev;
147 
148 	/*
149 	 * Internal events.  Posted when a descriptor is readable or
150 	 * writable.  These are statically allocated and never freed.
151 	 * They will be set to non-purgable before use.
152 	 */
153 	intev_t			readable_ev;
154 	intev_t			writable_ev;
155 
156 	struct sockaddr_storage		peer_address;       /* remote address */
157 
158 	unsigned int		pending_recv : 1,
159 				pending_send : 1,
160 				connected : 1,
161 				connecting : 1,     /* connect pending */
162 				bound : 1,          /* bound to local addr */
163 				active : 1,         /* currently active */
164 				pktdscp : 1;	    /* per packet dscp */
165 	unsigned int		dscp;
166 };
167 
168 struct isc_socketmgr {
169 	/* Not locked. */
170 	int			fd_bufsize;
171 	unsigned int		maxsocks;
172 
173 	isc_socket_t	       **fds;
174 	int			*fdstate;
175 
176 	/* Locked by manager lock. */
177 	ISC_LIST(isc_socket_t)	socklist;
178 	fd_set			*read_fds;
179 	fd_set			*read_fds_copy;
180 	fd_set			*write_fds;
181 	fd_set			*write_fds_copy;
182 	int			maxfd;
183 	unsigned int		refs;
184 };
185 
186 static isc_socketmgr_t *socketmgr = NULL;
187 
/*
 * Values stored in isc_socketmgr.fdstate[fd].  wakeup_socket() only starts
 * watching descriptors that are MANAGED, and turns CLOSE_PENDING into CLOSED
 * (closing the descriptor) when it receives SELECT_POKE_CLOSE.
 */
188 #define CLOSED			0	/* this one must be zero */
189 #define MANAGED			1	/* descriptor is owned by the manager */
190 #define CLOSE_PENDING		2	/* close requested, not yet performed */
191 
192 /*
193  * send() and recv() iovec counts
194  */
195 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
196 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
197 
198 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
199 				  isc_sockettype_t type,
200 				  isc_socket_t **socketp);
201 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
202 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
203 static void free_socket(isc_socket_t **);
204 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
205 				    isc_socket_t **);
206 static void destroy(isc_socket_t **);
207 static void internal_connect(isc_task_t *, isc_event_t *);
208 static void internal_recv(isc_task_t *, isc_event_t *);
209 static void internal_send(isc_task_t *, isc_event_t *);
210 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
211 static void build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *,
212 			      struct msghdr *, struct iovec *, size_t *);
213 static void build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *,
214 			      struct msghdr *, struct iovec *, size_t *);
215 
/*
 * Message codes passed as 'msg' to select_poke() / wakeup_socket().
 * SHUTDOWN is ignored by select_poke(); READ and WRITE select which fd_set
 * the descriptor is added to; CLOSE completes a pending close.
 */
216 #define SELECT_POKE_SHUTDOWN		(-1)
217 #define SELECT_POKE_READ		(-3)
218 #define SELECT_POKE_WRITE		(-4)
219 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
220 #define SELECT_POKE_CLOSE		(-5)
221 
/* True when the socket has no remaining references. */
222 #define SOCK_DEAD(s)			((s)->references == 0)
223 
/*%
 * Shortcut indexes used to reach the statistics counters.
 * Enumerators are numbered consecutively starting at zero.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL,
	STATID_CLOSE,
	STATID_BINDFAIL,
	STATID_CONNECTFAIL,
	STATID_CONNECT,
	STATID_ACCEPTFAIL,
	STATID_ACCEPT,
	STATID_SENDFAIL,
	STATID_RECVFAIL,
	STATID_ACTIVE
};
240 
241 static void
242 socket_log(isc_socket_t *sock, struct sockaddr_storage *address,
243 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
244 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
245 static void
socket_log(isc_socket_t * sock,struct sockaddr_storage * address,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)246 socket_log(isc_socket_t *sock, struct sockaddr_storage *address,
247 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
248 	   const char *fmt, ...)
249 {
250 	char msgbuf[2048];
251 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
252 	va_list ap;
253 
254 	if (! isc_log_wouldlog(isc_lctx, level))
255 		return;
256 
257 	va_start(ap, fmt);
258 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
259 	va_end(ap);
260 
261 	if (address == NULL) {
262 		isc_log_write(isc_lctx, category, module, level,
263 			       "socket %p: %s", sock, msgbuf);
264 	} else {
265 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
266 		isc_log_write(isc_lctx, category, module, level,
267 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
268 	}
269 }
270 
271 static inline isc_result_t
watch_fd(isc_socketmgr_t * manager,int fd,int msg)272 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
273 	isc_result_t result = ISC_R_SUCCESS;
274 
275 	if (msg == SELECT_POKE_READ)
276 		FD_SET(fd, manager->read_fds);
277 	if (msg == SELECT_POKE_WRITE)
278 		FD_SET(fd, manager->write_fds);
279 
280 	return (result);
281 }
282 
283 static inline isc_result_t
unwatch_fd(isc_socketmgr_t * manager,int fd,int msg)284 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
285 	isc_result_t result = ISC_R_SUCCESS;
286 
287 	if (msg == SELECT_POKE_READ)
288 		FD_CLR(fd, manager->read_fds);
289 	else if (msg == SELECT_POKE_WRITE)
290 		FD_CLR(fd, manager->write_fds);
291 
292 	return (result);
293 }
294 
295 static void
wakeup_socket(isc_socketmgr_t * manager,int fd,int msg)296 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
297 	isc_result_t result;
298 
299 	/*
300 	 * This is a wakeup on a socket.  If the socket is not in the
301 	 * process of being closed, start watching it for either reads
302 	 * or writes.
303 	 */
304 
305 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
306 
307 	if (msg == SELECT_POKE_CLOSE) {
308 		/* No one should be updating fdstate, so no need to lock it */
309 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
310 		manager->fdstate[fd] = CLOSED;
311 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
312 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
313 		(void)close(fd);
314 		return;
315 	}
316 
317 	if (manager->fdstate[fd] == CLOSE_PENDING) {
318 
319 		/*
320 		 * We accept (and ignore) any error from unwatch_fd() as we are
321 		 * closing the socket, hoping it doesn't leave dangling state in
322 		 * the kernel.
323 		 */
324 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
325 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
326 		return;
327 	}
328 	if (manager->fdstate[fd] != MANAGED) {
329 		return;
330 	}
331 
332 	/*
333 	 * Set requested bit.
334 	 */
335 	result = watch_fd(manager, fd, msg);
336 	if (result != ISC_R_SUCCESS) {
337 		/*
338 		 * XXXJT: what should we do?  Ignoring the failure of watching
339 		 * a socket will make the application dysfunctional, but there
340 		 * seems to be no reasonable recovery process.
341 		 */
342 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
343 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
344 			      "failed to start watching FD (%d): %s",
345 			      fd, isc_result_totext(result));
346 	}
347 }
348 
349 /*
350  * Update the state of the socketmgr when something changes.
351  */
352 static void
select_poke(isc_socketmgr_t * manager,int fd,int msg)353 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
354 	if (msg == SELECT_POKE_SHUTDOWN)
355 		return;
356 	else if (fd >= 0)
357 		wakeup_socket(manager, fd, msg);
358 	return;
359 }
360 
361 /*
362  * Make a fd non-blocking.
363  */
364 static isc_result_t
make_nonblock(int fd)365 make_nonblock(int fd) {
366 	int ret;
367 	int flags;
368 
369 	flags = fcntl(fd, F_GETFL, 0);
370 	flags |= O_NONBLOCK;
371 	ret = fcntl(fd, F_SETFL, flags);
372 
373 	if (ret == -1) {
374 		UNEXPECTED_ERROR(__FILE__, __LINE__,
375 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
376 				 strerror(errno));
377 		return (ISC_R_UNEXPECTED);
378 	}
379 
380 	return (ISC_R_SUCCESS);
381 }
382 
/*
 * Thin wrappers around CMSG_LEN() and CMSG_SPACE(), kept as functions so
 * that a platform lacking those macros only needs a change in one place.
 */

/*
 * Return the value to store in cmsg_len for a payload of 'len' bytes.
 */
static inline socklen_t
cmsg_len(socklen_t len) {
	const socklen_t total = CMSG_LEN(len);

	return (total);
}
394 
/*
 * Return the number of buffer bytes an ancillary item with a payload of
 * 'len' bytes occupies, padding included.
 */
static inline socklen_t
cmsg_space(socklen_t len) {
	const socklen_t total = CMSG_SPACE(len);

	return (total);
}
399 
400 /*
401  * Process control messages received on a socket.
402  */
403 static void
process_cmsg(isc_socket_t * sock,struct msghdr * msg,isc_socketevent_t * dev)404 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
405 	struct cmsghdr *cmsgp;
406 	struct in6_pktinfo *pktinfop;
407 	void *timevalp;
408 
409 	/*
410 	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
411 	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
412 	 * They are all here, outside of the CPP tests, because it is
413 	 * more consistent with the usual ISC coding style.
414 	 */
415 	UNUSED(sock);
416 	UNUSED(msg);
417 	UNUSED(dev);
418 
419 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
420 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
421 
422 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
423 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
424 
425 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
426 		return;
427 
428 	timevalp = NULL;
429 	pktinfop = NULL;
430 
431 	cmsgp = CMSG_FIRSTHDR(msg);
432 	while (cmsgp != NULL) {
433 		socket_log(sock, NULL, TRACE,
434 			   "processing cmsg %p", cmsgp);
435 
436 		if (cmsgp->cmsg_level == IPPROTO_IPV6
437 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
438 
439 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
440 			memmove(&dev->pktinfo, pktinfop,
441 				sizeof(struct in6_pktinfo));
442 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
443 			socket_log(sock, NULL, TRACE,
444 				   "interface received on ifindex %u",
445 				   dev->pktinfo.ipi6_ifindex);
446 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
447 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
448 			goto next;
449 		}
450 
451 		if (cmsgp->cmsg_level == SOL_SOCKET
452 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
453 			struct timeval tv;
454 			timevalp = CMSG_DATA(cmsgp);
455 			memmove(&tv, timevalp, sizeof(tv));
456 			TIMEVAL_TO_TIMESPEC(&tv, &dev->timestamp);
457 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
458 			goto next;
459 		}
460 
461 		if (cmsgp->cmsg_level == IPPROTO_IPV6
462 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
463 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
464 			dev->dscp >>= 2;
465 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
466 			goto next;
467 		}
468 
469 		if (cmsgp->cmsg_level == IPPROTO_IP
470 		    && (cmsgp->cmsg_type == IP_TOS)) {
471 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
472 			dev->dscp >>= 2;
473 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
474 			goto next;
475 		}
476 	next:
477 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
478 	}
479 
480 }
481 
482 /*
483  * Construct an iov array and attach it to the msghdr passed in.  This is
484  * the SEND constructor, which will use the used region of the buffer
485  * (if using a buffer list) or will use the internal region (if a single
486  * buffer I/O is requested).
487  *
488  * Nothing can be NULL, and the done event must list at least one buffer
489  * on the buffer linked list for this function to be meaningful.
490  *
491  * If write_countp != NULL, *write_countp will hold the number of bytes
492  * this transaction can send.
493  */
494 static void
build_msghdr_send(isc_socket_t * sock,char * cmsgbuf,isc_socketevent_t * dev,struct msghdr * msg,struct iovec * iov,size_t * write_countp)495 build_msghdr_send(isc_socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
496 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
497 {
498 	unsigned int iovcount;
499 	isc_buffer_t *buffer;
500 	isc_region_t used;
501 	size_t write_count;
502 	size_t skip_count;
503 	struct cmsghdr *cmsgp;
504 
505 	memset(msg, 0, sizeof(*msg));
506 
507 	if (!sock->connected) {
508 		msg->msg_name = (void *)&dev->address;
509 		msg->msg_namelen = dev->address.ss_len;
510 	} else {
511 		msg->msg_name = NULL;
512 		msg->msg_namelen = 0;
513 	}
514 
515 	buffer = ISC_LIST_HEAD(dev->bufferlist);
516 	write_count = 0;
517 	iovcount = 0;
518 
519 	/*
520 	 * Single buffer I/O?  Skip what we've done so far in this region.
521 	 */
522 	if (buffer == NULL) {
523 		write_count = dev->region.length - dev->n;
524 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
525 		iov[0].iov_len = write_count;
526 		iovcount = 1;
527 
528 		goto config;
529 	}
530 
531 	/*
532 	 * Multibuffer I/O.
533 	 * Skip the data in the buffer list that we have already written.
534 	 */
535 	skip_count = dev->n;
536 	while (buffer != NULL) {
537 		if (skip_count < isc_buffer_usedlength(buffer))
538 			break;
539 		skip_count -= isc_buffer_usedlength(buffer);
540 		buffer = ISC_LIST_NEXT(buffer, link);
541 	}
542 
543 	while (buffer != NULL) {
544 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
545 
546 		isc_buffer_usedregion(buffer, &used);
547 
548 		if (used.length > 0) {
549 			iov[iovcount].iov_base = (void *)(used.base
550 							  + skip_count);
551 			iov[iovcount].iov_len = used.length - skip_count;
552 			write_count += (used.length - skip_count);
553 			skip_count = 0;
554 			iovcount++;
555 		}
556 		buffer = ISC_LIST_NEXT(buffer, link);
557 	}
558 
559 	INSIST(skip_count == 0U);
560 
561  config:
562 	msg->msg_iov = iov;
563 	msg->msg_iovlen = iovcount;
564 
565 	msg->msg_control = NULL;
566 	msg->msg_controllen = 0;
567 	msg->msg_flags = 0;
568 
569 	if ((sock->type == isc_sockettype_udp) &&
570 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
571 	{
572 		struct in6_pktinfo *pktinfop;
573 
574 		socket_log(sock, NULL, TRACE,
575 			   "sendto pktinfo data, ifindex %u",
576 			   dev->pktinfo.ipi6_ifindex);
577 
578 		msg->msg_control = (void *)cmsgbuf;
579 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
580 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
581 
582 		cmsgp = (struct cmsghdr *)cmsgbuf;
583 		cmsgp->cmsg_level = IPPROTO_IPV6;
584 		cmsgp->cmsg_type = IPV6_PKTINFO;
585 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
586 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
587 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
588 	}
589 
590 	if ((sock->type == isc_sockettype_udp) &&
591 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
592 	{
593 		int use_min_mtu = 1;	/* -1, 0, 1 */
594 
595 		cmsgp = (struct cmsghdr *)(cmsgbuf +
596 					   msg->msg_controllen);
597 
598 		msg->msg_control = (void *)cmsgbuf;
599 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
600 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
601 
602 		cmsgp->cmsg_level = IPPROTO_IPV6;
603 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
604 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
605 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
606 	}
607 
608 	if (isc_dscp_check_value > -1) {
609 		if (sock->type == isc_sockettype_udp)
610 			INSIST((int)dev->dscp == isc_dscp_check_value);
611 		else if (sock->type == isc_sockettype_tcp)
612 			INSIST((int)sock->dscp == isc_dscp_check_value);
613 	}
614 
615 	if ((sock->type == isc_sockettype_udp) &&
616 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
617 	{
618 		int dscp = (dev->dscp << 2) & 0xff;
619 
620 		INSIST(dev->dscp < 0x40);
621 
622 		if (sock->pf == AF_INET && sock->pktdscp) {
623 			cmsgp = (struct cmsghdr *)(cmsgbuf +
624 						   msg->msg_controllen);
625 			msg->msg_control = (void *)cmsgbuf;
626 			msg->msg_controllen += cmsg_space(sizeof(dscp));
627 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
628 
629 			cmsgp->cmsg_level = IPPROTO_IP;
630 			cmsgp->cmsg_type = IP_TOS;
631 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
632 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
633 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
634 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
635 			       (void *)&dscp, sizeof(int)) < 0)
636 			{
637 				UNEXPECTED_ERROR(__FILE__, __LINE__,
638 						 "setsockopt(%d, IP_TOS, %.02x)"
639 						 " %s: %s",
640 						 sock->fd, dscp >> 2,
641 						 "failed", strerror(errno));
642 			} else
643 				sock->dscp = dscp;
644 		}
645 
646 		if (sock->pf == AF_INET6 && sock->pktdscp) {
647 			cmsgp = (struct cmsghdr *)(cmsgbuf +
648 						   msg->msg_controllen);
649 			msg->msg_control = (void *)cmsgbuf;
650 			msg->msg_controllen += cmsg_space(sizeof(dscp));
651 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
652 
653 			cmsgp->cmsg_level = IPPROTO_IPV6;
654 			cmsgp->cmsg_type = IPV6_TCLASS;
655 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
656 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
657 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
658 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
659 				       (void *)&dscp, sizeof(int)) < 0) {
660 				UNEXPECTED_ERROR(__FILE__, __LINE__,
661 						 "setsockopt(%d, IPV6_TCLASS, "
662 						 "%.02x) %s: %s",
663 						 sock->fd, dscp >> 2,
664 						 "failed", strerror(errno));
665 			} else
666 				sock->dscp = dscp;
667 		}
668 
669 		if (msg->msg_controllen != 0 &&
670 		    msg->msg_controllen < SENDCMSGBUFLEN)
671 		{
672 			memset(cmsgbuf + msg->msg_controllen, 0,
673 			       SENDCMSGBUFLEN - msg->msg_controllen);
674 		}
675 	}
676 
677 	if (write_countp != NULL)
678 		*write_countp = write_count;
679 }
680 
681 /*
682  * Construct an iov array and attach it to the msghdr passed in.  This is
683  * the RECV constructor, which will use the available region of the buffer
684  * (if using a buffer list) or will use the internal region (if a single
685  * buffer I/O is requested).
686  *
687  * Nothing can be NULL, and the done event must list at least one buffer
688  * on the buffer linked list for this function to be meaningful.
689  *
690  * If read_countp != NULL, *read_countp will hold the number of bytes
691  * this transaction can receive.
692  */
693 static void
build_msghdr_recv(isc_socket_t * sock,char * cmsgbuf,isc_socketevent_t * dev,struct msghdr * msg,struct iovec * iov,size_t * read_countp)694 build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
695 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
696 {
697 	unsigned int iovcount;
698 	isc_buffer_t *buffer;
699 	isc_region_t available;
700 	size_t read_count;
701 
702 	memset(msg, 0, sizeof(struct msghdr));
703 
704 	if (sock->type == isc_sockettype_udp) {
705 		memset(&dev->address, 0, sizeof(dev->address));
706 		msg->msg_name = (void *)&dev->address;
707 		msg->msg_namelen = sizeof(dev->address);
708 	} else { /* TCP */
709 		msg->msg_name = NULL;
710 		msg->msg_namelen = 0;
711 		dev->address = sock->peer_address;
712 	}
713 
714 	buffer = ISC_LIST_HEAD(dev->bufferlist);
715 	read_count = 0;
716 
717 	/*
718 	 * Single buffer I/O?  Skip what we've done so far in this region.
719 	 */
720 	if (buffer == NULL) {
721 		read_count = dev->region.length - dev->n;
722 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
723 		iov[0].iov_len = read_count;
724 		iovcount = 1;
725 
726 		goto config;
727 	}
728 
729 	/*
730 	 * Multibuffer I/O.
731 	 * Skip empty buffers.
732 	 */
733 	while (buffer != NULL) {
734 		if (isc_buffer_availablelength(buffer) != 0)
735 			break;
736 		buffer = ISC_LIST_NEXT(buffer, link);
737 	}
738 
739 	iovcount = 0;
740 	while (buffer != NULL) {
741 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
742 
743 		isc_buffer_availableregion(buffer, &available);
744 
745 		if (available.length > 0) {
746 			iov[iovcount].iov_base = (void *)(available.base);
747 			iov[iovcount].iov_len = available.length;
748 			read_count += available.length;
749 			iovcount++;
750 		}
751 		buffer = ISC_LIST_NEXT(buffer, link);
752 	}
753 
754  config:
755 
756 	/*
757 	 * If needed, set up to receive that one extra byte.
758 	 */
759 	msg->msg_iov = iov;
760 	msg->msg_iovlen = iovcount;
761 
762 	msg->msg_control = cmsgbuf;
763 	msg->msg_controllen = RECVCMSGBUFLEN;
764 	msg->msg_flags = 0;
765 
766 	if (read_countp != NULL)
767 		*read_countp = read_count;
768 }
769 
770 static void
set_dev_address(struct sockaddr_storage * address,isc_socket_t * sock,isc_socketevent_t * dev)771 set_dev_address(struct sockaddr_storage *address, isc_socket_t *sock,
772 		isc_socketevent_t *dev)
773 {
774 	if (sock->type == isc_sockettype_udp) {
775 		if (address != NULL)
776 			dev->address = *address;
777 		else
778 			dev->address = sock->peer_address;
779 	} else if (sock->type == isc_sockettype_tcp) {
780 		INSIST(address == NULL);
781 		dev->address = sock->peer_address;
782 	}
783 }
784 
785 static void
destroy_socketevent(isc_event_t * event)786 destroy_socketevent(isc_event_t *event) {
787 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
788 
789 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
790 
791 	(ev->destroy)(event);
792 }
793 
794 static isc_socketevent_t *
allocate_socketevent(void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)795 allocate_socketevent(void *sender,
796 		     isc_eventtype_t eventtype, isc_taskaction_t action,
797 		     void *arg)
798 {
799 	isc_socketevent_t *ev;
800 
801 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
802 						     eventtype, action, arg,
803 						     sizeof(*ev));
804 
805 	if (ev == NULL)
806 		return (NULL);
807 
808 	ev->result = ISC_R_UNSET;
809 	ISC_LINK_INIT(ev, ev_link);
810 	ISC_LIST_INIT(ev->bufferlist);
811 	ev->region.base = NULL;
812 	ev->n = 0;
813 	ev->offset = 0;
814 	ev->attributes = 0;
815 	ev->destroy = ev->ev_destroy;
816 	ev->ev_destroy = destroy_socketevent;
817 	ev->dscp = 0;
818 
819 	return (ev);
820 }
821 
/*
 * Return codes of doio_recv() and doio_send().
 */
822 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
823 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
824 #define DOIO_HARD		2	/* i/o error, event sent */
825 #define DOIO_EOF		3	/* EOF, no event sent */
826 
827 static int
doio_recv(isc_socket_t * sock,isc_socketevent_t * dev)828 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
829 	int cc;
830 	struct iovec iov[MAXSCATTERGATHER_RECV];
831 	size_t read_count;
832 	size_t actual_count;
833 	struct msghdr msghdr;
834 	isc_buffer_t *buffer;
835 	int recv_errno;
836 	union {
837 		struct msghdr msghdr;
838 		char m[RECVCMSGBUFLEN];
839 	} cmsgbuf;
840 
841 	memset(&cmsgbuf, 0, sizeof(cmsgbuf));
842 
843 	build_msghdr_recv(sock, cmsgbuf.m, dev, &msghdr, iov, &read_count);
844 
845 	cc = recvmsg(sock->fd, &msghdr, 0);
846 	recv_errno = errno;
847 
848 	if (cc < 0) {
849 		if (SOFT_ERROR(recv_errno))
850 			return (DOIO_SOFT);
851 
852 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
853 			socket_log(sock, NULL, IOEVENT,
854 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
855 				   sock->fd, cc, recv_errno,
856 				   strerror(recv_errno));
857 		}
858 
859 #define SOFT_OR_HARD(_system, _isc) \
860 	if (recv_errno == _system) { \
861 		if (sock->connected) { \
862 			dev->result = _isc; \
863 			return (DOIO_HARD); \
864 		} \
865 		return (DOIO_SOFT); \
866 	}
867 #define ALWAYS_HARD(_system, _isc) \
868 	if (recv_errno == _system) { \
869 		dev->result = _isc; \
870 		return (DOIO_HARD); \
871 	}
872 
873 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
874 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
875 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
876 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
877 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
878 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
879 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
880 		/* Should never get this one but it was seen. */
881 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
882 		/*
883 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
884 		 * errors.
885 		 */
886 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
887 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
888 
889 #undef SOFT_OR_HARD
890 #undef ALWAYS_HARD
891 
892 		dev->result = isc__errno2result(recv_errno);
893 		return (DOIO_HARD);
894 	}
895 
896 	/*
897 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
898 	 * while on UDP sockets, zero length reads are perfectly valid,
899 	 * although strange.
900 	 */
901 	switch (sock->type) {
902 	case isc_sockettype_tcp:
903 		if (cc == 0)
904 			return (DOIO_EOF);
905 		break;
906 	case isc_sockettype_udp:
907 		break;
908 	default:
909 		INSIST(0);
910 	}
911 
912 	if (sock->type == isc_sockettype_udp) {
913 		dev->address.ss_len = msghdr.msg_namelen;
914 		if (isc_sockaddr_getport(&dev->address) == 0) {
915 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
916 				socket_log(sock, &dev->address, IOEVENT,
917 					   "dropping source port zero packet");
918 			}
919 			return (DOIO_SOFT);
920 		}
921 	}
922 
923 	socket_log(sock, &dev->address, IOEVENT,
924 		   "packet received correctly");
925 
926 	/*
927 	 * Overflow bit detection.  If we received MORE bytes than we should,
928 	 * this indicates an overflow situation.  Set the flag in the
929 	 * dev entry and adjust how much we read by one.
930 	 */
931 	/*
932 	 * If there are control messages attached, run through them and pull
933 	 * out the interesting bits.
934 	 */
935 	process_cmsg(sock, &msghdr, dev);
936 
937 	/*
938 	 * update the buffers (if any) and the i/o count
939 	 */
940 	dev->n += cc;
941 	actual_count = cc;
942 	buffer = ISC_LIST_HEAD(dev->bufferlist);
943 	while (buffer != NULL && actual_count > 0U) {
944 		if (isc_buffer_availablelength(buffer) <= actual_count) {
945 			actual_count -= isc_buffer_availablelength(buffer);
946 			isc_buffer_add(buffer,
947 				       isc_buffer_availablelength(buffer));
948 		} else {
949 			isc_buffer_add(buffer, actual_count);
950 			actual_count = 0;
951 			POST(actual_count);
952 			break;
953 		}
954 		buffer = ISC_LIST_NEXT(buffer, link);
955 		if (buffer == NULL) {
956 			INSIST(actual_count == 0U);
957 		}
958 	}
959 
960 	/*
961 	 * If we read less than we expected, update counters,
962 	 * and let the upper layer poke the descriptor.
963 	 */
964 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
965 		return (DOIO_SOFT);
966 
967 	/*
968 	 * Full reads are posted, or partials if partials are ok.
969 	 */
970 	dev->result = ISC_R_SUCCESS;
971 	return (DOIO_SUCCESS);
972 }
973 
974 /*
975  * Returns:
976  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
977  *			ISC_R_SUCCESS.
978  *
979  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
980  *			dev->result contains the appropriate error.
981  *
982  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
983  *			event was sent.  The operation should be retried.
984  *
985  *	No other return values are possible.
986  */
987 static int
doio_send(isc_socket_t * sock,isc_socketevent_t * dev)988 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
989 	int cc;
990 	struct iovec iov[MAXSCATTERGATHER_SEND];
991 	size_t write_count;
992 	struct msghdr msghdr;
993 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
994 	int attempts = 0;
995 	int send_errno;
996 	union {
997 		struct msghdr msghdr;
998 		char m[SENDCMSGBUFLEN];
999 	} cmsgbuf;
1000 
1001 	memset(&cmsgbuf, 0, sizeof(cmsgbuf));
1002 
1003 	build_msghdr_send(sock, cmsgbuf.m, dev, &msghdr, iov, &write_count);
1004 
1005  resend:
1006 	cc = sendmsg(sock->fd, &msghdr, 0);
1007 	send_errno = errno;
1008 
1009 	/*
1010 	 * Check for error or block condition.
1011 	 */
1012 	if (cc < 0) {
1013 		if (send_errno == EINTR && ++attempts < NRETRIES)
1014 			goto resend;
1015 
1016 		if (SOFT_ERROR(send_errno)) {
1017 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1018 				dev->result = ISC_R_WOULDBLOCK;
1019 			return (DOIO_SOFT);
1020 		}
1021 
1022 #define SOFT_OR_HARD(_system, _isc) \
1023 	if (send_errno == _system) { \
1024 		if (sock->connected) { \
1025 			dev->result = _isc; \
1026 			return (DOIO_HARD); \
1027 		} \
1028 		return (DOIO_SOFT); \
1029 	}
1030 #define ALWAYS_HARD(_system, _isc) \
1031 	if (send_errno == _system) { \
1032 		dev->result = _isc; \
1033 		return (DOIO_HARD); \
1034 	}
1035 
1036 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1037 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1038 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1039 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1040 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1041 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1042 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1043 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1044 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1045 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1046 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1047 
1048 #undef SOFT_OR_HARD
1049 #undef ALWAYS_HARD
1050 
1051 		/*
1052 		 * The other error types depend on whether or not the
1053 		 * socket is UDP or TCP.  If it is UDP, some errors
1054 		 * that we expect to be fatal under TCP are merely
1055 		 * annoying, and are really soft errors.
1056 		 *
1057 		 * However, these soft errors are still returned as
1058 		 * a status.
1059 		 */
1060 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1061 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1062 				 addrbuf, strerror(send_errno));
1063 		dev->result = isc__errno2result(send_errno);
1064 		return (DOIO_HARD);
1065 	}
1066 
1067 	if (cc == 0) {
1068 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1069 				 "doio_send: send() %s 0", "returned");
1070 	}
1071 
1072 	/*
1073 	 * If we write less than we expected, update counters, poke.
1074 	 */
1075 	dev->n += cc;
1076 	if ((size_t)cc != write_count)
1077 		return (DOIO_SOFT);
1078 
1079 	/*
1080 	 * Exactly what we wanted to write.  We're done with this
1081 	 * entry.  Post its completion event.
1082 	 */
1083 	dev->result = ISC_R_SUCCESS;
1084 	return (DOIO_SUCCESS);
1085 }
1086 
1087 /*
1088  * Kill.
1089  *
1090  * Caller must ensure that the socket is not locked and no external
1091  * references exist.
1092  */
1093 static void
socketclose(isc_socketmgr_t * manager,isc_socket_t * sock,int fd)1094 socketclose(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1095 	/*
1096 	 * No one has this socket open, so the watcher doesn't have to be
1097 	 * poked, and the socket doesn't have to be locked.
1098 	 */
1099 	manager->fds[fd] = NULL;
1100 	manager->fdstate[fd] = CLOSE_PENDING;
1101 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1102 
1103 	if (sock->active == 1) {
1104 		sock->active = 0;
1105 	}
1106 
1107 	/*
1108 	 * update manager->maxfd here (XXX: this should be implemented more
1109 	 * efficiently)
1110 	 */
1111 	if (manager->maxfd == fd) {
1112 		int i;
1113 
1114 		manager->maxfd = 0;
1115 		for (i = fd - 1; i >= 0; i--) {
1116 			if (manager->fdstate[i] == MANAGED) {
1117 				manager->maxfd = i;
1118 				break;
1119 			}
1120 		}
1121 	}
1122 
1123 }
1124 
1125 static void
destroy(isc_socket_t ** sockp)1126 destroy(isc_socket_t **sockp) {
1127 	int fd;
1128 	isc_socket_t *sock = *sockp;
1129 	isc_socketmgr_t *manager = sock->manager;
1130 
1131 	socket_log(sock, NULL, CREATION, "destroying");
1132 
1133 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1134 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1135 	INSIST(sock->connect_ev == NULL);
1136 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1137 
1138 	if (sock->fd >= 0) {
1139 		fd = sock->fd;
1140 		sock->fd = -1;
1141 		socketclose(manager, sock, fd);
1142 	}
1143 
1144 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1145 
1146 	/* can't unlock manager as its memory context is still used */
1147 	free_socket(sockp);
1148 }
1149 
1150 static isc_result_t
allocate_socket(isc_socketmgr_t * manager,isc_sockettype_t type,isc_socket_t ** socketp)1151 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1152 		isc_socket_t **socketp)
1153 {
1154 	isc_socket_t *sock;
1155 
1156 	sock = malloc(sizeof(*sock));
1157 
1158 	if (sock == NULL)
1159 		return (ISC_R_NOMEMORY);
1160 
1161 	sock->references = 0;
1162 
1163 	sock->manager = manager;
1164 	sock->type = type;
1165 	sock->fd = -1;
1166 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1167 	sock->active = 0;
1168 
1169 	ISC_LINK_INIT(sock, link);
1170 
1171 	/*
1172 	 * Set up list of readers and writers to be initially empty.
1173 	 */
1174 	ISC_LIST_INIT(sock->recv_list);
1175 	ISC_LIST_INIT(sock->send_list);
1176 	sock->connect_ev = NULL;
1177 	sock->pending_recv = 0;
1178 	sock->pending_send = 0;
1179 	sock->connected = 0;
1180 	sock->connecting = 0;
1181 	sock->bound = 0;
1182 	sock->pktdscp = 0;
1183 
1184 	/*
1185 	 * Initialize readable and writable events.
1186 	 */
1187 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1188 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1189 		       NULL, sock, sock, NULL);
1190 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1191 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1192 		       NULL, sock, sock, NULL);
1193 
1194 	*socketp = sock;
1195 
1196 	return (ISC_R_SUCCESS);
1197 }
1198 
1199 /*
1200  * This event requires that the various lists be empty, that the reference
1201  * count be 1.  The other socket bits,
1202  * like the lock, must be initialized as well.  The fd associated must be
1203  * marked as closed, by setting it to -1 on close, or this routine will
1204  * also close the socket.
1205  */
1206 static void
free_socket(isc_socket_t ** socketp)1207 free_socket(isc_socket_t **socketp) {
1208 	isc_socket_t *sock = *socketp;
1209 
1210 	INSIST(sock->references == 0);
1211 	INSIST(!sock->connecting);
1212 	INSIST(!sock->pending_recv);
1213 	INSIST(!sock->pending_send);
1214 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1215 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1216 	INSIST(!ISC_LINK_LINKED(sock, link));
1217 
1218 	free(sock);
1219 
1220 	*socketp = NULL;
1221 }
1222 
1223 static void
use_min_mtu(isc_socket_t * sock)1224 use_min_mtu(isc_socket_t *sock) {
1225 	/* use minimum MTU */
1226 	if (sock->pf == AF_INET6) {
1227 		int on = 1;
1228 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1229 				(void *)&on, sizeof(on));
1230 	}
1231 }
1232 
1233 static void
set_tcp_maxseg(isc_socket_t * sock,int size)1234 set_tcp_maxseg(isc_socket_t *sock, int size) {
1235 	if (sock->type == isc_sockettype_tcp)
1236 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1237 				(void *)&size, sizeof(size));
1238 }
1239 
1240 static isc_result_t
opensocket(isc_socket_t * sock)1241 opensocket(isc_socket_t *sock)
1242 {
1243 	isc_result_t result;
1244 	const char *err = "socket";
1245 	int on = 1;
1246 
1247 	switch (sock->type) {
1248 	case isc_sockettype_udp:
1249 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1250 		break;
1251 	case isc_sockettype_tcp:
1252 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1253 		break;
1254 	}
1255 
1256 	if (sock->fd < 0) {
1257 		switch (errno) {
1258 		case EMFILE:
1259 		case ENFILE:
1260 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1261 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1262 				       "%s: %s", err, strerror(errno));
1263 			/* fallthrough */
1264 		case ENOBUFS:
1265 			return (ISC_R_NORESOURCES);
1266 
1267 		case EPROTONOSUPPORT:
1268 		case EPFNOSUPPORT:
1269 		case EAFNOSUPPORT:
1270 		/*
1271 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1272 		 * EAFNOSUPPORT.
1273 		 */
1274 		case EINVAL:
1275 			return (ISC_R_FAMILYNOSUPPORT);
1276 
1277 		default:
1278 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1279 					 "%s() %s: %s", err, "failed",
1280 					 strerror(errno));
1281 			return (ISC_R_UNEXPECTED);
1282 		}
1283 	}
1284 
1285 	result = make_nonblock(sock->fd);
1286 	if (result != ISC_R_SUCCESS) {
1287 		(void)close(sock->fd);
1288 		return (result);
1289 	}
1290 
1291 	/*
1292 	 * Use minimum mtu if possible.
1293 	 */
1294 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1295 		use_min_mtu(sock);
1296 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1297 	}
1298 
1299 	if (sock->type == isc_sockettype_udp) {
1300 
1301 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1302 			       (void *)&on, sizeof(on)) < 0
1303 		    && errno != ENOPROTOOPT) {
1304 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1305 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1306 					 sock->fd, "failed", strerror(errno));
1307 			/* Press on... */
1308 		}
1309 
1310 		/* RFC 3542 */
1311 		if ((sock->pf == AF_INET6)
1312 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1313 				   (void *)&on, sizeof(on)) < 0)) {
1314 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1315 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1316 					 "%s: %s", sock->fd, "failed",
1317 					 strerror(errno));
1318 		}
1319 	}
1320 
1321 	if (sock->active == 0) {
1322 		sock->active = 1;
1323 	}
1324 
1325 	return (ISC_R_SUCCESS);
1326 }
1327 
1328 /*
1329  * Create a 'type' socket managed
1330  * by 'manager'.  Events will be posted to 'task' and when dispatched
1331  * 'action' will be called with 'arg' as the arg value.  The new
1332  * socket is returned in 'socketp'.
1333  */
1334 static isc_result_t
socket_create(isc_socketmgr_t * manager0,int pf,isc_sockettype_t type,isc_socket_t ** socketp)1335 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1336 	      isc_socket_t **socketp)
1337 {
1338 	isc_socket_t *sock = NULL;
1339 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
1340 	isc_result_t result;
1341 
1342 	REQUIRE(socketp != NULL && *socketp == NULL);
1343 
1344 	result = allocate_socket(manager, type, &sock);
1345 	if (result != ISC_R_SUCCESS)
1346 		return (result);
1347 
1348 	switch (sock->type) {
1349 	case isc_sockettype_udp:
1350 		sock->pktdscp = 1;
1351 		break;
1352 	case isc_sockettype_tcp:
1353 		break;
1354 	default:
1355 		INSIST(0);
1356 	}
1357 
1358 	sock->pf = pf;
1359 
1360 	result = opensocket(sock);
1361 	if (result != ISC_R_SUCCESS) {
1362 		free_socket(&sock);
1363 		return (result);
1364 	}
1365 
1366 	sock->references = 1;
1367 	*socketp = (isc_socket_t *)sock;
1368 
1369 	/*
1370 	 * Note we don't have to lock the socket like we normally would because
1371 	 * there are no external references to it yet.
1372 	 */
1373 
1374 	manager->fds[sock->fd] = sock;
1375 	manager->fdstate[sock->fd] = MANAGED;
1376 
1377 	ISC_LIST_APPEND(manager->socklist, sock, link);
1378 	if (manager->maxfd < sock->fd)
1379 		manager->maxfd = sock->fd;
1380 
1381 	socket_log(sock, NULL, CREATION, "created");
1382 
1383 	return (ISC_R_SUCCESS);
1384 }
1385 
1386 /*%
1387  * Create a new 'type' socket managed by 'manager'.  Events
1388  * will be posted to 'task' and when dispatched 'action' will be
1389  * called with 'arg' as the arg value.  The new socket is returned
1390  * in 'socketp'.
1391  */
1392 isc_result_t
isc_socket_create(isc_socketmgr_t * manager0,int pf,isc_sockettype_t type,isc_socket_t ** socketp)1393 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1394 		   isc_socket_t **socketp)
1395 {
1396 	return (socket_create(manager0, pf, type, socketp));
1397 }
1398 
1399 /*
1400  * Attach to a socket.  Caller must explicitly detach when it is done.
1401  */
1402 void
isc_socket_attach(isc_socket_t * sock0,isc_socket_t ** socketp)1403 isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1404 	isc_socket_t *sock = (isc_socket_t *)sock0;
1405 
1406 	REQUIRE(socketp != NULL && *socketp == NULL);
1407 
1408 	sock->references++;
1409 
1410 	*socketp = (isc_socket_t *)sock;
1411 }
1412 
1413 /*
1414  * Dereference a socket.  If this is the last reference to it, clean things
1415  * up by destroying the socket.
1416  */
1417 void
isc_socket_detach(isc_socket_t ** socketp)1418 isc_socket_detach(isc_socket_t **socketp) {
1419 	isc_socket_t *sock;
1420 	int kill_socket = 0;
1421 
1422 	REQUIRE(socketp != NULL);
1423 	sock = (isc_socket_t *)*socketp;
1424 
1425 	REQUIRE(sock->references > 0);
1426 	sock->references--;
1427 	if (sock->references == 0)
1428 		kill_socket = 1;
1429 
1430 	if (kill_socket)
1431 		destroy(&sock);
1432 
1433 	*socketp = NULL;
1434 }
1435 
1436 /*
1437  * I/O is possible on a given socket.  Schedule an event to this task that
1438  * will call an internal function to do the I/O.  This will charge the
1439  * task with the I/O operation and let our select loop handler get back
1440  * to doing something real as fast as possible.
1441  *
1442  * The socket and manager must be locked before calling this function.
1443  */
1444 static void
dispatch_recv(isc_socket_t * sock)1445 dispatch_recv(isc_socket_t *sock) {
1446 	intev_t *iev;
1447 	isc_socketevent_t *ev;
1448 	isc_task_t *sender;
1449 
1450 	INSIST(!sock->pending_recv);
1451 
1452 	ev = ISC_LIST_HEAD(sock->recv_list);
1453 	if (ev == NULL)
1454 		return;
1455 	socket_log(sock, NULL, EVENT,
1456 		   "dispatch_recv:  event %p -> task %p",
1457 		   ev, ev->ev_sender);
1458 	sender = ev->ev_sender;
1459 
1460 	sock->pending_recv = 1;
1461 	iev = &sock->readable_ev;
1462 
1463 	sock->references++;
1464 	iev->ev_sender = sock;
1465 	iev->ev_action = internal_recv;
1466 	iev->ev_arg = sock;
1467 
1468 	isc_task_send(sender, (isc_event_t **)&iev);
1469 }
1470 
1471 static void
dispatch_send(isc_socket_t * sock)1472 dispatch_send(isc_socket_t *sock) {
1473 	intev_t *iev;
1474 	isc_socketevent_t *ev;
1475 	isc_task_t *sender;
1476 
1477 	INSIST(!sock->pending_send);
1478 
1479 	ev = ISC_LIST_HEAD(sock->send_list);
1480 	if (ev == NULL)
1481 		return;
1482 	socket_log(sock, NULL, EVENT,
1483 		   "dispatch_send:  event %p -> task %p",
1484 		   ev, ev->ev_sender);
1485 	sender = ev->ev_sender;
1486 
1487 	sock->pending_send = 1;
1488 	iev = &sock->writable_ev;
1489 
1490 	sock->references++;
1491 	iev->ev_sender = sock;
1492 	iev->ev_action = internal_send;
1493 	iev->ev_arg = sock;
1494 
1495 	isc_task_send(sender, (isc_event_t **)&iev);
1496 }
1497 
1498 static void
dispatch_connect(isc_socket_t * sock)1499 dispatch_connect(isc_socket_t *sock) {
1500 	intev_t *iev;
1501 	isc_socket_connev_t *ev;
1502 
1503 	iev = &sock->writable_ev;
1504 
1505 	ev = sock->connect_ev;
1506 	INSIST(ev != NULL); /* XXX */
1507 
1508 	INSIST(sock->connecting);
1509 
1510 	sock->references++;  /* keep socket around for this internal event */
1511 	iev->ev_sender = sock;
1512 	iev->ev_action = internal_connect;
1513 	iev->ev_arg = sock;
1514 
1515 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1516 }
1517 
1518 /*
1519  * Dequeue an item off the given socket's read queue, set the result code
1520  * in the done event to the one provided, and send it to the task it was
1521  * destined for.
1522  *
1523  * If the event to be sent is on a list, remove it before sending.  If
1524  * asked to, send and detach from the socket as well.
1525  *
1526  * Caller must have the socket locked if the event is attached to the socket.
1527  */
1528 static void
send_recvdone_event(isc_socket_t * sock,isc_socketevent_t ** dev)1529 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1530 	isc_task_t *task;
1531 
1532 	task = (*dev)->ev_sender;
1533 
1534 	(*dev)->ev_sender = sock;
1535 
1536 	if (ISC_LINK_LINKED(*dev, ev_link))
1537 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1538 
1539 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1540 	    == ISC_SOCKEVENTATTR_ATTACHED)
1541 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1542 	else
1543 		isc_task_send(task, (isc_event_t **)dev);
1544 }
1545 
1546 /*
1547  * See comments for send_recvdone_event() above.
1548  *
1549  * Caller must have the socket locked if the event is attached to the socket.
1550  */
1551 static void
send_senddone_event(isc_socket_t * sock,isc_socketevent_t ** dev)1552 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1553 	isc_task_t *task;
1554 
1555 	INSIST(dev != NULL && *dev != NULL);
1556 
1557 	task = (*dev)->ev_sender;
1558 	(*dev)->ev_sender = sock;
1559 
1560 	if (ISC_LINK_LINKED(*dev, ev_link))
1561 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1562 
1563 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1564 	    == ISC_SOCKEVENTATTR_ATTACHED)
1565 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1566 	else
1567 		isc_task_send(task, (isc_event_t **)dev);
1568 }
1569 
1570 static void
internal_recv(isc_task_t * me,isc_event_t * ev)1571 internal_recv(isc_task_t *me, isc_event_t *ev) {
1572 	isc_socketevent_t *dev;
1573 	isc_socket_t *sock;
1574 
1575 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1576 
1577 	sock = ev->ev_sender;
1578 
1579 	socket_log(sock, NULL, IOEVENT,
1580 		   "internal_recv: task %p got event %p", me, ev);
1581 
1582 	INSIST(sock->pending_recv == 1);
1583 	sock->pending_recv = 0;
1584 
1585 	INSIST(sock->references > 0);
1586 	sock->references--;  /* the internal event is done with this socket */
1587 	if (sock->references == 0) {
1588 		destroy(&sock);
1589 		return;
1590 	}
1591 
1592 	/*
1593 	 * Try to do as much I/O as possible on this socket.  There are no
1594 	 * limits here, currently.
1595 	 */
1596 	dev = ISC_LIST_HEAD(sock->recv_list);
1597 	while (dev != NULL) {
1598 		switch (doio_recv(sock, dev)) {
1599 		case DOIO_SOFT:
1600 			goto poke;
1601 
1602 		case DOIO_EOF:
1603 			/*
1604 			 * read of 0 means the remote end was closed.
1605 			 * Run through the event queue and dispatch all
1606 			 * the events with an EOF result code.
1607 			 */
1608 			do {
1609 				dev->result = ISC_R_EOF;
1610 				send_recvdone_event(sock, &dev);
1611 				dev = ISC_LIST_HEAD(sock->recv_list);
1612 			} while (dev != NULL);
1613 			goto poke;
1614 
1615 		case DOIO_SUCCESS:
1616 		case DOIO_HARD:
1617 			send_recvdone_event(sock, &dev);
1618 			break;
1619 		}
1620 
1621 		dev = ISC_LIST_HEAD(sock->recv_list);
1622 	}
1623 
1624  poke:
1625 	if (!ISC_LIST_EMPTY(sock->recv_list))
1626 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1627 }
1628 
1629 static void
internal_send(isc_task_t * me,isc_event_t * ev)1630 internal_send(isc_task_t *me, isc_event_t *ev) {
1631 	isc_socketevent_t *dev;
1632 	isc_socket_t *sock;
1633 
1634 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1635 
1636 	/*
1637 	 * Find out what socket this is and lock it.
1638 	 */
1639 	sock = (isc_socket_t *)ev->ev_sender;
1640 	socket_log(sock, NULL, IOEVENT,
1641 		   "internal_send: task %p got event %p", me, ev);
1642 
1643 	INSIST(sock->pending_send == 1);
1644 	sock->pending_send = 0;
1645 
1646 	INSIST(sock->references > 0);
1647 	sock->references--;  /* the internal event is done with this socket */
1648 	if (sock->references == 0) {
1649 		destroy(&sock);
1650 		return;
1651 	}
1652 
1653 	/*
1654 	 * Try to do as much I/O as possible on this socket.  There are no
1655 	 * limits here, currently.
1656 	 */
1657 	dev = ISC_LIST_HEAD(sock->send_list);
1658 	while (dev != NULL) {
1659 		switch (doio_send(sock, dev)) {
1660 		case DOIO_SOFT:
1661 			goto poke;
1662 
1663 		case DOIO_HARD:
1664 		case DOIO_SUCCESS:
1665 			send_senddone_event(sock, &dev);
1666 			break;
1667 		}
1668 
1669 		dev = ISC_LIST_HEAD(sock->send_list);
1670 	}
1671 
1672  poke:
1673 	if (!ISC_LIST_EMPTY(sock->send_list))
1674 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1675 }
1676 
1677 /*
1678  * Process read/writes on each fd here.  Avoid locking
1679  * and unlocking twice if both reads and writes are possible.
1680  */
1681 static void
process_fd(isc_socketmgr_t * manager,int fd,int readable,int writeable)1682 process_fd(isc_socketmgr_t *manager, int fd, int readable,
1683 	   int writeable)
1684 {
1685 	isc_socket_t *sock;
1686 	int unwatch_read = 0, unwatch_write = 0;
1687 
1688 	/*
1689 	 * If the socket is going to be closed, don't do more I/O.
1690 	 */
1691 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1692 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1693 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1694 		return;
1695 	}
1696 
1697 	sock = manager->fds[fd];
1698 	if (readable) {
1699 		if (sock == NULL) {
1700 			unwatch_read = 1;
1701 			goto check_write;
1702 		}
1703 		if (!SOCK_DEAD(sock)) {
1704 			dispatch_recv(sock);
1705 		}
1706 		unwatch_read = 1;
1707 	}
1708 check_write:
1709 	if (writeable) {
1710 		if (sock == NULL) {
1711 			unwatch_write = 1;
1712 			goto unlock_fd;
1713 		}
1714 		if (!SOCK_DEAD(sock)) {
1715 			if (sock->connecting)
1716 				dispatch_connect(sock);
1717 			else
1718 				dispatch_send(sock);
1719 		}
1720 		unwatch_write = 1;
1721 	}
1722 
1723  unlock_fd:
1724 	if (unwatch_read)
1725 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1726 	if (unwatch_write)
1727 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1728 
1729 }
1730 
1731 static void
process_fds(isc_socketmgr_t * manager,int maxfd,fd_set * readfds,fd_set * writefds)1732 process_fds(isc_socketmgr_t *manager, int maxfd, fd_set *readfds,
1733 	    fd_set *writefds)
1734 {
1735 	int i;
1736 
1737 	REQUIRE(maxfd <= (int)manager->maxsocks);
1738 
1739 	for (i = 0; i < maxfd; i++) {
1740 		process_fd(manager, i, FD_ISSET(i, readfds),
1741 			   FD_ISSET(i, writefds));
1742 	}
1743 }
1744 
1745 /*
1746  * Create a new socket manager.
1747  */
1748 
1749 static isc_result_t
setup_watcher(isc_socketmgr_t * manager)1750 setup_watcher(isc_socketmgr_t *manager) {
1751 	isc_result_t result;
1752 
1753 	UNUSED(result);
1754 
1755 	manager->fd_bufsize = sizeof(fd_set);
1756 
1757 	manager->read_fds = NULL;
1758 	manager->read_fds_copy = NULL;
1759 	manager->write_fds = NULL;
1760 	manager->write_fds_copy = NULL;
1761 
1762 	manager->read_fds = malloc(manager->fd_bufsize);
1763 	if (manager->read_fds != NULL)
1764 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1765 	if (manager->read_fds_copy != NULL)
1766 		manager->write_fds = malloc(manager->fd_bufsize);
1767 	if (manager->write_fds != NULL) {
1768 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1769 	}
1770 	if (manager->write_fds_copy == NULL) {
1771 		if (manager->write_fds != NULL) {
1772 			free(manager->write_fds);
1773 		}
1774 		if (manager->read_fds_copy != NULL) {
1775 			free(manager->read_fds_copy);
1776 		}
1777 		if (manager->read_fds != NULL) {
1778 			free(manager->read_fds);
1779 		}
1780 		return (ISC_R_NOMEMORY);
1781 	}
1782 	memset(manager->read_fds, 0, manager->fd_bufsize);
1783 	memset(manager->write_fds, 0, manager->fd_bufsize);
1784 
1785 	manager->maxfd = 0;
1786 
1787 	return (ISC_R_SUCCESS);
1788 }
1789 
1790 static void
cleanup_watcher(isc_socketmgr_t * manager)1791 cleanup_watcher(isc_socketmgr_t *manager) {
1792 
1793 	if (manager->read_fds != NULL)
1794 		free(manager->read_fds);
1795 	if (manager->read_fds_copy != NULL)
1796 		free(manager->read_fds_copy);
1797 	if (manager->write_fds != NULL)
1798 		free(manager->write_fds);
1799 	if (manager->write_fds_copy != NULL)
1800 		free(manager->write_fds_copy);
1801 }
1802 
1803 static isc_result_t
isc_socketmgr_create2(isc_socketmgr_t ** managerp,unsigned int maxsocks)1804 isc_socketmgr_create2(isc_socketmgr_t **managerp,
1805 		       unsigned int maxsocks)
1806 {
1807 	isc_socketmgr_t *manager;
1808 	isc_result_t result;
1809 
1810 	REQUIRE(managerp != NULL && *managerp == NULL);
1811 
1812 	if (socketmgr != NULL) {
1813 		/* Don't allow maxsocks to be updated */
1814 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1815 			return (ISC_R_EXISTS);
1816 
1817 		socketmgr->refs++;
1818 		*managerp = (isc_socketmgr_t *)socketmgr;
1819 		return (ISC_R_SUCCESS);
1820 	}
1821 
1822 	if (maxsocks == 0)
1823 		maxsocks = FD_SETSIZE;
1824 
1825 	manager = malloc(sizeof(*manager));
1826 	if (manager == NULL)
1827 		return (ISC_R_NOMEMORY);
1828 
1829 	/* zero-clear so that necessary cleanup on failure will be easy */
1830 	memset(manager, 0, sizeof(*manager));
1831 	manager->maxsocks = maxsocks;
1832 	manager->fds = reallocarray(NULL, manager->maxsocks, sizeof(isc_socket_t *));
1833 	if (manager->fds == NULL) {
1834 		result = ISC_R_NOMEMORY;
1835 		goto free_manager;
1836 	}
1837 	manager->fdstate = reallocarray(NULL, manager->maxsocks, sizeof(int));
1838 	if (manager->fdstate == NULL) {
1839 		result = ISC_R_NOMEMORY;
1840 		goto free_manager;
1841 	}
1842 
1843 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
1844 	ISC_LIST_INIT(manager->socklist);
1845 
1846 	manager->refs = 1;
1847 
1848 	/*
1849 	 * Set up initial state for the select loop
1850 	 */
1851 	result = setup_watcher(manager);
1852 	if (result != ISC_R_SUCCESS)
1853 		goto cleanup;
1854 
1855 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1856 
1857 	socketmgr = manager;
1858 	*managerp = (isc_socketmgr_t *)manager;
1859 
1860 	return (ISC_R_SUCCESS);
1861 
1862 cleanup:
1863 
1864 free_manager:
1865 	if (manager->fdstate != NULL) {
1866 		free(manager->fdstate);
1867 	}
1868 	if (manager->fds != NULL) {
1869 		free(manager->fds);
1870 	}
1871 	free(manager);
1872 
1873 	return (result);
1874 }
1875 
1876 isc_result_t
isc_socketmgr_create(isc_socketmgr_t ** managerp)1877 isc_socketmgr_create(isc_socketmgr_t **managerp) {
1878 	return (isc_socketmgr_create2(managerp, 0));
1879 }
1880 
1881 void
isc_socketmgr_destroy(isc_socketmgr_t ** managerp)1882 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
1883 	isc_socketmgr_t *manager;
1884 	int i;
1885 
1886 	/*
1887 	 * Destroy a socket manager.
1888 	 */
1889 
1890 	REQUIRE(managerp != NULL);
1891 	manager = (isc_socketmgr_t *)*managerp;
1892 
1893 	manager->refs--;
1894 	if (manager->refs > 0) {
1895 		*managerp = NULL;
1896 		return;
1897 	}
1898 	socketmgr = NULL;
1899 
1900 	/*
1901 	 * Wait for all sockets to be destroyed.
1902 	 */
1903 	while (!ISC_LIST_EMPTY(manager->socklist)) {
1904 		isc_taskmgr_dispatch(NULL);
1905 	}
1906 
1907 	/*
1908 	 * Here, poke our select/poll thread.  Do this by closing the write
1909 	 * half of the pipe, which will send EOF to the read half.
1910 	 * This is currently a no-op in the non-threaded case.
1911 	 */
1912 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
1913 
1914 	/*
1915 	 * Clean up.
1916 	 */
1917 	cleanup_watcher(manager);
1918 
1919 	for (i = 0; i < (int)manager->maxsocks; i++)
1920 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
1921 			(void)close(i);
1922 
1923 	free(manager->fds);
1924 	free(manager->fdstate);
1925 
1926 	free(manager);
1927 
1928 	*managerp = NULL;
1929 
1930 	socketmgr = NULL;
1931 }
1932 
1933 static isc_result_t
socket_recv(isc_socket_t * sock,isc_socketevent_t * dev,isc_task_t * task,unsigned int flags)1934 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
1935 	    unsigned int flags)
1936 {
1937 	int io_state;
1938 	isc_task_t *ntask = NULL;
1939 	isc_result_t result = ISC_R_SUCCESS;
1940 
1941 	dev->ev_sender = task;
1942 
1943 	if (sock->type == isc_sockettype_udp) {
1944 		io_state = doio_recv(sock, dev);
1945 	} else {
1946 		if (ISC_LIST_EMPTY(sock->recv_list))
1947 			io_state = doio_recv(sock, dev);
1948 		else
1949 			io_state = DOIO_SOFT;
1950 	}
1951 
1952 	switch (io_state) {
1953 	case DOIO_SOFT:
1954 		/*
1955 		 * We couldn't read all or part of the request right now, so
1956 		 * queue it.
1957 		 *
1958 		 * Attach to socket and to task
1959 		 */
1960 		isc_task_attach(task, &ntask);
1961 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
1962 
1963 		/*
1964 		 * Enqueue the request.  If the socket was previously not being
1965 		 * watched, poke the watcher to start paying attention to it.
1966 		 */
1967 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
1968 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1969 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
1970 
1971 		socket_log(sock, NULL, EVENT,
1972 			   "socket_recv: event %p -> task %p",
1973 			   dev, ntask);
1974 
1975 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
1976 			result = ISC_R_INPROGRESS;
1977 		break;
1978 
1979 	case DOIO_EOF:
1980 		dev->result = ISC_R_EOF;
1981 		/* fallthrough */
1982 
1983 	case DOIO_HARD:
1984 	case DOIO_SUCCESS:
1985 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
1986 			send_recvdone_event(sock, &dev);
1987 		break;
1988 	}
1989 
1990 	return (result);
1991 }
1992 
1993 isc_result_t
isc_socket_recvv(isc_socket_t * sock0,isc_bufferlist_t * buflist,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,void * arg)1994 isc_socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
1995 		  unsigned int minimum, isc_task_t *task,
1996 		  isc_taskaction_t action, void *arg)
1997 {
1998 	isc_socket_t *sock = (isc_socket_t *)sock0;
1999 	isc_socketevent_t *dev;
2000 	unsigned int iocount;
2001 	isc_buffer_t *buffer;
2002 
2003 	REQUIRE(buflist != NULL);
2004 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2005 	REQUIRE(task != NULL);
2006 	REQUIRE(action != NULL);
2007 
2008 	iocount = isc_bufferlist_availablecount(buflist);
2009 	REQUIRE(iocount > 0);
2010 
2011 	INSIST(sock->bound);
2012 
2013 	dev = allocate_socketevent(sock,
2014 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2015 	if (dev == NULL)
2016 		return (ISC_R_NOMEMORY);
2017 
2018 	/*
2019 	 * UDP sockets are always partial read
2020 	 */
2021 	if (sock->type == isc_sockettype_udp)
2022 		dev->minimum = 1;
2023 	else {
2024 		if (minimum == 0)
2025 			dev->minimum = iocount;
2026 		else
2027 			dev->minimum = minimum;
2028 	}
2029 
2030 	/*
2031 	 * Move each buffer from the passed in list to our internal one.
2032 	 */
2033 	buffer = ISC_LIST_HEAD(*buflist);
2034 	while (buffer != NULL) {
2035 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2036 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2037 		buffer = ISC_LIST_HEAD(*buflist);
2038 	}
2039 
2040 	return (socket_recv(sock, dev, task, 0));
2041 }
2042 
2043 static isc_result_t
socket_send(isc_socket_t * sock,isc_socketevent_t * dev,isc_task_t * task,struct sockaddr_storage * address,struct in6_pktinfo * pktinfo,unsigned int flags)2044 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2045 	    struct sockaddr_storage *address, struct in6_pktinfo *pktinfo,
2046 	    unsigned int flags)
2047 {
2048 	int io_state;
2049 	isc_task_t *ntask = NULL;
2050 	isc_result_t result = ISC_R_SUCCESS;
2051 
2052 	dev->ev_sender = task;
2053 
2054 	set_dev_address(address, sock, dev);
2055 	if (pktinfo != NULL) {
2056 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2057 		dev->pktinfo = *pktinfo;
2058 
2059 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2060 		    !isc_sockaddr_islinklocal(&dev->address)) {
2061 			socket_log(sock, NULL, TRACE,
2062 				   "pktinfo structure provided, ifindex %u "
2063 				   "(set to 0)", pktinfo->ipi6_ifindex);
2064 
2065 			/*
2066 			 * Set the pktinfo index to 0 here, to let the
2067 			 * kernel decide what interface it should send on.
2068 			 */
2069 			dev->pktinfo.ipi6_ifindex = 0;
2070 		}
2071 	}
2072 
2073 	if (sock->type == isc_sockettype_udp)
2074 		io_state = doio_send(sock, dev);
2075 	else {
2076 		if (ISC_LIST_EMPTY(sock->send_list))
2077 			io_state = doio_send(sock, dev);
2078 		else
2079 			io_state = DOIO_SOFT;
2080 	}
2081 
2082 	switch (io_state) {
2083 	case DOIO_SOFT:
2084 		/*
2085 		 * We couldn't send all or part of the request right now, so
2086 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2087 		 */
2088 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2089 			isc_task_attach(task, &ntask);
2090 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2091 
2092 			/*
2093 			 * Enqueue the request.  If the socket was previously
2094 			 * not being watched, poke the watcher to start
2095 			 * paying attention to it.
2096 			 */
2097 			if (ISC_LIST_EMPTY(sock->send_list) &&
2098 			    !sock->pending_send)
2099 				select_poke(sock->manager, sock->fd,
2100 					    SELECT_POKE_WRITE);
2101 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2102 
2103 			socket_log(sock, NULL, EVENT,
2104 				   "socket_send: event %p -> task %p",
2105 				   dev, ntask);
2106 
2107 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2108 				result = ISC_R_INPROGRESS;
2109 			break;
2110 		}
2111 
2112 		/* FALLTHROUGH */
2113 
2114 	case DOIO_HARD:
2115 	case DOIO_SUCCESS:
2116 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2117 			send_senddone_event(sock, &dev);
2118 		break;
2119 	}
2120 
2121 	return (result);
2122 }
2123 
2124 isc_result_t
isc_socket_sendv(isc_socket_t * sock,isc_bufferlist_t * buflist,isc_task_t * task,isc_taskaction_t action,void * arg)2125 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2126 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2127 {
2128 	return (isc_socket_sendtov2(sock, buflist, task, action, arg, NULL,
2129 				     NULL, 0));
2130 }
2131 
2132 isc_result_t
isc_socket_sendtov2(isc_socket_t * sock0,isc_bufferlist_t * buflist,isc_task_t * task,isc_taskaction_t action,void * arg,struct sockaddr_storage * address,struct in6_pktinfo * pktinfo,unsigned int flags)2133 isc_socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2134 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2135 		     struct sockaddr_storage *address, struct in6_pktinfo *pktinfo,
2136 		     unsigned int flags)
2137 {
2138 	isc_socket_t *sock = (isc_socket_t *)sock0;
2139 	isc_socketevent_t *dev;
2140 	unsigned int iocount;
2141 	isc_buffer_t *buffer;
2142 
2143 	REQUIRE(buflist != NULL);
2144 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2145 	REQUIRE(task != NULL);
2146 	REQUIRE(action != NULL);
2147 
2148 	iocount = isc_bufferlist_usedcount(buflist);
2149 	REQUIRE(iocount > 0);
2150 
2151 	dev = allocate_socketevent(sock,
2152 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2153 	if (dev == NULL)
2154 		return (ISC_R_NOMEMORY);
2155 
2156 	/*
2157 	 * Move each buffer from the passed in list to our internal one.
2158 	 */
2159 	buffer = ISC_LIST_HEAD(*buflist);
2160 	while (buffer != NULL) {
2161 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2162 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2163 		buffer = ISC_LIST_HEAD(*buflist);
2164 	}
2165 
2166 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2167 }
2168 
2169 isc_result_t
isc_socket_bind(isc_socket_t * sock0,struct sockaddr_storage * sockaddr,unsigned int options)2170 isc_socket_bind(isc_socket_t *sock0, struct sockaddr_storage *sockaddr,
2171 		 unsigned int options) {
2172 	isc_socket_t *sock = (isc_socket_t *)sock0;
2173 	int on = 1;
2174 
2175 	INSIST(!sock->bound);
2176 
2177 	if (sock->pf != sockaddr->ss_family) {
2178 		return (ISC_R_FAMILYMISMATCH);
2179 	}
2180 
2181 	/*
2182 	 * Only set SO_REUSEADDR when we want a specific port.
2183 	 */
2184 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2185 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2186 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2187 		       sizeof(on)) < 0) {
2188 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2189 				 "setsockopt(%d) %s", sock->fd, "failed");
2190 		/* Press on... */
2191 	}
2192 	if (bind(sock->fd, (struct sockaddr *)sockaddr, sockaddr->ss_len) < 0) {
2193 		switch (errno) {
2194 		case EACCES:
2195 			return (ISC_R_NOPERM);
2196 		case EADDRNOTAVAIL:
2197 			return (ISC_R_ADDRNOTAVAIL);
2198 		case EADDRINUSE:
2199 			return (ISC_R_ADDRINUSE);
2200 		case EINVAL:
2201 			return (ISC_R_BOUND);
2202 		default:
2203 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2204 					 strerror(errno));
2205 			return (ISC_R_UNEXPECTED);
2206 		}
2207 	}
2208 
2209 	socket_log(sock, sockaddr, TRACE, "bound");
2210 	sock->bound = 1;
2211 
2212 	return (ISC_R_SUCCESS);
2213 }
2214 
2215 isc_result_t
isc_socket_connect(isc_socket_t * sock0,struct sockaddr_storage * addr,isc_task_t * task,isc_taskaction_t action,void * arg)2216 isc_socket_connect(isc_socket_t *sock0, struct sockaddr_storage *addr,
2217 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2218 {
2219 	isc_socket_t *sock = (isc_socket_t *)sock0;
2220 	isc_socket_connev_t *dev;
2221 	isc_task_t *ntask = NULL;
2222 	isc_socketmgr_t *manager;
2223 	int cc;
2224 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2225 
2226 	REQUIRE(addr != NULL);
2227 	REQUIRE(task != NULL);
2228 	REQUIRE(action != NULL);
2229 
2230 	manager = sock->manager;
2231 	REQUIRE(addr != NULL);
2232 
2233 	if (isc_sockaddr_ismulticast(addr))
2234 		return (ISC_R_MULTICAST);
2235 
2236 	REQUIRE(!sock->connecting);
2237 
2238 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2239 							ISC_SOCKEVENT_CONNECT,
2240 							action,	arg,
2241 							sizeof(*dev));
2242 	if (dev == NULL) {
2243 		return (ISC_R_NOMEMORY);
2244 	}
2245 	ISC_LINK_INIT(dev, ev_link);
2246 
2247 	/*
2248 	 * Try to do the connect right away, as there can be only one
2249 	 * outstanding, and it might happen to complete.
2250 	 */
2251 	sock->peer_address = *addr;
2252 	cc = connect(sock->fd, (struct sockaddr *)addr, addr->ss_len);
2253 	if (cc < 0) {
2254 		/*
2255 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2256 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2257 		 * a success and let the user detect it if it's really an error
2258 		 * at the time of sending a packet on the socket.
2259 		 */
2260 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2261 			cc = 0;
2262 			goto success;
2263 		}
2264 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2265 			goto queue;
2266 
2267 		switch (errno) {
2268 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2269 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2270 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2271 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2272 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2273 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2274 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2275 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2276 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2277 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2278 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2279 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2280 #undef ERROR_MATCH
2281 		}
2282 
2283 		sock->connected = 0;
2284 
2285 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2286 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2287 				 addrbuf, errno, strerror(errno));
2288 
2289 		isc_event_free(ISC_EVENT_PTR(&dev));
2290 		return (ISC_R_UNEXPECTED);
2291 
2292 	err_exit:
2293 		sock->connected = 0;
2294 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2295 
2296 		return (ISC_R_SUCCESS);
2297 	}
2298 
2299 	/*
2300 	 * If connect completed, fire off the done event.
2301 	 */
2302  success:
2303 	if (cc == 0) {
2304 		sock->connected = 1;
2305 		sock->bound = 1;
2306 		dev->result = ISC_R_SUCCESS;
2307 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2308 
2309 		return (ISC_R_SUCCESS);
2310 	}
2311 
2312  queue:
2313 
2314 	/*
2315 	 * Attach to task.
2316 	 */
2317 	isc_task_attach(task, &ntask);
2318 
2319 	sock->connecting = 1;
2320 
2321 	dev->ev_sender = ntask;
2322 
2323 	/*
2324 	 * Poke watcher here.  We still have the socket locked, so there
2325 	 * is no race condition.  We will keep the lock for such a short
2326 	 * bit of time waking it up now or later won't matter all that much.
2327 	 */
2328 	if (sock->connect_ev == NULL)
2329 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2330 
2331 	sock->connect_ev = dev;
2332 
2333 	return (ISC_R_SUCCESS);
2334 }
2335 
2336 /*
2337  * Called when a socket with a pending connect() finishes.
2338  */
2339 static void
internal_connect(isc_task_t * me,isc_event_t * ev)2340 internal_connect(isc_task_t *me, isc_event_t *ev) {
2341 	isc_socket_t *sock;
2342 	isc_socket_connev_t *dev;
2343 	isc_task_t *task;
2344 	int cc;
2345 	socklen_t optlen;
2346 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2347 
2348 	UNUSED(me);
2349 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2350 
2351 	sock = ev->ev_sender;
2352 
2353 	/*
2354 	 * When the internal event was sent the reference count was bumped
2355 	 * to keep the socket around for us.  Decrement the count here.
2356 	 */
2357 	INSIST(sock->references > 0);
2358 	sock->references--;
2359 	if (sock->references == 0) {
2360 		destroy(&sock);
2361 		return;
2362 	}
2363 
2364 	/*
2365 	 * Has this event been canceled?
2366 	 */
2367 	dev = sock->connect_ev;
2368 	if (dev == NULL) {
2369 		INSIST(!sock->connecting);
2370 		return;
2371 	}
2372 
2373 	INSIST(sock->connecting);
2374 	sock->connecting = 0;
2375 
2376 	/*
2377 	 * Get any possible error status here.
2378 	 */
2379 	optlen = sizeof(cc);
2380 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2381 		       (void *)&cc, (void *)&optlen) < 0)
2382 		cc = errno;
2383 	else
2384 		errno = cc;
2385 
2386 	if (errno != 0) {
2387 		/*
2388 		 * If the error is EAGAIN, just re-select on this
2389 		 * fd and pretend nothing strange happened.
2390 		 */
2391 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2392 			sock->connecting = 1;
2393 			select_poke(sock->manager, sock->fd,
2394 				    SELECT_POKE_CONNECT);
2395 			return;
2396 		}
2397 
2398 		/*
2399 		 * Translate other errors into ISC_R_* flavors.
2400 		 */
2401 		switch (errno) {
2402 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2403 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2404 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2405 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2406 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2407 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2408 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2409 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2410 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2411 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2412 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2413 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2414 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2415 #undef ERROR_MATCH
2416 		default:
2417 			dev->result = ISC_R_UNEXPECTED;
2418 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2419 					    sizeof(peerbuf));
2420 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2421 					 "internal_connect: connect(%s) %s",
2422 					 peerbuf, strerror(errno));
2423 		}
2424 	} else {
2425 		dev->result = ISC_R_SUCCESS;
2426 		sock->connected = 1;
2427 		sock->bound = 1;
2428 	}
2429 
2430 	sock->connect_ev = NULL;
2431 
2432 	task = dev->ev_sender;
2433 	dev->ev_sender = sock;
2434 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2435 }
2436 
2437 /*
2438  * Run through the list of events on this socket, and cancel the ones
2439  * queued for task "task" of type "how".  "how" is a bitmask.
2440  */
2441 void
isc_socket_cancel(isc_socket_t * sock0,isc_task_t * task,unsigned int how)2442 isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2443 	isc_socket_t *sock = (isc_socket_t *)sock0;
2444 
2445 	/*
2446 	 * Quick exit if there is nothing to do.  Don't even bother locking
2447 	 * in this case.
2448 	 */
2449 	if (how == 0)
2450 		return;
2451 
2452 	/*
2453 	 * All of these do the same thing, more or less.
2454 	 * Each will:
2455 	 *	o If the internal event is marked as "posted" try to
2456 	 *	  remove it from the task's queue.  If this fails, mark it
2457 	 *	  as canceled instead, and let the task clean it up later.
2458 	 *	o For each I/O request for that task of that type, post
2459 	 *	  its done event with status of "ISC_R_CANCELED".
2460 	 *	o Reset any state needed.
2461 	 */
2462 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2463 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2464 		isc_socketevent_t      *dev;
2465 		isc_socketevent_t      *next;
2466 		isc_task_t	       *current_task;
2467 
2468 		dev = ISC_LIST_HEAD(sock->recv_list);
2469 
2470 		while (dev != NULL) {
2471 			current_task = dev->ev_sender;
2472 			next = ISC_LIST_NEXT(dev, ev_link);
2473 
2474 			if ((task == NULL) || (task == current_task)) {
2475 				dev->result = ISC_R_CANCELED;
2476 				send_recvdone_event(sock, &dev);
2477 			}
2478 			dev = next;
2479 		}
2480 	}
2481 
2482 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2483 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2484 		isc_socketevent_t      *dev;
2485 		isc_socketevent_t      *next;
2486 		isc_task_t	       *current_task;
2487 
2488 		dev = ISC_LIST_HEAD(sock->send_list);
2489 
2490 		while (dev != NULL) {
2491 			current_task = dev->ev_sender;
2492 			next = ISC_LIST_NEXT(dev, ev_link);
2493 
2494 			if ((task == NULL) || (task == current_task)) {
2495 				dev->result = ISC_R_CANCELED;
2496 				send_senddone_event(sock, &dev);
2497 			}
2498 			dev = next;
2499 		}
2500 	}
2501 
2502 	/*
2503 	 * Connecting is not a list.
2504 	 */
2505 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2506 	    && sock->connect_ev != NULL) {
2507 		isc_socket_connev_t    *dev;
2508 		isc_task_t	       *current_task;
2509 
2510 		INSIST(sock->connecting);
2511 		sock->connecting = 0;
2512 
2513 		dev = sock->connect_ev;
2514 		current_task = dev->ev_sender;
2515 
2516 		if ((task == NULL) || (task == current_task)) {
2517 			sock->connect_ev = NULL;
2518 
2519 			dev->result = ISC_R_CANCELED;
2520 			dev->ev_sender = sock;
2521 			isc_task_sendanddetach(&current_task,
2522 					       ISC_EVENT_PTR(&dev));
2523 		}
2524 	}
2525 
2526 }
2527 
2528 /*
2529  * In our assumed scenario, we can simply use a single static object.
2530  * XXX: this is not true if the application uses multiple threads with
2531  *      'multi-context' mode.  Fixing this is a future TODO item.
2532  */
2533 static isc_socketwait_t swait_private;
2534 
2535 int
isc_socketmgr_waitevents(isc_socketmgr_t * manager0,struct timeval * tvp,isc_socketwait_t ** swaitp)2536 isc_socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2537 			  isc_socketwait_t **swaitp)
2538 {
2539 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2540 	int n;
2541 
2542 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2543 
2544 	if (manager == NULL)
2545 		manager = socketmgr;
2546 	if (manager == NULL)
2547 		return (0);
2548 
2549 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2550 	memmove(manager->write_fds_copy, manager->write_fds,
2551 		manager->fd_bufsize);
2552 
2553 	swait_private.readset = manager->read_fds_copy;
2554 	swait_private.writeset = manager->write_fds_copy;
2555 	swait_private.maxfd = manager->maxfd + 1;
2556 
2557 	n = select(swait_private.maxfd, swait_private.readset,
2558 		   swait_private.writeset, NULL, tvp);
2559 
2560 	*swaitp = &swait_private;
2561 	return (n);
2562 }
2563 
2564 isc_result_t
isc_socketmgr_dispatch(isc_socketmgr_t * manager0,isc_socketwait_t * swait)2565 isc_socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2566 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2567 
2568 	REQUIRE(swait == &swait_private);
2569 
2570 	if (manager == NULL)
2571 		manager = socketmgr;
2572 	if (manager == NULL)
2573 		return (ISC_R_NOTFOUND);
2574 
2575 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2576 	return (ISC_R_SUCCESS);
2577 }
2578