1 /*
2  * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 2000-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 /* $Id$ */
19 
20 /* This code uses functions which are only available on Server 2003 and
21  * higher, and Windows XP and higher.
22  *
23  * This code is by nature multithreaded and takes advantage of various
24  * features to pass on information through the completion port for
25  * when I/O is completed.  All sends, receives, accepts, and connects are
26  * completed through the completion port.
27  *
 * The number of Completion Port Worker threads used is the total number
 * of CPUs + 1. This increases the likelihood that a Worker Thread is
 * available for processing a completed request.
31  *
32  * XXXPDM 5 August, 2002
33  */
34 
35 #define MAKE_EXTERNAL 1
36 #include <config.h>
37 
38 #include <sys/types.h>
39 
40 #ifndef _WINSOCKAPI_
41 #define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
42 #endif
43 
44 #include <errno.h>
45 #include <stddef.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <unistd.h>
49 #include <io.h>
50 #include <fcntl.h>
51 #include <process.h>
52 
53 #include <isc/buffer.h>
54 #include <isc/bufferlist.h>
55 #include <isc/condition.h>
56 #include <isc/list.h>
57 #include <isc/log.h>
58 #include <isc/mem.h>
59 #include <isc/msgs.h>
60 #include <isc/mutex.h>
61 #include <isc/net.h>
62 #include <isc/once.h>
63 #include <isc/os.h>
64 #include <isc/platform.h>
65 #include <isc/print.h>
66 #include <isc/region.h>
67 #include <isc/socket.h>
68 #include <isc/stats.h>
69 #include <isc/strerror.h>
70 #include <isc/syslog.h>
71 #include <isc/task.h>
72 #include <isc/thread.h>
73 #include <isc/util.h>
74 #include <isc/win32os.h>
75 
76 #include <mswsock.h>
77 
78 #include "errno2result.h"
79 
80 /*
81  * How in the world can Microsoft exist with APIs like this?
82  * We can't actually call this directly, because it turns out
83  * no library exports this function.  Instead, we need to
84  * issue a runtime call to get the address.
85  */
/*
 * Winsock extension entry points.  initialise() fills these in at start-up
 * by querying a temporary TCP socket with
 * WSAIoctl(SIO_GET_EXTENSION_FUNCTION_POINTER).
 */
LPFN_CONNECTEX ISCConnectEx;				/* looked up via WSAID_CONNECTEX */
LPFN_ACCEPTEX ISCAcceptEx;				/* looked up via WSAID_ACCEPTEX */
LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;	/* looked up via WSAID_GETACCEPTEXSOCKADDRS */
89 
90 /*
91  * Run expensive internal consistency checks.
92  */
93 #ifdef ISC_SOCKET_CONSISTENCY_CHECKS
94 #define CONSISTENT(sock) consistent(sock)
95 #else
96 #define CONSISTENT(sock) do {} while (0)
97 #endif
98 static void consistent(isc_socket_t *sock);
99 
100 /*
101  * Define this macro to control the behavior of connection
102  * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
103  * for details.
104  * NOTE: This requires that Windows 2000 systems install Service Pack 2
105  * or later.
106  */
107 #ifndef SIO_UDP_CONNRESET
108 #define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
109 #endif
110 
111 /*
112  * Some systems define the socket length argument as an int, some as size_t,
113  * some as socklen_t.  This is here so it can be easily changed if needed.
114  */
115 #ifndef ISC_SOCKADDR_LEN_T
116 #define ISC_SOCKADDR_LEN_T unsigned int
117 #endif
118 
119 /*
120  * Define what the possible "soft" errors can be.  These are non-fatal returns
121  * of various network related functions, like recv() and so on.
122  */
123 #define SOFT_ERROR(e)	((e) == WSAEINTR || \
124 			 (e) == WSAEWOULDBLOCK || \
125 			 (e) == EWOULDBLOCK || \
126 			 (e) == EINTR || \
127 			 (e) == EAGAIN || \
128 			 (e) == 0)
129 
130 /*
131  * Pending errors are not really errors and should be
132  * kept separate
133  */
134 #define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)
135 
136 #define DOIO_SUCCESS	  0       /* i/o ok, event sent */
137 #define DOIO_SOFT	  1       /* i/o ok, soft error, no event sent */
138 #define DOIO_HARD	  2       /* i/o error, event sent */
139 #define DOIO_EOF	  3       /* EOF, no event sent */
140 #define DOIO_PENDING	  4       /* status when i/o is in process */
141 #define DOIO_NEEDMORE	  5       /* IO was processed, but we need more due to minimum */
142 
143 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
144 
145 /*
146  * DLVL(90)  --  Function entry/exit and other tracing.
147  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
148  * DLVL(60)  --  Socket data send/receive
149  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
150  * DLVL(20)  --  Socket creation/destruction.
151  */
152 #define TRACE_LEVEL		90
153 #define CORRECTNESS_LEVEL	70
154 #define IOEVENT_LEVEL		60
155 #define EVENT_LEVEL		50
156 #define CREATION_LEVEL		20
157 
158 #define TRACE		DLVL(TRACE_LEVEL)
159 #define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
160 #define IOEVENT		DLVL(IOEVENT_LEVEL)
161 #define EVENT		DLVL(EVENT_LEVEL)
162 #define CREATION	DLVL(CREATION_LEVEL)
163 
164 typedef isc_event_t intev_t;
165 
/*
 * Socket State
 *
 * Values recorded in isc_socket.state through _set_state().  The state
 * field is documented as being for debugging and consistency checking.
 */
enum {
  SOCK_INITIALIZED,	/* Socket Initialized */
  SOCK_OPEN,		/* Socket opened but nothing yet to do */
  SOCK_DATA,		/* Socket sending or receiving data */
  SOCK_LISTEN,		/* TCP Socket listening for connects */
  SOCK_ACCEPT,		/* TCP socket is waiting to accept */
  SOCK_CONNECT,		/* TCP Socket connecting */
  SOCK_CLOSED,		/* Socket has been closed */
};
178 
179 #define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
180 #define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
181 
182 /*
183  * IPv6 control information.  If the socket is an IPv6 socket we want
184  * to collect the destination address and interface so the client can
185  * set them on outgoing packets.
186  */
187 #ifdef ISC_PLATFORM_HAVEIPV6
188 #ifndef USE_CMSG
189 #define USE_CMSG	1
190 #endif
191 #endif
192 
/*
 * We really don't want to try to use these control messages: Win32
 * doesn't have this mechanism before XP.
 */
197 #undef USE_CMSG
198 
199 /*
200  * Message header for recvmsg and sendmsg calls.
201  * Used value-result for recvmsg, value only for sendmsg.
202  */
/*
 * Local stand-in for the BSD msghdr: carries the peer address and the
 * WSABUF scatter/gather array handed to WSASendTo().
 *
 * NOTE(review): the trailing "msghdr" declarator below also defines a
 * file-scope object of this type; confirm whether that is intentional.
 */
struct msghdr {
	SOCKADDR_STORAGE to_addr;	/* UDP send/recv address */
	int      to_addr_len;		/* length of the address */
	WSABUF  *msg_iov;		/* scatter/gather array */
	u_int   msg_iovlen;             /* # elements in msg_iov */
	void	*msg_control;           /* ancillary data, see below */
	u_int   msg_controllen;         /* ancillary data buffer len */
	int	msg_totallen;		/* total length of this message */
} msghdr;
212 
213 /*
214  * The size to raise the receive buffer to.
215  */
216 #define RCVBUFSIZE (32*1024)
217 
218 /*
219  * The number of times a send operation is repeated if the result
220  * is WSAEINTR.
221  */
222 #define NRETRIES 10
223 
/*
 * Per-socket state.  The first group of fields is not protected by the
 * socket lock; everything after the "Locked by socket lock" marker is.
 */
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;		/* checked by VALID_SOCKET() */
	isc_socketmgr_t	       *manager;	/* owning socket manager */
	isc_mutex_t		lock;
	isc_sockettype_t	type;

	/* Pointers to scatter/gather buffers */
	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references; /* EXTERNAL references */
	SOCKET			fd;	/* file handle */
	int			pf;	/* protocol family */
	char			name[16];
	void *			tag;

	/*
	 * Each recv() call uses this buffer.  It is a per-socket receive
	 * buffer that allows us to decouple the system recv() from the
	 * recv_list done events.  This means the items on the recv_list
	 * can be removed without having to cancel pending system recv()
	 * calls.  It also allows us to read-ahead in some cases.
	 */
	struct {
		SOCKADDR_STORAGE	from_addr;	   // UDP send/recv address
		int		from_addr_len;	   // length of the address
		char		*base;		   // the base of the buffer
		char		*consume_position; // where to start copying data from next
		unsigned int	len;		   // the actual size of this buffer
		unsigned int	remaining;	   // the number of bytes remaining
	} recvbuf;

	/* Queues of requests that have not yet been completed. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	isc_sockaddr_t		address;  /* remote address */

	unsigned int		listener : 1,	/* listener socket */
				connected : 1,
				pending_connect : 1, /* connect pending */
				bound : 1,	/* bound to local addr */
				dupped : 1;     /* created by isc_socket_dup() */
	unsigned int		pending_iocp;	/* Should equal the counters below. Debug. */
	unsigned int		pending_recv;  /* Number of outstanding recv() calls. */
	unsigned int		pending_send;  /* Number of outstanding send() calls. */
	unsigned int		pending_accept; /* Number of outstanding accept() calls. */
	unsigned int		state; /* Socket state. Debugging and consistency checking. */
	int			state_lineno;  /* line which last touched state */
};
277 
/*
 * Record a new socket state together with the source line of the call
 * site (__LINE__ expands where the macro is used), for debugging.
 */
#define _set_state(sock, _state) do { (sock)->state = (_state); (sock)->state_lineno = __LINE__; } while (0)
279 
/*
 * Buffer structure
 *
 * One heap-allocated copy of outgoing data.  build_msghdr_send() allocates
 * these from hHeapHandle, copies the caller's bytes into buf, and links
 * them onto IoCompletionInfo.bufferlist.
 */
typedef struct buflist buflist_t;

struct buflist {
	void			*buf;		/* copied data */
	unsigned int		buflen;		/* number of bytes in buf */
	ISC_LINK(buflist_t)	link;		/* linkage on the bufferlist */
};
290 
/*
 * I/O Completion ports Info structures
 */

/*
 * Private heap created in iocompletionport_init(); IoCompletionInfo
 * structures and send buffer copies are allocated from it.
 */
static HANDLE hHeapHandle = NULL;

/*
 * Per-request context for one overlapped operation.  A pointer to this
 * structure is cast to LPWSAOVERLAPPED when the I/O is issued, so
 * "overlapped" has to stay the first member.
 */
typedef struct IoCompletionInfo {
	OVERLAPPED		overlapped;
	isc_socketevent_t	*dev;  /* send()/recv() done event */
	isc_socket_connev_t	*cdev; /* connect() done event */
	isc_socket_newconnev_t	*adev; /* accept() done event */
	void			*acceptbuffer;
	DWORD			received_bytes;
	int			request_type;	/* SOCKET_RECV, SOCKET_SEND, ... */
	struct msghdr		messagehdr;
	ISC_LIST(buflist_t)	bufferlist;	/*%< list of buffers */
} IoCompletionInfo;
307 
/*
 * Define a maximum number of I/O Completion Port worker threads
 * to handle the load on the Completion Port. The actual number
 * used is the number of CPUs + 1, capped at this maximum.
 */
313 #define MAX_IOCPTHREADS 20
314 
315 #define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
316 #define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
317 
/*
 * Socket manager: owns the I/O completion port, its worker threads and
 * the list of sockets created through it.
 */
struct isc_socketmgr {
	/* Not locked. */
	unsigned int			magic;	/* checked by VALID_MANAGER() */
	isc_mem_t		       *mctx;
	isc_mutex_t			lock;
	isc_stats_t		       *stats;

	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t)		socklist;
	isc_boolean_t			bShutdown;
	isc_condition_t			shutdown_ok;
	HANDLE				hIoCompletionPort;
	int				maxIOCPThreads;	/* set in iocompletionport_init() */
	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];

	/*
	 * Debugging.
	 * Modified by InterlockedIncrement() and InterlockedDecrement()
	 */
	LONG				totalSockets;
	LONG				iocp_total;
};
341 
/*
 * Kind of overlapped operation; stored in IoCompletionInfo.request_type
 * when the request is issued.
 */
enum {
	SOCKET_RECV,
	SOCKET_SEND,
	SOCKET_ACCEPT,
	SOCKET_CONNECT
};
348 
349 /*
350  * send() and recv() iovec counts
351  */
352 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
353 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
354 
/*
 * Forward declarations of file-local helpers that are referenced before
 * their definitions.
 */
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
/* Entry point handed to CreateThread() for each completion-port worker. */
static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
static void maybe_free_socket(isc_socket_t **, int);
static void free_socket(isc_socket_t **, int);
static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
static void queue_receive_request(isc_socket_t *sock);
372 
373 /*
374  * This is used to dump the contents of the sock structure
375  * You should make sure that the sock is locked before
376  * dumping it. Since the code uses simple printf() statements
377  * it should only be used interactively.
378  */
379 void
sock_dump(isc_socket_t * sock)380 sock_dump(isc_socket_t *sock) {
381 	isc_socketevent_t *ldev;
382 	isc_socket_newconnev_t *ndev;
383 
384 #if 0
385 	isc_sockaddr_t addr;
386 	char socktext[256];
387 
388 	isc_socket_getpeername(sock, &addr);
389 	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
390 	printf("Remote Socket: %s\n", socktext);
391 	isc_socket_getsockname(sock, &addr);
392 	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
393 	printf("This Socket: %s\n", socktext);
394 #endif
395 
396 	printf("\n\t\tSock Dump\n");
397 	printf("\t\tfd: %u\n", sock->fd);
398 	printf("\t\treferences: %d\n", sock->references);
399 	printf("\t\tpending_accept: %d\n", sock->pending_accept);
400 	printf("\t\tconnecting: %d\n", sock->pending_connect);
401 	printf("\t\tconnected: %d\n", sock->connected);
402 	printf("\t\tbound: %d\n", sock->bound);
403 	printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
404 	printf("\t\tsocket type: %d\n", sock->type);
405 
406 	printf("\n\t\tSock Recv List\n");
407 	ldev = ISC_LIST_HEAD(sock->recv_list);
408 	while (ldev != NULL) {
409 		printf("\t\tdev: %p\n", ldev);
410 		ldev = ISC_LIST_NEXT(ldev, ev_link);
411 	}
412 
413 	printf("\n\t\tSock Send List\n");
414 	ldev = ISC_LIST_HEAD(sock->send_list);
415 	while (ldev != NULL) {
416 		printf("\t\tdev: %p\n", ldev);
417 		ldev = ISC_LIST_NEXT(ldev, ev_link);
418 	}
419 
420 	printf("\n\t\tSock Accept List\n");
421 	ndev = ISC_LIST_HEAD(sock->accept_list);
422 	while (ndev != NULL) {
423 		printf("\t\tdev: %p\n", ldev);
424 		ndev = ISC_LIST_NEXT(ndev, ev_link);
425 	}
426 }
427 
428 static void
429 socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
430 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
431 	   isc_msgcat_t *msgcat, int msgset, int message,
432 	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
433 
434 /*  This function will add an entry to the I/O completion port
435  *  that will signal the I/O thread to exit (gracefully)
436  */
437 static void
signal_iocompletionport_exit(isc_socketmgr_t * manager)438 signal_iocompletionport_exit(isc_socketmgr_t *manager) {
439 	int i;
440 	int errval;
441 	char strbuf[ISC_STRERRORSIZE];
442 
443 	REQUIRE(VALID_MANAGER(manager));
444 	for (i = 0; i < manager->maxIOCPThreads; i++) {
445 		if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
446 						0, 0, 0)) {
447 			errval = GetLastError();
448 			isc__strerror(errval, strbuf, sizeof(strbuf));
449 			FATAL_ERROR(__FILE__, __LINE__,
450 				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
451 				ISC_MSG_FAILED,
452 				"Can't request service thread to exit: %s"),
453 				strbuf);
454 		}
455 	}
456 }
457 
458 /*
459  * Create the worker threads for the I/O Completion Port
460  */
461 void
iocompletionport_createthreads(int total_threads,isc_socketmgr_t * manager)462 iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
463 	int errval;
464 	char strbuf[ISC_STRERRORSIZE];
465 	int i;
466 
467 	INSIST(total_threads > 0);
468 	REQUIRE(VALID_MANAGER(manager));
469 	/*
470 	 * We need at least one
471 	 */
472 	for (i = 0; i < total_threads; i++) {
473 		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
474 						manager, 0,
475 						&manager->dwIOCPThreadIds[i]);
476 		if (manager->hIOCPThreads[i] == NULL) {
477 			errval = GetLastError();
478 			isc__strerror(errval, strbuf, sizeof(strbuf));
479 			FATAL_ERROR(__FILE__, __LINE__,
480 				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
481 				ISC_MSG_FAILED,
482 				"Can't create IOCP thread: %s"),
483 				strbuf);
484 			exit(1);
485 		}
486 	}
487 }
488 
489 /*
490  *  Create/initialise the I/O completion port
491  */
492 void
iocompletionport_init(isc_socketmgr_t * manager)493 iocompletionport_init(isc_socketmgr_t *manager) {
494 	int errval;
495 	char strbuf[ISC_STRERRORSIZE];
496 
497 	REQUIRE(VALID_MANAGER(manager));
498 	/*
499 	 * Create a private heap to handle the socket overlapped structure
500 	 * The minimum number of structures is 10, there is no maximum
501 	 */
502 	hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
503 	if (hHeapHandle == NULL) {
504 		errval = GetLastError();
505 		isc__strerror(errval, strbuf, sizeof(strbuf));
506 		FATAL_ERROR(__FILE__, __LINE__,
507 			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
508 					   ISC_MSG_FAILED,
509 					   "HeapCreate() failed during "
510 					   "initialization: %s"),
511 			    strbuf);
512 		exit(1);
513 	}
514 
515 	manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
516 
517 	/* Now Create the Completion Port */
518 	manager->hIoCompletionPort = CreateIoCompletionPort(
519 			INVALID_HANDLE_VALUE, NULL,
520 			0, manager->maxIOCPThreads);
521 	if (manager->hIoCompletionPort == NULL) {
522 		errval = GetLastError();
523 		isc__strerror(errval, strbuf, sizeof(strbuf));
524 		FATAL_ERROR(__FILE__, __LINE__,
525 				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
526 				ISC_MSG_FAILED,
527 				"CreateIoCompletionPort() failed "
528 				"during initialization: %s"),
529 				strbuf);
530 		exit(1);
531 	}
532 
533 	/*
534 	 * Worker threads for servicing the I/O
535 	 */
536 	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
537 }
538 
539 /*
540  * Associate a socket with an IO Completion Port.  This allows us to queue events for it
541  * and have our worker pool of threads process them.
542  */
543 void
iocompletionport_update(isc_socket_t * sock)544 iocompletionport_update(isc_socket_t *sock) {
545 	HANDLE hiocp;
546 	char strbuf[ISC_STRERRORSIZE];
547 
548 	REQUIRE(VALID_SOCKET(sock));
549 
550 	hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
551 		sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
552 
553 	if (hiocp == NULL) {
554 		DWORD errval = GetLastError();
555 		isc__strerror(errval, strbuf, sizeof(strbuf));
556 		isc_log_iwrite(isc_lctx,
557 				ISC_LOGCATEGORY_GENERAL,
558 				ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
559 				isc_msgcat, ISC_MSGSET_SOCKET,
560 				ISC_MSG_TOOMANYHANDLES,
561 				"iocompletionport_update: failed to open"
562 				" io completion port: %s",
563 				strbuf);
564 
565 		/* XXXMLG temporary hack to make failures detected.
566 		 * This function should return errors to the caller, not
567 		 * exit here.
568 		 */
569 		FATAL_ERROR(__FILE__, __LINE__,
570 				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
571 				ISC_MSG_FAILED,
572 				"CreateIoCompletionPort() failed "
573 				"during initialization: %s"),
574 				strbuf);
575 		exit(1);
576 	}
577 
578 	InterlockedIncrement(&sock->manager->iocp_total);
579 }
580 
581 /*
582  * Routine to cleanup and then close the socket.
583  * Only close the socket here if it is NOT associated
584  * with an event, otherwise the WSAWaitForMultipleEvents
585  * may fail due to the fact that the Wait should not
586  * be running while closing an event or a socket.
587  * The socket is locked before calling this function
588  */
589 void
socket_close(isc_socket_t * sock)590 socket_close(isc_socket_t *sock) {
591 
592 	REQUIRE(sock != NULL);
593 
594 	if (sock->fd != INVALID_SOCKET) {
595 		closesocket(sock->fd);
596 		sock->fd = INVALID_SOCKET;
597 		_set_state(sock, SOCK_CLOSED);
598 		InterlockedDecrement(&sock->manager->totalSockets);
599 	}
600 }
601 
602 static isc_once_t initialise_once = ISC_ONCE_INIT;
603 static isc_boolean_t initialised = ISC_FALSE;
604 
605 static void
initialise(void)606 initialise(void) {
607 	WORD wVersionRequested;
608 	WSADATA wsaData;
609 	int err;
610 	SOCKET sock;
611 	GUID GUIDConnectEx = WSAID_CONNECTEX;
612 	GUID GUIDAcceptEx = WSAID_ACCEPTEX;
613 	GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
614 	DWORD dwBytes;
615 
616 	/* Need Winsock 2.2 or better */
617 	wVersionRequested = MAKEWORD(2, 2);
618 
619 	err = WSAStartup(wVersionRequested, &wsaData);
620 	if (err != 0) {
621 		char strbuf[ISC_STRERRORSIZE];
622 		isc__strerror(err, strbuf, sizeof(strbuf));
623 		FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
624 			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
625 					   ISC_MSG_FAILED, "failed"),
626 			    strbuf);
627 		exit(1);
628 	}
629 	/*
630 	 * The following APIs do not exist as functions in a library, but we must
631 	 * ask winsock for them.  They are "extensions" -- but why they cannot be
632 	 * actual functions is beyond me.  So, ask winsock for the pointers to the
633 	 * functions we need.
634 	 */
635 	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
636 	INSIST(sock != INVALID_SOCKET);
637 	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
638 		 &GUIDConnectEx, sizeof(GUIDConnectEx),
639 		 &ISCConnectEx, sizeof(ISCConnectEx),
640 		 &dwBytes, NULL, NULL);
641 	INSIST(err == 0);
642 
643 	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
644 		 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
645 		 &ISCAcceptEx, sizeof(ISCAcceptEx),
646 		 &dwBytes, NULL, NULL);
647 	INSIST(err == 0);
648 
649 	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
650 		 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
651 		 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
652 		 &dwBytes, NULL, NULL);
653 	INSIST(err == 0);
654 
655 	closesocket(sock);
656 
657 	initialised = ISC_TRUE;
658 }
659 
660 /*
661  * Initialize socket services
662  */
663 void
InitSockets(void)664 InitSockets(void) {
665 	RUNTIME_CHECK(isc_once_do(&initialise_once,
666 				  initialise) == ISC_R_SUCCESS);
667 	if (!initialised)
668 		exit(1);
669 }
670 
671 int
internal_sendmsg(isc_socket_t * sock,IoCompletionInfo * lpo,struct msghdr * messagehdr,int flags,int * Error)672 internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
673 		 struct msghdr *messagehdr, int flags, int *Error)
674 {
675 	int Result;
676 	DWORD BytesSent;
677 	DWORD Flags = flags;
678 	int total_sent;
679 
680 	*Error = 0;
681 	Result = WSASendTo(sock->fd, messagehdr->msg_iov,
682 			   messagehdr->msg_iovlen, &BytesSent,
683 			   Flags, (SOCKADDR *)&messagehdr->to_addr,
684 			   messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
685 			   NULL);
686 
687 	total_sent = (int)BytesSent;
688 
689 	/* Check for errors.*/
690 	if (Result == SOCKET_ERROR) {
691 		*Error = WSAGetLastError();
692 
693 		switch (*Error) {
694 		case WSA_IO_INCOMPLETE:
695 		case WSA_WAIT_IO_COMPLETION:
696 		case WSA_IO_PENDING:
697 		case NO_ERROR:		/* Strange, but okay */
698 			sock->pending_iocp++;
699 			sock->pending_send++;
700 			break;
701 
702 		default:
703 			return (-1);
704 			break;
705 		}
706 	} else {
707 		sock->pending_iocp++;
708 		sock->pending_send++;
709 	}
710 
711 	if (lpo != NULL)
712 		return (0);
713 	else
714 		return (total_sent);
715 }
716 
/*
 * Post an overlapped WSARecvFrom() into the socket's private receive
 * buffer, unless a receive is already outstanding or nobody is waiting on
 * recv_list.
 *
 * When the call succeeds immediately or is left pending, pending_iocp and
 * pending_recv are incremented.  Reset/unreachable errors on a socket that
 * is not connected are treated as soft and the request is reissued; any
 * other error is reported through send_recvdone_abort() and the
 * IoCompletionInfo is freed here.
 *
 * NOTE(review): presumably called with the socket lock held, since it
 * updates fields documented as locked by the socket lock -- confirm
 * against callers.
 */
static void
queue_receive_request(isc_socket_t *sock) {
	DWORD Flags = 0;
	DWORD NumBytes = 0;
	int total_bytes = 0;
	int Result;
	int Error;
	int need_retry;
	WSABUF iov[1];
	IoCompletionInfo *lpo = NULL;
	isc_result_t isc_result;

 retry:
	need_retry = ISC_FALSE;

	/*
	 * If we already have a receive pending, do nothing.
	 * (lpo is only non-NULL here when we arrived via the retry path.)
	 */
	if (sock->pending_recv > 0) {
		if (lpo != NULL)
			HeapFree(hHeapHandle, 0, lpo);
		return;
	}

	/*
	 * If no one is waiting, do nothing.
	 */
	if (ISC_LIST_EMPTY(sock->recv_list)) {
		if (lpo != NULL)
			HeapFree(hHeapHandle, 0, lpo);
		return;
	}

	/* The receive buffer must be fully consumed before it is refilled. */
	INSIST(sock->recvbuf.remaining == 0);
	INSIST(sock->fd != INVALID_SOCKET);

	iov[0].len = sock->recvbuf.len;
	iov[0].buf = sock->recvbuf.base;

	/* Allocate the request context once; a retry reuses and re-zeroes it. */
	if (lpo == NULL) {
		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
						    HEAP_ZERO_MEMORY,
						    sizeof(IoCompletionInfo));
		RUNTIME_CHECK(lpo != NULL);
	} else
		ZeroMemory(lpo, sizeof(IoCompletionInfo));
	lpo->request_type = SOCKET_RECV;

	sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);

	Error = 0;
	Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
			     &NumBytes, &Flags,
			     (SOCKADDR *)&sock->recvbuf.from_addr,
			     &sock->recvbuf.from_addr_len,
			     (LPWSAOVERLAPPED)lpo, NULL);

	/* Check for errors. */
	if (Result == SOCKET_ERROR) {
		Error = WSAGetLastError();

		switch (Error) {
		case WSA_IO_PENDING:
			/* Queued successfully; completion arrives later. */
			sock->pending_iocp++;
			sock->pending_recv++;
			break;

		/* direct error: no completion event */
		case ERROR_HOST_UNREACHABLE:
		case WSAENETRESET:
		case WSAECONNRESET:
			if (!sock->connected) {
				/* soft error */
				need_retry = ISC_TRUE;
				break;
			}
			/* FALLTHROUGH */

		default:
			isc_result = isc__errno2result(Error);
			if (isc_result == ISC_R_UNEXPECTED)
				UNEXPECTED_ERROR(__FILE__, __LINE__,
					"WSARecvFrom: Windows error code: %d, isc result %d",
					Error, isc_result);
			send_recvdone_abort(sock, isc_result);
			/* Nothing was queued, so the context is released here. */
			HeapFree(hHeapHandle, 0, lpo);
			lpo = NULL;
			break;
		}
	} else {
		/*
		 * The recv() finished immediately, but we will still get
		 * a completion event.  Rather than duplicate code, let
		 * that thread handle sending the data along its way.
		 */
		sock->pending_iocp++;
		sock->pending_recv++;
	}

	socket_log(__LINE__, sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DOIORECV,
		   "queue_io_request: fd %d result %d error %d",
		   sock->fd, Result, Error);

	CONSISTENT(sock);

	if (need_retry)
		goto retry;
}
827 
828 static void
manager_log(isc_socketmgr_t * sockmgr,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)829 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
830 	    isc_logmodule_t *module, int level, const char *fmt, ...)
831 {
832 	char msgbuf[2048];
833 	va_list ap;
834 
835 	if (!isc_log_wouldlog(isc_lctx, level))
836 		return;
837 
838 	va_start(ap, fmt);
839 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
840 	va_end(ap);
841 
842 	isc_log_write(isc_lctx, category, module, level,
843 		      "sockmgr %p: %s", sockmgr, msgbuf);
844 }
845 
846 static void
socket_log(int lineno,isc_socket_t * sock,isc_sockaddr_t * address,isc_logcategory_t * category,isc_logmodule_t * module,int level,isc_msgcat_t * msgcat,int msgset,int message,const char * fmt,...)847 socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
848 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
849 	   isc_msgcat_t *msgcat, int msgset, int message,
850 	   const char *fmt, ...)
851 {
852 	char msgbuf[2048];
853 	char peerbuf[256];
854 	va_list ap;
855 
856 
857 	if (!isc_log_wouldlog(isc_lctx, level))
858 		return;
859 
860 	va_start(ap, fmt);
861 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
862 	va_end(ap);
863 
864 	if (address == NULL) {
865 		isc_log_iwrite(isc_lctx, category, module, level,
866 			       msgcat, msgset, message,
867 			       "socket %p line %d: %s", sock, lineno, msgbuf);
868 	} else {
869 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
870 		isc_log_iwrite(isc_lctx, category, module, level,
871 			       msgcat, msgset, message,
872 				   "socket %p line %d peer %s: %s", sock, lineno,
873 				   peerbuf, msgbuf);
874 	}
875 
876 }
877 
878 /*
879  * Make an fd SOCKET non-blocking.
880  */
881 static isc_result_t
make_nonblock(SOCKET fd)882 make_nonblock(SOCKET fd) {
883 	int ret;
884 	unsigned long flags = 1;
885 	char strbuf[ISC_STRERRORSIZE];
886 
887 	/* Set the socket to non-blocking */
888 	ret = ioctlsocket(fd, FIONBIO, &flags);
889 
890 	if (ret == -1) {
891 		isc__strerror(errno, strbuf, sizeof(strbuf));
892 		UNEXPECTED_ERROR(__FILE__, __LINE__,
893 				 "ioctlsocket(%d, FIOBIO, %d): %s",
894 				 fd, flags, strbuf);
895 
896 		return (ISC_R_UNEXPECTED);
897 	}
898 
899 	return (ISC_R_SUCCESS);
900 }
901 
902 /*
903  * Windows 2000 systems incorrectly cause UDP sockets using WSARecvFrom
904  * to not work correctly, returning a WSACONNRESET error when a WSASendTo
905  * fails with an "ICMP port unreachable" response and preventing the
906  * socket from using the WSARecvFrom in subsequent operations.
907  * The function below fixes this, but requires that Windows 2000
908  * Service Pack 2 or later be installed on the system.  NT 4.0
909  * systems are not affected by this and work correctly.
910  * See Microsoft Knowledge Base Article Q263823 for details of this.
911  */
912 isc_result_t
connection_reset_fix(SOCKET fd)913 connection_reset_fix(SOCKET fd) {
914 	DWORD dwBytesReturned = 0;
915 	BOOL  bNewBehavior = FALSE;
916 	DWORD status;
917 
918 	if (isc_win32os_majorversion() < 5)
919 		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */
920 
921 	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
922 	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
923 			  sizeof(bNewBehavior), NULL, 0,
924 			  &dwBytesReturned, NULL, NULL);
925 	if (status != SOCKET_ERROR)
926 		return (ISC_R_SUCCESS);
927 	else {
928 		UNEXPECTED_ERROR(__FILE__, __LINE__,
929 				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
930 				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
931 						ISC_MSG_FAILED, "failed"));
932 		return (ISC_R_UNEXPECTED);
933 	}
934 }
935 
936 /*
937  * Construct an iov array and attach it to the msghdr passed in.  This is
938  * the SEND constructor, which will use the used region of the buffer
939  * (if using a buffer list) or will use the internal region (if a single
940  * buffer I/O is requested).
941  *
942  * Nothing can be NULL, and the done event must list at least one buffer
943  * on the buffer linked list for this function to be meaningful.
944  */
945 static void
build_msghdr_send(isc_socket_t * sock,isc_socketevent_t * dev,struct msghdr * msg,char * cmsg,WSABUF * iov,IoCompletionInfo * lpo)946 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
947 		  struct msghdr *msg, char *cmsg, WSABUF *iov,
948 		  IoCompletionInfo  *lpo)
949 {
950 	unsigned int iovcount;
951 	isc_buffer_t *buffer;
952 	buflist_t  *cpbuffer;
953 	isc_region_t used;
954 	size_t write_count;
955 	size_t skip_count;
956 
957 	memset(msg, 0, sizeof(*msg));
958 
959 	memcpy(&msg->to_addr, &dev->address.type, dev->address.length);
960 	msg->to_addr_len = dev->address.length;
961 
962 	buffer = ISC_LIST_HEAD(dev->bufferlist);
963 	write_count = 0;
964 	iovcount = 0;
965 
966 	/*
967 	 * Single buffer I/O?  Skip what we've done so far in this region.
968 	 */
969 	if (buffer == NULL) {
970 		write_count = dev->region.length - dev->n;
971 		cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
972 		RUNTIME_CHECK(cpbuffer != NULL);
973 		cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
974 		RUNTIME_CHECK(cpbuffer->buf != NULL);
975 
976 		socket_log(__LINE__, sock, NULL, TRACE,
977 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
978 		   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
979 		   cpbuffer->buf, write_count);
980 
981 		memcpy(cpbuffer->buf,(dev->region.base + dev->n), write_count);
982 		cpbuffer->buflen = write_count;
983 		ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
984 		iov[0].buf = cpbuffer->buf;
985 		iov[0].len = write_count;
986 		iovcount = 1;
987 
988 		goto config;
989 	}
990 
991 	/*
992 	 * Multibuffer I/O.
993 	 * Skip the data in the buffer list that we have already written.
994 	 */
995 	skip_count = dev->n;
996 	while (buffer != NULL) {
997 		REQUIRE(ISC_BUFFER_VALID(buffer));
998 		if (skip_count < isc_buffer_usedlength(buffer))
999 			break;
1000 		skip_count -= isc_buffer_usedlength(buffer);
1001 		buffer = ISC_LIST_NEXT(buffer, link);
1002 	}
1003 
1004 	while (buffer != NULL) {
1005 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1006 
1007 		isc_buffer_usedregion(buffer, &used);
1008 
1009 		if (used.length > 0) {
1010 			int uselen = used.length - skip_count;
1011 			cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
1012 			RUNTIME_CHECK(cpbuffer != NULL);
1013 			cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
1014 			RUNTIME_CHECK(cpbuffer->buf != NULL);
1015 
1016 			socket_log(__LINE__, sock, NULL, TRACE,
1017 			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1018 			   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
1019 			   cpbuffer->buf, write_count);
1020 
1021 			memcpy(cpbuffer->buf,(used.base + skip_count), uselen);
1022 			cpbuffer->buflen = uselen;
1023 			iov[iovcount].buf = cpbuffer->buf;
1024 			iov[iovcount].len = used.length - skip_count;
1025 			write_count += uselen;
1026 			skip_count = 0;
1027 			iovcount++;
1028 		}
1029 		buffer = ISC_LIST_NEXT(buffer, link);
1030 	}
1031 
1032 	INSIST(skip_count == 0);
1033 
1034  config:
1035 	msg->msg_iov = iov;
1036 	msg->msg_iovlen = iovcount;
1037 	msg->msg_totallen = write_count;
1038 }
1039 
1040 static void
set_dev_address(isc_sockaddr_t * address,isc_socket_t * sock,isc_socketevent_t * dev)1041 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1042 		isc_socketevent_t *dev)
1043 {
1044 	if (sock->type == isc_sockettype_udp) {
1045 		if (address != NULL)
1046 			dev->address = *address;
1047 		else
1048 			dev->address = sock->address;
1049 	} else if (sock->type == isc_sockettype_tcp) {
1050 		INSIST(address == NULL);
1051 		dev->address = sock->address;
1052 	}
1053 }
1054 
1055 static void
destroy_socketevent(isc_event_t * event)1056 destroy_socketevent(isc_event_t *event) {
1057 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1058 
1059 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1060 
1061 	(ev->destroy)(event);
1062 }
1063 
1064 static isc_socketevent_t *
allocate_socketevent(isc_socket_t * sock,isc_eventtype_t eventtype,isc_taskaction_t action,const void * arg)1065 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1066 		     isc_taskaction_t action, const void *arg)
1067 {
1068 	isc_socketevent_t *ev;
1069 
1070 	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1071 						     sock, eventtype,
1072 						     action, arg,
1073 						     sizeof(*ev));
1074 	if (ev == NULL)
1075 		return (NULL);
1076 
1077 	ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1078 	ISC_LINK_INIT(ev, ev_link);
1079 	ISC_LIST_INIT(ev->bufferlist);
1080 	ev->region.base = NULL;
1081 	ev->n = 0;
1082 	ev->offset = 0;
1083 	ev->attributes = 0;
1084 	ev->destroy = ev->ev_destroy;
1085 	ev->ev_destroy = destroy_socketevent;
1086 
1087 	return (ev);
1088 }
1089 
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg', including every entry of
 * its iov array, together with the descriptor of 'sock', on stdout.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
	unsigned int idx = 0;
	unsigned int total = (unsigned int)msg->msg_iovlen;

	printf("MSGHDR %p, Socket #: %u\n", msg, sock->fd);
	printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen);
	printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen);

	while (idx < total) {
		printf("\t\t%d\tbase %p, len %d\n", idx,
		       msg->msg_iov[idx].buf,
		       msg->msg_iov[idx].len);
		idx++;
	}
}
#endif
1104 
1105 /*
1106  * map the error code
1107  */
1108 int
map_socket_error(isc_socket_t * sock,int windows_errno,int * isc_errno,char * errorstring,size_t bufsize)1109 map_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1110 		 char *errorstring, size_t bufsize) {
1111 
1112 	int doreturn;
1113 	switch (windows_errno) {
1114 	case WSAECONNREFUSED:
1115 		*isc_errno = ISC_R_CONNREFUSED;
1116 		if (sock->connected)
1117 			doreturn = DOIO_HARD;
1118 		else
1119 			doreturn = DOIO_SOFT;
1120 		break;
1121 	case WSAENETUNREACH:
1122 	case ERROR_NETWORK_UNREACHABLE:
1123 		*isc_errno = ISC_R_NETUNREACH;
1124 		if (sock->connected)
1125 			doreturn = DOIO_HARD;
1126 		else
1127 			doreturn = DOIO_SOFT;
1128 		break;
1129 	case ERROR_PORT_UNREACHABLE:
1130 	case ERROR_HOST_UNREACHABLE:
1131 	case WSAEHOSTUNREACH:
1132 		*isc_errno = ISC_R_HOSTUNREACH;
1133 		if (sock->connected)
1134 			doreturn = DOIO_HARD;
1135 		else
1136 			doreturn = DOIO_SOFT;
1137 		break;
1138 	case WSAENETDOWN:
1139 		*isc_errno = ISC_R_NETDOWN;
1140 		if (sock->connected)
1141 			doreturn = DOIO_HARD;
1142 		else
1143 			doreturn = DOIO_SOFT;
1144 		break;
1145 	case WSAEHOSTDOWN:
1146 		*isc_errno = ISC_R_HOSTDOWN;
1147 		if (sock->connected)
1148 			doreturn = DOIO_HARD;
1149 		else
1150 			doreturn = DOIO_SOFT;
1151 		break;
1152 	case WSAEACCES:
1153 		*isc_errno = ISC_R_NOPERM;
1154 		if (sock->connected)
1155 			doreturn = DOIO_HARD;
1156 		else
1157 			doreturn = DOIO_SOFT;
1158 		break;
1159 	case WSAECONNRESET:
1160 	case WSAENETRESET:
1161 	case WSAECONNABORTED:
1162 	case WSAEDISCON:
1163 		*isc_errno = ISC_R_CONNECTIONRESET;
1164 		if (sock->connected)
1165 			doreturn = DOIO_HARD;
1166 		else
1167 			doreturn = DOIO_SOFT;
1168 		break;
1169 	case WSAENOTCONN:
1170 		*isc_errno = ISC_R_NOTCONNECTED;
1171 		if (sock->connected)
1172 			doreturn = DOIO_HARD;
1173 		else
1174 			doreturn = DOIO_SOFT;
1175 		break;
1176 	case ERROR_OPERATION_ABORTED:
1177 	case ERROR_CONNECTION_ABORTED:
1178 	case ERROR_REQUEST_ABORTED:
1179 		*isc_errno = ISC_R_CONNECTIONRESET;
1180 		doreturn = DOIO_HARD;
1181 		break;
1182 	case WSAENOBUFS:
1183 		*isc_errno = ISC_R_NORESOURCES;
1184 		doreturn = DOIO_HARD;
1185 		break;
1186 	case WSAEAFNOSUPPORT:
1187 		*isc_errno = ISC_R_FAMILYNOSUPPORT;
1188 		doreturn = DOIO_HARD;
1189 		break;
1190 	case WSAEADDRNOTAVAIL:
1191 		*isc_errno = ISC_R_ADDRNOTAVAIL;
1192 		doreturn = DOIO_HARD;
1193 		break;
1194 	case WSAEDESTADDRREQ:
1195 		*isc_errno = ISC_R_BADADDRESSFORM;
1196 		doreturn = DOIO_HARD;
1197 		break;
1198 	case ERROR_NETNAME_DELETED:
1199 		*isc_errno = ISC_R_NETDOWN;
1200 		doreturn = DOIO_HARD;
1201 		break;
1202 	default:
1203 		*isc_errno = ISC_R_IOERROR;
1204 		doreturn = DOIO_HARD;
1205 		break;
1206 	}
1207 	if (doreturn == DOIO_HARD) {
1208 		isc__strerror(windows_errno, errorstring, bufsize);
1209 	}
1210 	return (doreturn);
1211 }
1212 
1213 static void
fill_recv(isc_socket_t * sock,isc_socketevent_t * dev)1214 fill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1215 	isc_region_t r;
1216 	int copylen;
1217 	isc_buffer_t *buffer;
1218 
1219 	INSIST(dev->n < dev->minimum);
1220 	INSIST(sock->recvbuf.remaining > 0);
1221 	INSIST(sock->pending_recv == 0);
1222 
1223 	if (sock->type == isc_sockettype_udp) {
1224 		dev->address.length = sock->recvbuf.from_addr_len;
1225 		memcpy(&dev->address.type, &sock->recvbuf.from_addr,
1226 		    sock->recvbuf.from_addr_len);
1227 		if (isc_sockaddr_getport(&dev->address) == 0) {
1228 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1229 				socket_log(__LINE__, sock, &dev->address, IOEVENT,
1230 					   isc_msgcat, ISC_MSGSET_SOCKET,
1231 					   ISC_MSG_ZEROPORT,
1232 					   "dropping source port zero packet");
1233 			}
1234 			sock->recvbuf.remaining = 0;
1235 			return;
1236 		}
1237 	} else if (sock->type == isc_sockettype_tcp) {
1238 		dev->address = sock->address;
1239 	}
1240 
1241 	/*
1242 	 * Run through the list of buffers we were given, and find the
1243 	 * first one with space.  Once it is found, loop through, filling
1244 	 * the buffers as much as possible.
1245 	 */
1246 	buffer = ISC_LIST_HEAD(dev->bufferlist);
1247 	if (buffer != NULL) { // Multi-buffer receive
1248 		while (buffer != NULL && sock->recvbuf.remaining > 0) {
1249 			REQUIRE(ISC_BUFFER_VALID(buffer));
1250 			if (isc_buffer_availablelength(buffer) > 0) {
1251 				isc_buffer_availableregion(buffer, &r);
1252 				copylen = min(r.length, sock->recvbuf.remaining);
1253 				memcpy(r.base, sock->recvbuf.consume_position, copylen);
1254 				sock->recvbuf.consume_position += copylen;
1255 				sock->recvbuf.remaining -= copylen;
1256 				isc_buffer_add(buffer, copylen);
1257 				dev->n += copylen;
1258 			}
1259 			buffer = ISC_LIST_NEXT(buffer, link);
1260 		}
1261 	} else { // Single-buffer receive
1262 		copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1263 		memcpy(dev->region.base + dev->n, sock->recvbuf.consume_position, copylen);
1264 		sock->recvbuf.consume_position += copylen;
1265 		sock->recvbuf.remaining -= copylen;
1266 		dev->n += copylen;
1267 	}
1268 
1269 	/*
1270 	 * UDP receives are all-consuming.  That is, if we have 4k worth of
1271 	 * data in our receive buffer, and the caller only gave us
1272 	 * 1k of space, we will toss the remaining 3k of data.  TCP
1273 	 * will keep the extra data around and use it for later requests.
1274 	 */
1275 	if (sock->type == isc_sockettype_udp)
1276 		sock->recvbuf.remaining = 0;
1277 }
1278 
1279 /*
1280  * Copy out as much data from the internal buffer to done events.
1281  * As each done event is filled, send it along its way.
1282  */
1283 static void
completeio_recv(isc_socket_t * sock)1284 completeio_recv(isc_socket_t *sock)
1285 {
1286 	isc_socketevent_t *dev;
1287 
1288 	/*
1289 	 * If we are in the process of filling our buffer, we cannot
1290 	 * touch it yet, so don't.
1291 	 */
1292 	if (sock->pending_recv > 0)
1293 		return;
1294 
1295 	while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1296 		dev = ISC_LIST_HEAD(sock->recv_list);
1297 
1298 		/*
1299 		 * See if we have sufficient data in our receive buffer
1300 		 * to handle this.  If we do, copy out the data.
1301 		 */
1302 		fill_recv(sock, dev);
1303 
1304 		/*
1305 		 * Did we satisfy it?
1306 		 */
1307 		if (dev->n >= dev->minimum) {
1308 			dev->result = ISC_R_SUCCESS;
1309 			send_recvdone_event(sock, &dev);
1310 		}
1311 	}
1312 }
1313 
1314 /*
1315  * Returns:
1316  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1317  *			ISC_R_SUCCESS.
1318  *
1319  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1320  *			dev->result contains the appropriate error.
1321  *
1322  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1323  *			event was sent.  The operation should be retried.
1324  *
1325  *	No other return values are possible.
1326  */
1327 static int
completeio_send(isc_socket_t * sock,isc_socketevent_t * dev,struct msghdr * messagehdr,int cc,int send_errno)1328 completeio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1329 		struct msghdr *messagehdr, int cc, int send_errno)
1330 {
1331 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1332 	char strbuf[ISC_STRERRORSIZE];
1333 
1334 	if (send_errno != 0) {
1335 		if (SOFT_ERROR(send_errno))
1336 			return (DOIO_SOFT);
1337 
1338 		return (map_socket_error(sock, send_errno, &dev->result,
1339 			strbuf, sizeof(strbuf)));
1340 
1341 		/*
1342 		 * The other error types depend on whether or not the
1343 		 * socket is UDP or TCP.  If it is UDP, some errors
1344 		 * that we expect to be fatal under TCP are merely
1345 		 * annoying, and are really soft errors.
1346 		 *
1347 		 * However, these soft errors are still returned as
1348 		 * a status.
1349 		 */
1350 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1351 		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1352 		UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1353 				 addrbuf, strbuf);
1354 		dev->result = isc__errno2result(send_errno);
1355 		return (DOIO_HARD);
1356 	}
1357 
1358 	/*
1359 	 * If we write less than we expected, update counters, poke.
1360 	 */
1361 	dev->n += cc;
1362 	if (cc != messagehdr->msg_totallen)
1363 		return (DOIO_SOFT);
1364 
1365 	/*
1366 	 * Exactly what we wanted to write.  We're done with this
1367 	 * entry.  Post its completion event.
1368 	 */
1369 	dev->result = ISC_R_SUCCESS;
1370 	return (DOIO_SUCCESS);
1371 }
1372 
1373 static int
startio_send(isc_socket_t * sock,isc_socketevent_t * dev,int * nbytes,int * send_errno)1374 startio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1375 	     int *send_errno)
1376 {
1377 	char *cmsg = NULL;
1378 	char strbuf[ISC_STRERRORSIZE];
1379 	IoCompletionInfo *lpo;
1380 	int status;
1381 	struct msghdr *msghdr;
1382 
1383 	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1384 					    HEAP_ZERO_MEMORY,
1385 					    sizeof(IoCompletionInfo));
1386 	RUNTIME_CHECK(lpo != NULL);
1387 	lpo->request_type = SOCKET_SEND;
1388 	lpo->dev = dev;
1389 	msghdr = &lpo->messagehdr;
1390 	memset(msghdr, 0, sizeof(struct msghdr));
1391 	ISC_LIST_INIT(lpo->bufferlist);
1392 
1393 	build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1394 
1395 	*nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1396 
1397 	if (*nbytes < 0) {
1398 		/*
1399 		 * I/O has been initiated
1400 		 * completion will be through the completion port
1401 		 */
1402 		if (PENDING_ERROR(*send_errno)) {
1403 			status = DOIO_PENDING;
1404 			goto done;
1405 		}
1406 
1407 		if (SOFT_ERROR(*send_errno)) {
1408 			status = DOIO_SOFT;
1409 			goto done;
1410 		}
1411 
1412 		/*
1413 		 * If we got this far then something is wrong
1414 		 */
1415 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1416 			isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1417 			socket_log(__LINE__, sock, NULL, IOEVENT,
1418 				   isc_msgcat, ISC_MSGSET_SOCKET,
1419 				   ISC_MSG_INTERNALSEND,
1420 				   "startio_send: internal_sendmsg(%d) %d "
1421 				   "bytes, err %d/%s",
1422 				   sock->fd, *nbytes, *send_errno, strbuf);
1423 		}
1424 		status = DOIO_HARD;
1425 		goto done;
1426 	}
1427 	dev->result = ISC_R_SUCCESS;
1428 	status = DOIO_SOFT;
1429  done:
1430 	_set_state(sock, SOCK_DATA);
1431 	return (status);
1432 }
1433 
1434 static isc_result_t
allocate_socket(isc_socketmgr_t * manager,isc_sockettype_t type,isc_socket_t ** socketp)1435 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1436 		isc_socket_t **socketp) {
1437 	isc_socket_t *sock;
1438 	isc_result_t result;
1439 
1440 	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1441 
1442 	if (sock == NULL)
1443 		return (ISC_R_NOMEMORY);
1444 
1445 	sock->magic = 0;
1446 	sock->references = 0;
1447 
1448 	sock->manager = manager;
1449 	sock->type = type;
1450 	sock->fd = INVALID_SOCKET;
1451 
1452 	ISC_LINK_INIT(sock, link);
1453 
1454 	/*
1455 	 * set up list of readers and writers to be initially empty
1456 	 */
1457 	ISC_LIST_INIT(sock->recv_list);
1458 	ISC_LIST_INIT(sock->send_list);
1459 	ISC_LIST_INIT(sock->accept_list);
1460 	sock->connect_ev = NULL;
1461 	sock->pending_accept = 0;
1462 	sock->pending_recv = 0;
1463 	sock->pending_send = 0;
1464 	sock->pending_iocp = 0;
1465 	sock->listener = 0;
1466 	sock->connected = 0;
1467 	sock->pending_connect = 0;
1468 	sock->bound = 0;
1469 	sock->dupped = 0;
1470 	memset(sock->name, 0, sizeof(sock->name));	// zero the name field
1471 	_set_state(sock, SOCK_INITIALIZED);
1472 
1473 	sock->recvbuf.len = 65536;
1474 	sock->recvbuf.consume_position = sock->recvbuf.base;
1475 	sock->recvbuf.remaining = 0;
1476 	sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1477 	if (sock->recvbuf.base == NULL) {
1478 		sock->magic = 0;
1479 		goto error;
1480 	}
1481 
1482 	/*
1483 	 * initialize the lock
1484 	 */
1485 	result = isc_mutex_init(&sock->lock);
1486 	if (result != ISC_R_SUCCESS) {
1487 		sock->magic = 0;
1488 		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1489 		sock->recvbuf.base = NULL;
1490 		goto error;
1491 	}
1492 
1493 	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1494 		   "allocated");
1495 
1496 	sock->magic = SOCKET_MAGIC;
1497 	*socketp = sock;
1498 
1499 	return (ISC_R_SUCCESS);
1500 
1501  error:
1502 	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1503 
1504 	return (result);
1505 }
1506 
1507 /*
1508  * Verify that the socket state is consistent.
1509  */
1510 static void
consistent(isc_socket_t * sock)1511 consistent(isc_socket_t *sock) {
1512 
1513 	isc_socketevent_t *dev;
1514 	isc_socket_newconnev_t *nev;
1515 	unsigned int count;
1516 	char *crash_reason;
1517 	isc_boolean_t crash = ISC_FALSE;
1518 
1519 	REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1520 		+ sock->pending_accept + sock->pending_connect);
1521 
1522 	dev = ISC_LIST_HEAD(sock->send_list);
1523 	count = 0;
1524 	while (dev != NULL) {
1525 		count++;
1526 		dev = ISC_LIST_NEXT(dev, ev_link);
1527 	}
1528 	if (count > sock->pending_send) {
1529 		crash = ISC_TRUE;
1530 		crash_reason = "send_list > sock->pending_send";
1531 	}
1532 
1533 	nev = ISC_LIST_HEAD(sock->accept_list);
1534 	count = 0;
1535 	while (nev != NULL) {
1536 		count++;
1537 		nev = ISC_LIST_NEXT(nev, ev_link);
1538 	}
1539 	if (count > sock->pending_accept) {
1540 		crash = ISC_TRUE;
1541 		crash_reason = "send_list > sock->pending_send";
1542 	}
1543 
1544 	if (crash) {
1545 		socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1546 			   ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1547 			   crash_reason);
1548 		sock_dump(sock);
1549 		INSIST(crash == ISC_FALSE);
1550 	}
1551 }
1552 
1553 /*
1554  * Maybe free the socket.
1555  *
1556  * This function will verify tht the socket is no longer in use in any way,
1557  * either internally or externally.  This is the only place where this
1558  * check is to be made; if some bit of code believes that IT is done with
1559  * the socket (e.g., some reference counter reaches zero), it should call
1560  * this function.
1561  *
1562  * When calling this function, the socket must be locked, and the manager
1563  * must be unlocked.
1564  *
1565  * When this function returns, *socketp will be NULL.  No tricks to try
1566  * to hold on to this pointer are allowed.
1567  */
1568 static void
maybe_free_socket(isc_socket_t ** socketp,int lineno)1569 maybe_free_socket(isc_socket_t **socketp, int lineno) {
1570 	isc_socket_t *sock = *socketp;
1571 	*socketp = NULL;
1572 
1573 	INSIST(VALID_SOCKET(sock));
1574 	CONSISTENT(sock);
1575 
1576 	if (sock->pending_iocp > 0
1577 	    || sock->pending_recv > 0
1578 	    || sock->pending_send > 0
1579 	    || sock->pending_accept > 0
1580 	    || sock->references > 0
1581 	    || sock->pending_connect == 1
1582 	    || !ISC_LIST_EMPTY(sock->recv_list)
1583 	    || !ISC_LIST_EMPTY(sock->send_list)
1584 	    || !ISC_LIST_EMPTY(sock->accept_list)
1585 	    || sock->fd != INVALID_SOCKET) {
1586 		UNLOCK(&sock->lock);
1587 		return;
1588 	}
1589 	UNLOCK(&sock->lock);
1590 
1591 	free_socket(&sock, lineno);
1592 }
1593 
1594 void
free_socket(isc_socket_t ** sockp,int lineno)1595 free_socket(isc_socket_t **sockp, int lineno) {
1596 	isc_socketmgr_t *manager;
1597 	isc_socket_t *sock = *sockp;
1598 	*sockp = NULL;
1599 
1600 	manager = sock->manager;
1601 
1602 	/*
1603 	 * Seems we can free the socket after all.
1604 	 */
1605 	manager = sock->manager;
1606 	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1607 		   ISC_MSG_DESTROYING, "freeing socket line %d fd %d lock %p semaphore %p",
1608 		   lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1609 
1610 	sock->magic = 0;
1611 	DESTROYLOCK(&sock->lock);
1612 
1613 	if (sock->recvbuf.base != NULL)
1614 		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1615 
1616 	LOCK(&manager->lock);
1617 	if (ISC_LINK_LINKED(sock, link))
1618 		ISC_LIST_UNLINK(manager->socklist, sock, link);
1619 	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1620 
1621 	if (ISC_LIST_EMPTY(manager->socklist))
1622 		SIGNAL(&manager->shutdown_ok);
1623 	UNLOCK(&manager->lock);
1624 }
1625 
1626 /*
1627  * Create a new 'type' socket managed by 'manager'.  Events
1628  * will be posted to 'task' and when dispatched 'action' will be
1629  * called with 'arg' as the arg value.  The new socket is returned
1630  * in 'socketp'.
1631  */
1632 static isc_result_t
socket_create(isc_socketmgr_t * manager,int pf,isc_sockettype_t type,isc_socket_t ** socketp,isc_socket_t * dup_socket)1633 socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1634 	      isc_socket_t **socketp, isc_socket_t *dup_socket)
1635 {
1636 	isc_socket_t *sock = NULL;
1637 	isc_result_t result;
1638 #if defined(USE_CMSG)
1639 	int on = 1;
1640 #endif
1641 #if defined(SO_RCVBUF)
1642 	ISC_SOCKADDR_LEN_T optlen;
1643 	int size;
1644 #endif
1645 	int socket_errno;
1646 	char strbuf[ISC_STRERRORSIZE];
1647 
1648 	REQUIRE(VALID_MANAGER(manager));
1649 	REQUIRE(socketp != NULL && *socketp == NULL);
1650 	REQUIRE(type != isc_sockettype_fdwatch);
1651 
1652 	if (dup_socket != NULL)
1653 		return (ISC_R_NOTIMPLEMENTED);
1654 
1655 	result = allocate_socket(manager, type, &sock);
1656 	if (result != ISC_R_SUCCESS)
1657 		return (result);
1658 
1659 	sock->pf = pf;
1660 #if 0
1661 	if (dup_socket == NULL) {
1662 #endif
1663 		switch (type) {
1664 		case isc_sockettype_udp:
1665 			sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1666 			if (sock->fd != INVALID_SOCKET) {
1667 				result = connection_reset_fix(sock->fd);
1668 				if (result != ISC_R_SUCCESS) {
1669 					socket_log(__LINE__, sock,
1670 						NULL, EVENT, NULL, 0, 0,
1671 						"closed %d %d %d "
1672 						"con_reset_fix_failed",
1673 						sock->pending_recv,
1674 						sock->pending_send,
1675 						sock->references);
1676 					closesocket(sock->fd);
1677 					_set_state(sock, SOCK_CLOSED);
1678 					sock->fd = INVALID_SOCKET;
1679 					free_socket(&sock, __LINE__);
1680 					return (result);
1681 				}
1682 			}
1683 			break;
1684 		case isc_sockettype_tcp:
1685 			sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1686 			break;
1687 		}
1688 #if 0
1689 	} else {
1690 		/*
1691 		 * XXX: dup() is deprecated in windows, use _dup()
1692 		 * instead.  In future we may want to investigate
1693 		 * WSADuplicateSocket().
1694 		 */
1695 		sock->fd = _dup(dup_socket->fd);
1696 		sock->dupped = 1;
1697 		sock->bound = dup_socket->bound;
1698 	}
1699 #endif
1700 
1701 	if (sock->fd == INVALID_SOCKET) {
1702 		socket_errno = WSAGetLastError();
1703 		free_socket(&sock, __LINE__);
1704 
1705 		switch (socket_errno) {
1706 		case WSAEMFILE:
1707 		case WSAENOBUFS:
1708 			return (ISC_R_NORESOURCES);
1709 
1710 		case WSAEPROTONOSUPPORT:
1711 		case WSAEPFNOSUPPORT:
1712 		case WSAEAFNOSUPPORT:
1713 			return (ISC_R_FAMILYNOSUPPORT);
1714 
1715 		default:
1716 			isc__strerror(socket_errno, strbuf, sizeof(strbuf));
1717 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1718 					 "socket() %s: %s",
1719 					 isc_msgcat_get(isc_msgcat,
1720 							ISC_MSGSET_GENERAL,
1721 							ISC_MSG_FAILED,
1722 							"failed"),
1723 					 strbuf);
1724 			return (ISC_R_UNEXPECTED);
1725 		}
1726 	}
1727 
1728 	result = make_nonblock(sock->fd);
1729 	if (result != ISC_R_SUCCESS) {
1730 		socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1731 			"closed %d %d %d make_nonblock_failed",
1732 			sock->pending_recv, sock->pending_send,
1733 			sock->references);
1734 		closesocket(sock->fd);
1735 		sock->fd = INVALID_SOCKET;
1736 		free_socket(&sock, __LINE__);
1737 		return (result);
1738 	}
1739 
1740 
1741 #if defined(USE_CMSG) || defined(SO_RCVBUF)
1742 	if (type == isc_sockettype_udp) {
1743 
1744 #if defined(USE_CMSG)
1745 #if defined(ISC_PLATFORM_HAVEIPV6)
1746 #ifdef IPV6_RECVPKTINFO
1747 		/* 2292bis */
1748 		if ((pf == AF_INET6)
1749 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1750 				   (char *)&on, sizeof(on)) < 0)) {
1751 			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1752 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1753 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1754 					 "%s: %s", sock->fd,
1755 					 isc_msgcat_get(isc_msgcat,
1756 							ISC_MSGSET_GENERAL,
1757 							ISC_MSG_FAILED,
1758 							"failed"),
1759 					 strbuf);
1760 		}
1761 #else
1762 		/* 2292 */
1763 		if ((pf == AF_INET6)
1764 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1765 				   (char *)&on, sizeof(on)) < 0)) {
1766 			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1767 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1768 					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1769 					 sock->fd,
1770 					 isc_msgcat_get(isc_msgcat,
1771 							ISC_MSGSET_GENERAL,
1772 							ISC_MSG_FAILED,
1773 							"failed"),
1774 					 strbuf);
1775 		}
1776 #endif /* IPV6_RECVPKTINFO */
1777 #ifdef IPV6_USE_MIN_MTU	/*2292bis, not too common yet*/
1778 		/* use minimum MTU */
1779 		if (pf == AF_INET6) {
1780 			(void)setsockopt(sock->fd, IPPROTO_IPV6,
1781 					 IPV6_USE_MIN_MTU,
1782 					 (char *)&on, sizeof(on));
1783 		}
1784 #endif
1785 #endif /* ISC_PLATFORM_HAVEIPV6 */
1786 #endif /* defined(USE_CMSG) */
1787 
1788 #if defined(SO_RCVBUF)
1789 	       optlen = sizeof(size);
1790 	       if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1791 			      (char *)&size, &optlen) >= 0 &&
1792 		    size < RCVBUFSIZE) {
1793 		       size = RCVBUFSIZE;
1794 		       (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1795 					(char *)&size, sizeof(size));
1796 	       }
1797 #endif
1798 
1799 	}
1800 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1801 
1802 	_set_state(sock, SOCK_OPEN);
1803 	sock->references = 1;
1804 	*socketp = sock;
1805 
1806 	iocompletionport_update(sock);
1807 
1808 	/*
1809 	 * Note we don't have to lock the socket like we normally would because
1810 	 * there are no external references to it yet.
1811 	 */
1812 	LOCK(&manager->lock);
1813 	ISC_LIST_APPEND(manager->socklist, sock, link);
1814 	InterlockedIncrement(&manager->totalSockets);
1815 	UNLOCK(&manager->lock);
1816 
1817 	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat,
1818 		   ISC_MSGSET_SOCKET, ISC_MSG_CREATED,
1819 		   "created %u type %u", sock->fd, type);
1820 
1821 	return (ISC_R_SUCCESS);
1822 }
1823 
1824 isc_result_t
isc__socket_create(isc_socketmgr_t * manager,int pf,isc_sockettype_t type,isc_socket_t ** socketp)1825 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1826 		   isc_socket_t **socketp)
1827 {
1828 	return (socket_create(manager, pf, type, socketp, NULL));
1829 }
1830 
1831 isc_result_t
isc__socket_dup(isc_socket_t * sock,isc_socket_t ** socketp)1832 isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
1833 	REQUIRE(VALID_SOCKET(sock));
1834 	REQUIRE(socketp != NULL && *socketp == NULL);
1835 
1836 #if 1
1837 	return (ISC_R_NOTIMPLEMENTED);
1838 #else
1839 	return (socket_create(sock->manager, sock->pf, sock->type,
1840 			      socketp, sock));
1841 #endif
1842 }
1843 
1844 isc_result_t
isc_socket_open(isc_socket_t * sock)1845 isc_socket_open(isc_socket_t *sock) {
1846 	REQUIRE(VALID_SOCKET(sock));
1847 	REQUIRE(sock->type != isc_sockettype_fdwatch);
1848 
1849 	return (ISC_R_NOTIMPLEMENTED);
1850 }
1851 
1852 /*
1853  * Attach to a socket.  Caller must explicitly detach when it is done.
1854  */
1855 void
isc__socket_attach(isc_socket_t * sock,isc_socket_t ** socketp)1856 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1857 	REQUIRE(VALID_SOCKET(sock));
1858 	REQUIRE(socketp != NULL && *socketp == NULL);
1859 
1860 	LOCK(&sock->lock);
1861 	CONSISTENT(sock);
1862 	sock->references++;
1863 	UNLOCK(&sock->lock);
1864 
1865 	*socketp = sock;
1866 }
1867 
1868 /*
1869  * Dereference a socket.  If this is the last reference to it, clean things
1870  * up by destroying the socket.
1871  */
1872 void
isc__socket_detach(isc_socket_t ** socketp)1873 isc__socket_detach(isc_socket_t **socketp) {
1874 	isc_socket_t *sock;
1875 	isc_boolean_t kill_socket = ISC_FALSE;
1876 
1877 	REQUIRE(socketp != NULL);
1878 	sock = *socketp;
1879 	REQUIRE(VALID_SOCKET(sock));
1880 	REQUIRE(sock->type != isc_sockettype_fdwatch);
1881 
1882 	LOCK(&sock->lock);
1883 	CONSISTENT(sock);
1884 	REQUIRE(sock->references > 0);
1885 	sock->references--;
1886 
1887 	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1888 		"detach_socket %d %d %d",
1889 		sock->pending_recv, sock->pending_send,
1890 		sock->references);
1891 
1892 	if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1893 		closesocket(sock->fd);
1894 		sock->fd = INVALID_SOCKET;
1895 		_set_state(sock, SOCK_CLOSED);
1896 	}
1897 
1898 	maybe_free_socket(&sock, __LINE__);
1899 
1900 	*socketp = NULL;
1901 }
1902 
1903 isc_result_t
isc_socket_close(isc_socket_t * sock)1904 isc_socket_close(isc_socket_t *sock) {
1905 	REQUIRE(VALID_SOCKET(sock));
1906 	REQUIRE(sock->type != isc_sockettype_fdwatch);
1907 
1908 	return (ISC_R_NOTIMPLEMENTED);
1909 }
1910 
1911 /*
1912  * Dequeue an item off the given socket's read queue, set the result code
1913  * in the done event to the one provided, and send it to the task it was
1914  * destined for.
1915  *
1916  * If the event to be sent is on a list, remove it before sending.  If
1917  * asked to, send and detach from the task as well.
1918  *
1919  * Caller must have the socket locked if the event is attached to the socket.
1920  */
1921 static void
send_recvdone_event(isc_socket_t * sock,isc_socketevent_t ** dev)1922 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1923 	isc_task_t *task;
1924 
1925 	task = (*dev)->ev_sender;
1926 	(*dev)->ev_sender = sock;
1927 
1928 	if (ISC_LINK_LINKED(*dev, ev_link))
1929 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1930 
1931 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1932 	    == ISC_SOCKEVENTATTR_ATTACHED)
1933 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1934 	else
1935 		isc_task_send(task, (isc_event_t **)dev);
1936 
1937 	CONSISTENT(sock);
1938 }
1939 
1940 /*
1941  * See comments for send_recvdone_event() above.
1942  */
1943 static void
send_senddone_event(isc_socket_t * sock,isc_socketevent_t ** dev)1944 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1945 	isc_task_t *task;
1946 
1947 	INSIST(dev != NULL && *dev != NULL);
1948 
1949 	task = (*dev)->ev_sender;
1950 	(*dev)->ev_sender = sock;
1951 
1952 	if (ISC_LINK_LINKED(*dev, ev_link))
1953 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1954 
1955 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1956 	    == ISC_SOCKEVENTATTR_ATTACHED)
1957 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1958 	else
1959 		isc_task_send(task, (isc_event_t **)dev);
1960 
1961 	CONSISTENT(sock);
1962 }
1963 
1964 /*
1965  * See comments for send_recvdone_event() above.
1966  */
1967 static void
send_acceptdone_event(isc_socket_t * sock,isc_socket_newconnev_t ** adev)1968 send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
1969 	isc_task_t *task;
1970 
1971 	INSIST(adev != NULL && *adev != NULL);
1972 
1973 	task = (*adev)->ev_sender;
1974 	(*adev)->ev_sender = sock;
1975 
1976 	if (ISC_LINK_LINKED(*adev, ev_link))
1977 		ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
1978 
1979 	isc_task_sendanddetach(&task, (isc_event_t **)adev);
1980 
1981 	CONSISTENT(sock);
1982 }
1983 
1984 /*
1985  * See comments for send_recvdone_event() above.
1986  */
1987 static void
send_connectdone_event(isc_socket_t * sock,isc_socket_connev_t ** cdev)1988 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
1989 	isc_task_t *task;
1990 
1991 	INSIST(cdev != NULL && *cdev != NULL);
1992 
1993 	task = (*cdev)->ev_sender;
1994 	(*cdev)->ev_sender = sock;
1995 
1996 	sock->connect_ev = NULL;
1997 
1998 	isc_task_sendanddetach(&task, (isc_event_t **)cdev);
1999 
2000 	CONSISTENT(sock);
2001 }
2002 
2003 /*
2004  * On entry to this function, the event delivered is the internal
2005  * readable event, and the first item on the accept_list should be
2006  * the done event we want to send.  If the list is empty, this is a no-op,
2007  * so just close the new connection, unlock, and return.
2008  *
 * Note that this function acquires and releases the socket lock itself.
2010  */
2011 static void
internal_accept(isc_socket_t * sock,IoCompletionInfo * lpo,int accept_errno)2012 internal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
2013 	isc_socket_newconnev_t *adev;
2014 	isc_result_t result = ISC_R_SUCCESS;
2015 	isc_socket_t *nsock;
2016 	struct sockaddr *localaddr;
2017 	int localaddr_len = sizeof(*localaddr);
2018 	struct sockaddr *remoteaddr;
2019 	int remoteaddr_len = sizeof(*remoteaddr);
2020 
2021 	INSIST(VALID_SOCKET(sock));
2022 	LOCK(&sock->lock);
2023 	CONSISTENT(sock);
2024 
2025 	socket_log(__LINE__, sock, NULL, TRACE,
2026 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2027 		   "internal_accept called");
2028 
2029 	INSIST(sock->listener);
2030 
2031 	INSIST(sock->pending_iocp > 0);
2032 	sock->pending_iocp--;
2033 	INSIST(sock->pending_accept > 0);
2034 	sock->pending_accept--;
2035 
2036 	adev = lpo->adev;
2037 
2038 	/*
2039 	 * If the event is no longer in the list we can just return.
2040 	 */
2041 	if (!acceptdone_is_active(sock, adev))
2042 		goto done;
2043 
2044 	nsock = adev->newsocket;
2045 
2046 	/*
2047 	 * Pull off the done event.
2048 	 */
2049 	ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
2050 
2051 	/*
2052 	 * Extract the addresses from the socket, copy them into the structure,
2053 	 * and return the new socket.
2054 	 */
2055 	ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
2056 		sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
2057 		(LPSOCKADDR *)&localaddr, &localaddr_len,
2058 		(LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
2059 	memcpy(&adev->address.type, remoteaddr, remoteaddr_len);
2060 	adev->address.length = remoteaddr_len;
2061 	nsock->address = adev->address;
2062 	nsock->pf = adev->address.type.sa.sa_family;
2063 
2064 	socket_log(__LINE__, nsock, &nsock->address, TRACE,
2065 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2066 		   "internal_accept parent %p", sock);
2067 
2068 	result = make_nonblock(adev->newsocket->fd);
2069 	INSIST(result == ISC_R_SUCCESS);
2070 
2071 	INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
2072 			  (char *)&sock->fd, sizeof(sock->fd)) == 0);
2073 
2074 	/*
2075 	 * Hook it up into the manager.
2076 	 */
2077 	nsock->bound = 1;
2078 	nsock->connected = 1;
2079 	_set_state(nsock, SOCK_OPEN);
2080 
2081 	LOCK(&nsock->manager->lock);
2082 	ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2083 	InterlockedIncrement(&nsock->manager->totalSockets);
2084 	UNLOCK(&nsock->manager->lock);
2085 
2086 	socket_log(__LINE__, sock, &nsock->address, CREATION,
2087 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2088 		   "accepted_connection new_socket %p fd %d",
2089 		   nsock, nsock->fd);
2090 
2091 	adev->result = result;
2092 	send_acceptdone_event(sock, &adev);
2093 
2094 done:
2095 	CONSISTENT(sock);
2096 	UNLOCK(&sock->lock);
2097 
2098 	HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2099 	lpo->acceptbuffer = NULL;
2100 }
2101 
2102 /*
2103  * Called when a socket with a pending connect() finishes.
 * Note that this function acquires and releases the socket lock itself.
2105  */
2106 static void
internal_connect(isc_socket_t * sock,IoCompletionInfo * lpo,int connect_errno)2107 internal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2108 	isc_socket_connev_t *cdev;
2109 	char strbuf[ISC_STRERRORSIZE];
2110 
2111 	INSIST(VALID_SOCKET(sock));
2112 
2113 	LOCK(&sock->lock);
2114 
2115 	INSIST(sock->pending_iocp > 0);
2116 	sock->pending_iocp--;
2117 	INSIST(sock->pending_connect == 1);
2118 	sock->pending_connect = 0;
2119 
2120 	/*
2121 	 * Has this event been canceled?
2122 	 */
2123 	cdev = lpo->cdev;
2124 	if (!connectdone_is_active(sock, cdev)) {
2125 		sock->pending_connect = 0;
2126 		if (sock->fd != INVALID_SOCKET) {
2127 			closesocket(sock->fd);
2128 			sock->fd = INVALID_SOCKET;
2129 			_set_state(sock, SOCK_CLOSED);
2130 		}
2131 		CONSISTENT(sock);
2132 		UNLOCK(&sock->lock);
2133 		return;
2134 	}
2135 
2136 	/*
2137 	 * Check possible Windows network event error status here.
2138 	 */
2139 	if (connect_errno != 0) {
2140 		/*
2141 		 * If the error is SOFT, just try again on this
2142 		 * fd and pretend nothing strange happened.
2143 		 */
2144 		if (SOFT_ERROR(connect_errno) ||
2145 		    connect_errno == WSAEINPROGRESS) {
2146 			sock->pending_connect = 1;
2147 			CONSISTENT(sock);
2148 			UNLOCK(&sock->lock);
2149 			return;
2150 		}
2151 
2152 		/*
2153 		 * Translate other errors into ISC_R_* flavors.
2154 		 */
2155 		switch (connect_errno) {
2156 #define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2157 			ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2158 			ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2159 			ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2160 			ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2161 			ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2162 			ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2163 			ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2164 			ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2165 			ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2166 			ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2167 			ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2168 			ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2169 #undef ERROR_MATCH
2170 		default:
2171 			cdev->result = ISC_R_UNEXPECTED;
2172 			isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2173 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2174 					 "internal_connect: connect() %s",
2175 					 strbuf);
2176 		}
2177 	} else {
2178 		INSIST(setsockopt(sock->fd, SOL_SOCKET,
2179 				  SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2180 		cdev->result = ISC_R_SUCCESS;
2181 		sock->connected = 1;
2182 		socket_log(__LINE__, sock, &sock->address, IOEVENT,
2183 			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2184 			   "internal_connect: success");
2185 	}
2186 
2187 	send_connectdone_event(sock, &cdev);
2188 
2189 	UNLOCK(&sock->lock);
2190 }
2191 
2192 /*
 * Loop through the socket's recv_list, delivering the given result for each done event pending.
2194  */
2195 static void
send_recvdone_abort(isc_socket_t * sock,isc_result_t result)2196 send_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2197 	isc_socketevent_t *dev;
2198 
2199 	while (!ISC_LIST_EMPTY(sock->recv_list)) {
2200 		dev = ISC_LIST_HEAD(sock->recv_list);
2201 		dev->result = result;
2202 		send_recvdone_event(sock, &dev);
2203 	}
2204 }
2205 
2206 /*
2207  * Take the data we received in our private buffer, and if any recv() calls on
2208  * our list are satisfied, send the corresponding done event.
2209  *
2210  * If we need more data (there are still items on the recv_list after we consume all
2211  * our data) then arrange for another system recv() call to fill our buffers.
2212  */
2213 static void
internal_recv(isc_socket_t * sock,int nbytes)2214 internal_recv(isc_socket_t *sock, int nbytes)
2215 {
2216 	INSIST(VALID_SOCKET(sock));
2217 
2218 	LOCK(&sock->lock);
2219 	CONSISTENT(sock);
2220 
2221 	socket_log(__LINE__, sock, NULL, IOEVENT,
2222 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2223 		   "internal_recv: %d bytes received", nbytes);
2224 
2225 	/*
2226 	 * If we got here, the I/O operation succeeded.  However, we might still have removed this
2227 	 * event from our notification list (or never placed it on it due to immediate completion.)
2228 	 * Handle the reference counting here, and handle the cancellation event just after.
2229 	 */
2230 	INSIST(sock->pending_iocp > 0);
2231 	sock->pending_iocp--;
2232 	INSIST(sock->pending_recv > 0);
2233 	sock->pending_recv--;
2234 
2235 	/*
2236 	 * The only way we could have gotten here is that our I/O has successfully completed.
2237 	 * Update our pointers, and move on.  The only odd case here is that we might not
2238 	 * have received enough data on a TCP stream to satisfy the minimum requirements.  If
2239 	 * this is the case, we will re-issue the recv() call for what we need.
2240 	 *
2241 	 * We do check for a recv() of 0 bytes on a TCP stream.  This means the remote end
2242 	 * has closed.
2243 	 */
2244 	if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
2245 		send_recvdone_abort(sock, ISC_R_EOF);
2246 		maybe_free_socket(&sock, __LINE__);
2247 		return;
2248 	}
2249 	sock->recvbuf.remaining = nbytes;
2250 	sock->recvbuf.consume_position = sock->recvbuf.base;
2251 	completeio_recv(sock);
2252 
2253 	/*
2254 	 * If there are more receivers waiting for data, queue another receive
2255 	 * here.
2256 	 */
2257 	queue_receive_request(sock);
2258 
2259 	/*
2260 	 * Unlock and/or destroy if we are the last thing this socket has left to do.
2261 	 */
2262 	maybe_free_socket(&sock, __LINE__);
2263 }
2264 
2265 static void
internal_send(isc_socket_t * sock,isc_socketevent_t * dev,struct msghdr * messagehdr,int nbytes,int send_errno,IoCompletionInfo * lpo)2266 internal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2267 	      struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2268 {
2269 	buflist_t *buffer;
2270 
2271 	/*
2272 	 * Find out what socket this is and lock it.
2273 	 */
2274 	INSIST(VALID_SOCKET(sock));
2275 
2276 	LOCK(&sock->lock);
2277 	CONSISTENT(sock);
2278 
2279 	socket_log(__LINE__, sock, NULL, IOEVENT,
2280 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2281 		   "internal_send: task got socket event %p", dev);
2282 
2283 	buffer = ISC_LIST_HEAD(lpo->bufferlist);
2284 	while (buffer != NULL) {
2285 		ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2286 
2287 		socket_log(__LINE__, sock, NULL, TRACE,
2288 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2289 		   "free_buffer %p %p", buffer, buffer->buf);
2290 
2291 		HeapFree(hHeapHandle, 0, buffer->buf);
2292 		HeapFree(hHeapHandle, 0, buffer);
2293 		buffer = ISC_LIST_HEAD(lpo->bufferlist);
2294 	}
2295 
2296 	INSIST(sock->pending_iocp > 0);
2297 	sock->pending_iocp--;
2298 	INSIST(sock->pending_send > 0);
2299 	sock->pending_send--;
2300 
2301 	/* If the event is no longer in the list we can just return */
2302 	if (!senddone_is_active(sock, dev))
2303 		goto done;
2304 
2305 	/*
2306 	 * Set the error code and send things on its way.
2307 	 */
2308 	switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2309 	case DOIO_SOFT:
2310 		break;
2311 	case DOIO_HARD:
2312 	case DOIO_SUCCESS:
2313 		send_senddone_event(sock, &dev);
2314 		break;
2315 	}
2316 
2317  done:
2318 	maybe_free_socket(&sock, __LINE__);
2319 }
2320 
2321 /*
2322  * These return if the done event passed in is on the list (or for connect, is
 * the one we're waiting for).  Using these ensures we will not double-send an
2324  * event.
2325  */
2326 static isc_boolean_t
senddone_is_active(isc_socket_t * sock,isc_socketevent_t * dev)2327 senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2328 {
2329 	isc_socketevent_t *ldev;
2330 
2331 	ldev = ISC_LIST_HEAD(sock->send_list);
2332 	while (ldev != NULL && ldev != dev)
2333 		ldev = ISC_LIST_NEXT(ldev, ev_link);
2334 
2335 	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2336 }
2337 
2338 static isc_boolean_t
acceptdone_is_active(isc_socket_t * sock,isc_socket_newconnev_t * dev)2339 acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2340 {
2341 	isc_socket_newconnev_t *ldev;
2342 
2343 	ldev = ISC_LIST_HEAD(sock->accept_list);
2344 	while (ldev != NULL && ldev != dev)
2345 		ldev = ISC_LIST_NEXT(ldev, ev_link);
2346 
2347 	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2348 }
2349 
2350 static isc_boolean_t
connectdone_is_active(isc_socket_t * sock,isc_socket_connev_t * dev)2351 connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2352 {
2353 	return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2354 }
2355 
2356 //
2357 // The Windows network stack seems to have two very distinct paths depending
2358 // on what is installed.  Specifically, if something is looking at network
2359 // connections (like an anti-virus or anti-malware application, such as
2360 // McAfee products) Windows may return additional error conditions which
2361 // were not previously returned.
2362 //
2363 // One specific one is when a TCP SYN scan is used.  In this situation,
2364 // Windows responds with the SYN-ACK, but the scanner never responds with
// the 3rd packet, the ACK.  Windows considers this a partially open connection.
2366 // Most Unix networking stacks, and Windows without McAfee installed, will
2367 // not return this to the caller.  However, with this product installed,
2368 // Windows returns this as a failed status on the Accept() call.  Here, we
2369 // will just re-issue the ISCAcceptEx() call as if nothing had happened.
2370 //
2371 // This code should only be called when the listening socket has received
2372 // such an error.  Additionally, the "parent" socket must be locked.
2373 // Additionally, the lpo argument is re-used here, and must not be freed
2374 // by the caller.
2375 //
2376 static isc_result_t
restart_accept(isc_socket_t * parent,IoCompletionInfo * lpo)2377 restart_accept(isc_socket_t *parent, IoCompletionInfo *lpo)
2378 {
2379 	isc_socket_t *nsock = lpo->adev->newsocket;
2380 	SOCKET new_fd;
2381 
2382 	/*
2383 	 * AcceptEx() requires we pass in a socket.  Note that we carefully
2384 	 * do not close the previous socket in case of an error message returned by
2385 	 * our new socket() call.  If we return an error here, our caller will
2386 	 * clean up.
2387 	 */
2388 	new_fd = socket(parent->pf, SOCK_STREAM, IPPROTO_TCP);
2389 	if (nsock->fd == INVALID_SOCKET) {
2390 		return (ISC_R_FAILURE); // parent will ask windows for error message
2391 	}
2392 	closesocket(nsock->fd);
2393 	nsock->fd = new_fd;
2394 
2395 	memset(&lpo->overlapped, 0, sizeof(lpo->overlapped));
2396 
2397 	ISCAcceptEx(parent->fd,
2398 		    nsock->fd,				/* Accepted Socket */
2399 		    lpo->acceptbuffer,			/* Buffer for initial Recv */
2400 		    0,					/* Length of Buffer */
2401 		    sizeof(SOCKADDR_STORAGE) + 16,	/* Local address length + 16 */
2402 		    sizeof(SOCKADDR_STORAGE) + 16,	/* Remote address lengh + 16 */
2403 		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
2404 		    (LPOVERLAPPED)lpo			/* Overlapped structure */
2405 		    );
2406 
2407 	InterlockedDecrement(&nsock->manager->iocp_total);
2408 	iocompletionport_update(nsock);
2409 
2410 	return (ISC_R_SUCCESS);
2411 }
2412 
2413 /*
2414  * This is the I/O Completion Port Worker Function. It loops forever
2415  * waiting for I/O to complete and then forwards them for further
2416  * processing. There are a number of these in separate threads.
2417  */
2418 static isc_threadresult_t WINAPI
SocketIoThread(LPVOID ThreadContext)2419 SocketIoThread(LPVOID ThreadContext) {
2420 	isc_socketmgr_t *manager = ThreadContext;
2421 	BOOL bSuccess = FALSE;
2422 	DWORD nbytes;
2423 	IoCompletionInfo *lpo = NULL;
2424 	isc_socket_t *sock = NULL;
2425 	int request;
2426 	struct msghdr *messagehdr = NULL;
2427 	int errval;
2428 	char strbuf[ISC_STRERRORSIZE];
2429 	int errstatus;
2430 
2431 	REQUIRE(VALID_MANAGER(manager));
2432 
2433 	/*
2434 	 * Set the thread priority high enough so I/O will
2435 	 * preempt normal recv packet processing, but not
2436 	 * higher than the timer sync thread.
2437 	 */
2438 	if (!SetThreadPriority(GetCurrentThread(),
2439 			       THREAD_PRIORITY_ABOVE_NORMAL)) {
2440 		errval = GetLastError();
2441 		isc__strerror(errval, strbuf, sizeof(strbuf));
2442 		FATAL_ERROR(__FILE__, __LINE__,
2443 				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2444 				ISC_MSG_FAILED,
2445 				"Can't set thread priority: %s"),
2446 				strbuf);
2447 	}
2448 
2449 	/*
2450 	 * Loop forever waiting on I/O Completions and then processing them
2451 	 */
2452 	while (TRUE) {
2453 		wait_again:
2454 		bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
2455 						     &nbytes, (LPDWORD)&sock,
2456 						     (LPWSAOVERLAPPED *)&lpo,
2457 						     INFINITE);
2458 		if (lpo == NULL) /* Received request to exit */
2459 			break;
2460 
2461 		REQUIRE(VALID_SOCKET(sock));
2462 
2463 		request = lpo->request_type;
2464 
2465 		errstatus = 0;
2466 		if (!bSuccess) {
2467 			isc_result_t isc_result;
2468 
2469 			/*
2470 			 * Did the I/O operation complete?
2471 			 */
2472 			errstatus = GetLastError();
2473 			isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2474 
2475 			LOCK(&sock->lock);
2476 			CONSISTENT(sock);
2477 			switch (request) {
2478 			case SOCKET_RECV:
2479 				INSIST(sock->pending_iocp > 0);
2480 				sock->pending_iocp--;
2481 				INSIST(sock->pending_recv > 0);
2482 				sock->pending_recv--;
2483 				if (!sock->connected &&
2484 				    ((errstatus == ERROR_HOST_UNREACHABLE) ||
2485 				     (errstatus == WSAENETRESET) ||
2486 				     (errstatus == WSAECONNRESET))) {
2487 					/* ignore soft errors */
2488 					queue_receive_request(sock);
2489 					break;
2490 				}
2491 				send_recvdone_abort(sock, isc_result);
2492 				if (isc_result == ISC_R_UNEXPECTED) {
2493 					UNEXPECTED_ERROR(__FILE__, __LINE__,
2494 						"SOCKET_RECV: Windows error code: %d, returning ISC error %d",
2495 						errstatus, isc_result);
2496 				}
2497 				break;
2498 
2499 			case SOCKET_SEND:
2500 				INSIST(sock->pending_iocp > 0);
2501 				sock->pending_iocp--;
2502 				INSIST(sock->pending_send > 0);
2503 				sock->pending_send--;
2504 				if (senddone_is_active(sock, lpo->dev)) {
2505 					lpo->dev->result = isc_result;
2506 					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2507 						"canceled_send");
2508 					send_senddone_event(sock, &lpo->dev);
2509 				}
2510 				break;
2511 
2512 			case SOCKET_ACCEPT:
2513 				INSIST(sock->pending_iocp > 0);
2514 				INSIST(sock->pending_accept > 0);
2515 
2516 				socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2517 					"Accept: errstatus=%d isc_result=%d", errstatus, isc_result);
2518 
2519 				if (acceptdone_is_active(sock, lpo->adev)) {
2520 					if (restart_accept(sock, lpo) == ISC_R_SUCCESS) {
2521 						UNLOCK(&sock->lock);
2522 						goto wait_again;
2523 					} else {
2524 						errstatus = GetLastError();
2525 						isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2526 						socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2527 							"restart_accept() failed: errstatus=%d isc_result=%d",
2528 							errstatus, isc_result);
2529 					}
2530 				}
2531 
2532 				sock->pending_iocp--;
2533 				sock->pending_accept--;
2534 				if (acceptdone_is_active(sock, lpo->adev)) {
2535 					closesocket(lpo->adev->newsocket->fd);
2536 					lpo->adev->newsocket->fd = INVALID_SOCKET;
2537 					lpo->adev->newsocket->references--;
2538 					free_socket(&lpo->adev->newsocket, __LINE__);
2539 					lpo->adev->result = isc_result;
2540 					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2541 						"canceled_accept");
2542 					send_acceptdone_event(sock, &lpo->adev);
2543 				}
2544 				break;
2545 
2546 			case SOCKET_CONNECT:
2547 				INSIST(sock->pending_iocp > 0);
2548 				sock->pending_iocp--;
2549 				INSIST(sock->pending_connect == 1);
2550 				sock->pending_connect = 0;
2551 				if (connectdone_is_active(sock, lpo->cdev)) {
2552 					lpo->cdev->result = isc_result;
2553 					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2554 						"canceled_connect");
2555 					send_connectdone_event(sock, &lpo->cdev);
2556 				}
2557 				break;
2558 			}
2559 			maybe_free_socket(&sock, __LINE__);
2560 
2561 			if (lpo != NULL)
2562 				HeapFree(hHeapHandle, 0, lpo);
2563 			continue;
2564 		}
2565 
2566 		messagehdr = &lpo->messagehdr;
2567 
2568 		switch (request) {
2569 		case SOCKET_RECV:
2570 			internal_recv(sock, nbytes);
2571 			break;
2572 		case SOCKET_SEND:
2573 			internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
2574 			break;
2575 		case SOCKET_ACCEPT:
2576 			internal_accept(sock, lpo, errstatus);
2577 			break;
2578 		case SOCKET_CONNECT:
2579 			internal_connect(sock, lpo, errstatus);
2580 			break;
2581 		}
2582 
2583 		if (lpo != NULL)
2584 			HeapFree(hHeapHandle, 0, lpo);
2585 	}
2586 
2587 	/*
2588 	 * Exit Completion Port Thread
2589 	 */
2590 	manager_log(manager, TRACE,
2591 		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2592 				   ISC_MSG_EXITING, "SocketIoThread exiting"));
2593 	return ((isc_threadresult_t)0);
2594 }
2595 
2596 /*
2597  * Create a new socket manager.
2598  */
2599 isc_result_t
isc__socketmgr_create(isc_mem_t * mctx,isc_socketmgr_t ** managerp)2600 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2601 	return (isc_socketmgr_create2(mctx, managerp, 0));
2602 }
2603 
2604 isc_result_t
isc__socketmgr_create2(isc_mem_t * mctx,isc_socketmgr_t ** managerp,unsigned int maxsocks)2605 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2606 		       unsigned int maxsocks)
2607 {
2608 	isc_socketmgr_t *manager;
2609 	isc_result_t result;
2610 
2611 	REQUIRE(managerp != NULL && *managerp == NULL);
2612 
2613 	if (maxsocks != 0)
2614 		return (ISC_R_NOTIMPLEMENTED);
2615 
2616 	manager = isc_mem_get(mctx, sizeof(*manager));
2617 	if (manager == NULL)
2618 		return (ISC_R_NOMEMORY);
2619 
2620 	InitSockets();
2621 
2622 	manager->magic = SOCKET_MANAGER_MAGIC;
2623 	manager->mctx = NULL;
2624 	manager->stats = NULL;
2625 	ISC_LIST_INIT(manager->socklist);
2626 	result = isc_mutex_init(&manager->lock);
2627 	if (result != ISC_R_SUCCESS) {
2628 		isc_mem_put(mctx, manager, sizeof(*manager));
2629 		return (result);
2630 	}
2631 	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2632 		DESTROYLOCK(&manager->lock);
2633 		isc_mem_put(mctx, manager, sizeof(*manager));
2634 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2635 				 "isc_condition_init() %s",
2636 				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2637 						ISC_MSG_FAILED, "failed"));
2638 		return (ISC_R_UNEXPECTED);
2639 	}
2640 
2641 	isc_mem_attach(mctx, &manager->mctx);
2642 
2643 	iocompletionport_init(manager);	/* Create the Completion Ports */
2644 
2645 	manager->bShutdown = ISC_FALSE;
2646 	manager->totalSockets = 0;
2647 	manager->iocp_total = 0;
2648 
2649 	*managerp = manager;
2650 
2651 	return (ISC_R_SUCCESS);
2652 }
2653 
2654 isc_result_t
isc__socketmgr_getmaxsockets(isc_socketmgr_t * manager,unsigned int * nsockp)2655 isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
2656 	REQUIRE(VALID_MANAGER(manager));
2657 	REQUIRE(nsockp != NULL);
2658 
2659 	return (ISC_R_NOTIMPLEMENTED);
2660 }
2661 
2662 void
isc__socketmgr_setstats(isc_socketmgr_t * manager,isc_stats_t * stats)2663 isc__socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
2664 	REQUIRE(VALID_MANAGER(manager));
2665 	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
2666 	REQUIRE(manager->stats == NULL);
2667 	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
2668 
2669 	isc_stats_attach(stats, &manager->stats);
2670 }
2671 
2672 void
isc__socketmgr_destroy(isc_socketmgr_t ** managerp)2673 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2674 	isc_socketmgr_t *manager;
2675 	int i;
2676 	isc_mem_t *mctx;
2677 
2678 	/*
2679 	 * Destroy a socket manager.
2680 	 */
2681 
2682 	REQUIRE(managerp != NULL);
2683 	manager = *managerp;
2684 	REQUIRE(VALID_MANAGER(manager));
2685 
2686 	LOCK(&manager->lock);
2687 
2688 	/*
2689 	 * Wait for all sockets to be destroyed.
2690 	 */
2691 	while (!ISC_LIST_EMPTY(manager->socklist)) {
2692 		manager_log(manager, CREATION,
2693 			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2694 					   ISC_MSG_SOCKETSREMAIN,
2695 					   "sockets exist"));
2696 		WAIT(&manager->shutdown_ok, &manager->lock);
2697 	}
2698 
2699 	UNLOCK(&manager->lock);
2700 
2701 	/*
2702 	 * Here, we need to had some wait code for the completion port
2703 	 * thread.
2704 	 */
2705 	signal_iocompletionport_exit(manager);
2706 	manager->bShutdown = ISC_TRUE;
2707 
2708 	/*
2709 	 * Wait for threads to exit.
2710 	 */
2711 	for (i = 0; i < manager->maxIOCPThreads; i++) {
2712 		if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
2713 			NULL) != ISC_R_SUCCESS)
2714 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2715 				 "isc_thread_join() for Completion Port %s",
2716 				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2717 						ISC_MSG_FAILED, "failed"));
2718 	}
2719 	/*
2720 	 * Clean up.
2721 	 */
2722 
2723 	CloseHandle(manager->hIoCompletionPort);
2724 
2725 	(void)isc_condition_destroy(&manager->shutdown_ok);
2726 
2727 	DESTROYLOCK(&manager->lock);
2728 	if (manager->stats != NULL)
2729 		isc_stats_detach(&manager->stats);
2730 	manager->magic = 0;
2731 	mctx= manager->mctx;
2732 	isc_mem_put(mctx, manager, sizeof(*manager));
2733 
2734 	isc_mem_detach(&mctx);
2735 
2736 	*managerp = NULL;
2737 }
2738 
2739 static void
queue_receive_event(isc_socket_t * sock,isc_task_t * task,isc_socketevent_t * dev)2740 queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev)
2741 {
2742 	isc_task_t *ntask = NULL;
2743 
2744 	isc_task_attach(task, &ntask);
2745 	dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2746 
2747 	/*
2748 	 * Enqueue the request.
2749 	 */
2750 	INSIST(!ISC_LINK_LINKED(dev, ev_link));
2751 	ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2752 
2753 	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2754 		   "queue_receive_event: event %p -> task %p",
2755 		   dev, ntask);
2756 }
2757 
2758 /*
2759  * Check the pending receive queue, and if we have data pending, give it to this
2760  * caller.  If we have none, queue an I/O request.  If this caller is not the first
2761  * on the list, then we will just queue this event and return.
2762  *
2763  * Caller must have the socket locked.
2764  */
2765 static isc_result_t
socket_recv(isc_socket_t * sock,isc_socketevent_t * dev,isc_task_t * task,unsigned int flags)2766 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2767 	    unsigned int flags)
2768 {
2769 	int cc = 0;
2770 	isc_task_t *ntask = NULL;
2771 	isc_result_t result = ISC_R_SUCCESS;
2772 	int recv_errno = 0;
2773 
2774 	dev->ev_sender = task;
2775 
2776 	if (sock->fd == INVALID_SOCKET)
2777 		return (ISC_R_EOF);
2778 
2779 	/*
2780 	 * Queue our event on the list of things to do.  Call our function to
2781 	 * attempt to fill buffers as much as possible, and return done events.
2782 	 * We are going to lie about our handling of the ISC_SOCKFLAG_IMMEDIATE
2783 	 * here and tell our caller that we could not satisfy it immediately.
2784 	 */
2785 	queue_receive_event(sock, task, dev);
2786 	if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2787 		result = ISC_R_INPROGRESS;
2788 
2789 	completeio_recv(sock);
2790 
2791 	/*
2792 	 * If there are more receivers waiting for data, queue another receive
2793 	 * here.  If the
2794 	 */
2795 	queue_receive_request(sock);
2796 
2797 	return (result);
2798 }
2799 
2800 isc_result_t
isc__socket_recvv(isc_socket_t * sock,isc_bufferlist_t * buflist,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,const void * arg)2801 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2802 		 unsigned int minimum, isc_task_t *task,
2803 		 isc_taskaction_t action, const void *arg)
2804 {
2805 	isc_socketevent_t *dev;
2806 	isc_socketmgr_t *manager;
2807 	unsigned int iocount;
2808 	isc_buffer_t *buffer;
2809 	isc_result_t ret;
2810 
2811 	REQUIRE(VALID_SOCKET(sock));
2812 	LOCK(&sock->lock);
2813 	CONSISTENT(sock);
2814 
2815 	/*
2816 	 * Make sure that the socket is not closed.  XXXMLG change error here?
2817 	 */
2818 	if (sock->fd == INVALID_SOCKET) {
2819 		UNLOCK(&sock->lock);
2820 		return (ISC_R_CONNREFUSED);
2821 	}
2822 
2823 	REQUIRE(buflist != NULL);
2824 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2825 	REQUIRE(task != NULL);
2826 	REQUIRE(action != NULL);
2827 
2828 	manager = sock->manager;
2829 	REQUIRE(VALID_MANAGER(manager));
2830 
2831 	iocount = isc_bufferlist_availablecount(buflist);
2832 	REQUIRE(iocount > 0);
2833 
2834 	INSIST(sock->bound);
2835 
2836 	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2837 	if (dev == NULL) {
2838 		UNLOCK(&sock->lock);
2839 		return (ISC_R_NOMEMORY);
2840 	}
2841 
2842 	/*
2843 	 * UDP sockets are always partial read
2844 	 */
2845 	if (sock->type == isc_sockettype_udp)
2846 		dev->minimum = 1;
2847 	else {
2848 		if (minimum == 0)
2849 			dev->minimum = iocount;
2850 		else
2851 			dev->minimum = minimum;
2852 	}
2853 
2854 	/*
2855 	 * Move each buffer from the passed in list to our internal one.
2856 	 */
2857 	buffer = ISC_LIST_HEAD(*buflist);
2858 	while (buffer != NULL) {
2859 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2860 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2861 		buffer = ISC_LIST_HEAD(*buflist);
2862 	}
2863 
2864 	ret = socket_recv(sock, dev, task, 0);
2865 
2866 	UNLOCK(&sock->lock);
2867 	return (ret);
2868 }
2869 
2870 isc_result_t
isc__socket_recv(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,const void * arg)2871 isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
2872 		 unsigned int minimum, isc_task_t *task,
2873 		 isc_taskaction_t action, const void *arg)
2874 {
2875 	isc_socketevent_t *dev;
2876 	isc_socketmgr_t *manager;
2877 	isc_result_t ret;
2878 
2879 	REQUIRE(VALID_SOCKET(sock));
2880 	LOCK(&sock->lock);
2881 	CONSISTENT(sock);
2882 
2883 	/*
2884 	 * make sure that the socket's not closed
2885 	 */
2886 	if (sock->fd == INVALID_SOCKET) {
2887 		UNLOCK(&sock->lock);
2888 		return (ISC_R_CONNREFUSED);
2889 	}
2890 	REQUIRE(action != NULL);
2891 
2892 	manager = sock->manager;
2893 	REQUIRE(VALID_MANAGER(manager));
2894 
2895 	INSIST(sock->bound);
2896 
2897 	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2898 	if (dev == NULL) {
2899 		UNLOCK(&sock->lock);
2900 		return (ISC_R_NOMEMORY);
2901 	}
2902 
2903 	ret = isc_socket_recv2(sock, region, minimum, task, dev, 0);
2904 	UNLOCK(&sock->lock);
2905 	return (ret);
2906 }
2907 
2908 isc_result_t
isc__socket_recv2(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_socketevent_t * event,unsigned int flags)2909 isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
2910 		  unsigned int minimum, isc_task_t *task,
2911 		  isc_socketevent_t *event, unsigned int flags)
2912 {
2913 	isc_result_t ret;
2914 
2915 	REQUIRE(VALID_SOCKET(sock));
2916 	LOCK(&sock->lock);
2917 	CONSISTENT(sock);
2918 
2919 	event->result = ISC_R_UNEXPECTED;
2920 	event->ev_sender = sock;
2921 	/*
2922 	 * make sure that the socket's not closed
2923 	 */
2924 	if (sock->fd == INVALID_SOCKET) {
2925 		UNLOCK(&sock->lock);
2926 		return (ISC_R_CONNREFUSED);
2927 	}
2928 
2929 	ISC_LIST_INIT(event->bufferlist);
2930 	event->region = *region;
2931 	event->n = 0;
2932 	event->offset = 0;
2933 	event->attributes = 0;
2934 
2935 	/*
2936 	 * UDP sockets are always partial read.
2937 	 */
2938 	if (sock->type == isc_sockettype_udp)
2939 		event->minimum = 1;
2940 	else {
2941 		if (minimum == 0)
2942 			event->minimum = region->length;
2943 		else
2944 			event->minimum = minimum;
2945 	}
2946 
2947 	ret = socket_recv(sock, event, task, flags);
2948 	UNLOCK(&sock->lock);
2949 	return (ret);
2950 }
2951 
2952 /*
2953  * Caller must have the socket locked.
2954  */
2955 static isc_result_t
socket_send(isc_socket_t * sock,isc_socketevent_t * dev,isc_task_t * task,isc_sockaddr_t * address,struct in6_pktinfo * pktinfo,unsigned int flags)2956 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2957 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2958 	    unsigned int flags)
2959 {
2960 	int io_state;
2961 	int send_errno = 0;
2962 	int cc = 0;
2963 	isc_task_t *ntask = NULL;
2964 	isc_result_t result = ISC_R_SUCCESS;
2965 
2966 	dev->ev_sender = task;
2967 
2968 	set_dev_address(address, sock, dev);
2969 	if (pktinfo != NULL) {
2970 		socket_log(__LINE__, sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2971 			   ISC_MSG_PKTINFOPROVIDED,
2972 			   "pktinfo structure provided, ifindex %u (set to 0)",
2973 			   pktinfo->ipi6_ifindex);
2974 
2975 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2976 		dev->pktinfo = *pktinfo;
2977 		/*
2978 		 * Set the pktinfo index to 0 here, to let the kernel decide
2979 		 * what interface it should send on.
2980 		 */
2981 		dev->pktinfo.ipi6_ifindex = 0;
2982 	}
2983 
2984 	io_state = startio_send(sock, dev, &cc, &send_errno);
2985 	switch (io_state) {
2986 	case DOIO_PENDING:	/* I/O started. Nothing more to do */
2987 	case DOIO_SOFT:
2988 		/*
2989 		 * We couldn't send all or part of the request right now, so
2990 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2991 		 */
2992 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2993 			isc_task_attach(task, &ntask);
2994 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2995 
2996 			/*
2997 			 * Enqueue the request.
2998 			 */
2999 			INSIST(!ISC_LINK_LINKED(dev, ev_link));
3000 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
3001 
3002 			socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
3003 				   "socket_send: event %p -> task %p",
3004 				   dev, ntask);
3005 
3006 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3007 				result = ISC_R_INPROGRESS;
3008 			break;
3009 		}
3010 
3011 	case DOIO_SUCCESS:
3012 		break;
3013 	}
3014 
3015 	return (result);
3016 }
3017 
3018 isc_result_t
isc__socket_send(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,const void * arg)3019 isc__socket_send(isc_socket_t *sock, isc_region_t *region,
3020 		 isc_task_t *task, isc_taskaction_t action, const void *arg)
3021 {
3022 	/*
3023 	 * REQUIRE() checking is performed in isc_socket_sendto().
3024 	 */
3025 	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3026 				  NULL));
3027 }
3028 
3029 isc_result_t
isc__socket_sendto(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,const void * arg,isc_sockaddr_t * address,struct in6_pktinfo * pktinfo)3030 isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
3031 		   isc_task_t *task, isc_taskaction_t action, const void *arg,
3032 		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3033 {
3034 	isc_socketevent_t *dev;
3035 	isc_socketmgr_t *manager;
3036 	isc_result_t ret;
3037 
3038 	REQUIRE(VALID_SOCKET(sock));
3039 	REQUIRE(sock->type != isc_sockettype_fdwatch);
3040 
3041 	LOCK(&sock->lock);
3042 	CONSISTENT(sock);
3043 
3044 	/*
3045 	 * make sure that the socket's not closed
3046 	 */
3047 	if (sock->fd == INVALID_SOCKET) {
3048 		UNLOCK(&sock->lock);
3049 		return (ISC_R_CONNREFUSED);
3050 	}
3051 	REQUIRE(region != NULL);
3052 	REQUIRE(task != NULL);
3053 	REQUIRE(action != NULL);
3054 
3055 	manager = sock->manager;
3056 	REQUIRE(VALID_MANAGER(manager));
3057 
3058 	INSIST(sock->bound);
3059 
3060 	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3061 	if (dev == NULL) {
3062 		UNLOCK(&sock->lock);
3063 		return (ISC_R_NOMEMORY);
3064 	}
3065 	dev->region = *region;
3066 
3067 	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3068 	UNLOCK(&sock->lock);
3069 	return (ret);
3070 }
3071 
3072 isc_result_t
isc__socket_sendv(isc_socket_t * sock,isc_bufferlist_t * buflist,isc_task_t * task,isc_taskaction_t action,const void * arg)3073 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3074 		  isc_task_t *task, isc_taskaction_t action, const void *arg)
3075 {
3076 	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
3077 				   NULL));
3078 }
3079 
3080 isc_result_t
isc__socket_sendtov(isc_socket_t * sock,isc_bufferlist_t * buflist,isc_task_t * task,isc_taskaction_t action,const void * arg,isc_sockaddr_t * address,struct in6_pktinfo * pktinfo)3081 isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3082 		    isc_task_t *task, isc_taskaction_t action, const void *arg,
3083 		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3084 {
3085 	isc_socketevent_t *dev;
3086 	isc_socketmgr_t *manager;
3087 	unsigned int iocount;
3088 	isc_buffer_t *buffer;
3089 	isc_result_t ret;
3090 
3091 	REQUIRE(VALID_SOCKET(sock));
3092 
3093 	LOCK(&sock->lock);
3094 	CONSISTENT(sock);
3095 
3096 	/*
3097 	 * make sure that the socket's not closed
3098 	 */
3099 	if (sock->fd == INVALID_SOCKET) {
3100 		UNLOCK(&sock->lock);
3101 		return (ISC_R_CONNREFUSED);
3102 	}
3103 	REQUIRE(buflist != NULL);
3104 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
3105 	REQUIRE(task != NULL);
3106 	REQUIRE(action != NULL);
3107 
3108 	manager = sock->manager;
3109 	REQUIRE(VALID_MANAGER(manager));
3110 
3111 	iocount = isc_bufferlist_usedcount(buflist);
3112 	REQUIRE(iocount > 0);
3113 
3114 	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3115 	if (dev == NULL) {
3116 		UNLOCK(&sock->lock);
3117 		return (ISC_R_NOMEMORY);
3118 	}
3119 
3120 	/*
3121 	 * Move each buffer from the passed in list to our internal one.
3122 	 */
3123 	buffer = ISC_LIST_HEAD(*buflist);
3124 	while (buffer != NULL) {
3125 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
3126 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3127 		buffer = ISC_LIST_HEAD(*buflist);
3128 	}
3129 
3130 	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3131 	UNLOCK(&sock->lock);
3132 	return (ret);
3133 }
3134 
3135 isc_result_t
isc__socket_sendto2(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_sockaddr_t * address,struct in6_pktinfo * pktinfo,isc_socketevent_t * event,unsigned int flags)3136 isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3137 		    isc_task_t *task,
3138 		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3139 		    isc_socketevent_t *event, unsigned int flags)
3140 {
3141 	isc_result_t ret;
3142 
3143 	REQUIRE(VALID_SOCKET(sock));
3144 	LOCK(&sock->lock);
3145 	CONSISTENT(sock);
3146 
3147 	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3148 	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3149 		REQUIRE(sock->type == isc_sockettype_udp);
3150 	event->ev_sender = sock;
3151 	event->result = ISC_R_UNEXPECTED;
3152 	/*
3153 	 * make sure that the socket's not closed
3154 	 */
3155 	if (sock->fd == INVALID_SOCKET) {
3156 		UNLOCK(&sock->lock);
3157 		return (ISC_R_CONNREFUSED);
3158 	}
3159 	ISC_LIST_INIT(event->bufferlist);
3160 	event->region = *region;
3161 	event->n = 0;
3162 	event->offset = 0;
3163 	event->attributes = 0;
3164 
3165 	ret = socket_send(sock, event, task, address, pktinfo, flags);
3166 	UNLOCK(&sock->lock);
3167 	return (ret);
3168 }
3169 
3170 isc_result_t
isc__socket_bind(isc_socket_t * sock,isc_sockaddr_t * sockaddr,unsigned int options)3171 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3172 		 unsigned int options) {
3173 	int bind_errno;
3174 	char strbuf[ISC_STRERRORSIZE];
3175 	int on = 1;
3176 
3177 	REQUIRE(VALID_SOCKET(sock));
3178 	LOCK(&sock->lock);
3179 	CONSISTENT(sock);
3180 
3181 	/*
3182 	 * make sure that the socket's not closed
3183 	 */
3184 	if (sock->fd == INVALID_SOCKET) {
3185 		UNLOCK(&sock->lock);
3186 		return (ISC_R_CONNREFUSED);
3187 	}
3188 
3189 	INSIST(!sock->bound);
3190 	INSIST(!sock->dupped);
3191 
3192 	if (sock->pf != sockaddr->type.sa.sa_family) {
3193 		UNLOCK(&sock->lock);
3194 		return (ISC_R_FAMILYMISMATCH);
3195 	}
3196 	/*
3197 	 * Only set SO_REUSEADDR when we want a specific port.
3198 	 */
3199 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3200 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3201 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
3202 		       sizeof(on)) < 0) {
3203 		UNEXPECTED_ERROR(__FILE__, __LINE__,
3204 				 "setsockopt(%d) %s", sock->fd,
3205 				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3206 						ISC_MSG_FAILED, "failed"));
3207 		/* Press on... */
3208 	}
3209 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3210 		bind_errno = WSAGetLastError();
3211 		UNLOCK(&sock->lock);
3212 		switch (bind_errno) {
3213 		case WSAEACCES:
3214 			return (ISC_R_NOPERM);
3215 		case WSAEADDRNOTAVAIL:
3216 			return (ISC_R_ADDRNOTAVAIL);
3217 		case WSAEADDRINUSE:
3218 			return (ISC_R_ADDRINUSE);
3219 		case WSAEINVAL:
3220 			return (ISC_R_BOUND);
3221 		default:
3222 			isc__strerror(bind_errno, strbuf, sizeof(strbuf));
3223 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3224 					 strbuf);
3225 			return (ISC_R_UNEXPECTED);
3226 		}
3227 	}
3228 
3229 	socket_log(__LINE__, sock, sockaddr, TRACE,
3230 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3231 	sock->bound = 1;
3232 
3233 	UNLOCK(&sock->lock);
3234 	return (ISC_R_SUCCESS);
3235 }
3236 
3237 isc_result_t
isc__socket_filter(isc_socket_t * sock,const char * filter)3238 isc__socket_filter(isc_socket_t *sock, const char *filter) {
3239 	UNUSED(sock);
3240 	UNUSED(filter);
3241 
3242 	REQUIRE(VALID_SOCKET(sock));
3243 	return (ISC_R_NOTIMPLEMENTED);
3244 }
3245 
3246 /*
3247  * Set up to listen on a given socket.  We do this by creating an internal
3248  * event that will be dispatched when the socket has read activity.  The
3249  * watcher will send the internal event to the task when there is a new
3250  * connection.
3251  *
3252  * Unlike in read, we don't preallocate a done event here.  Every time there
3253  * is a new connection we'll have to allocate a new one anyway, so we might
3254  * as well keep things simple rather than having to track them.
3255  */
3256 isc_result_t
isc__socket_listen(isc_socket_t * sock,unsigned int backlog)3257 isc__socket_listen(isc_socket_t *sock, unsigned int backlog) {
3258 	char strbuf[ISC_STRERRORSIZE];
3259 
3260 	REQUIRE(VALID_SOCKET(sock));
3261 
3262 	LOCK(&sock->lock);
3263 	CONSISTENT(sock);
3264 
3265 	/*
3266 	 * make sure that the socket's not closed
3267 	 */
3268 	if (sock->fd == INVALID_SOCKET) {
3269 		UNLOCK(&sock->lock);
3270 		return (ISC_R_CONNREFUSED);
3271 	}
3272 
3273 	REQUIRE(!sock->listener);
3274 	REQUIRE(sock->bound);
3275 	REQUIRE(sock->type == isc_sockettype_tcp);
3276 
3277 	if (backlog == 0)
3278 		backlog = SOMAXCONN;
3279 
3280 	if (listen(sock->fd, (int)backlog) < 0) {
3281 		UNLOCK(&sock->lock);
3282 		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3283 
3284 		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3285 
3286 		return (ISC_R_UNEXPECTED);
3287 	}
3288 
3289 	socket_log(__LINE__, sock, NULL, TRACE,
3290 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "listening");
3291 	sock->listener = 1;
3292 	_set_state(sock, SOCK_LISTEN);
3293 
3294 	UNLOCK(&sock->lock);
3295 	return (ISC_R_SUCCESS);
3296 }
3297 
3298 /*
3299  * This should try to do aggressive accept() XXXMLG
3300  */
3301 isc_result_t
isc__socket_accept(isc_socket_t * sock,isc_task_t * task,isc_taskaction_t action,const void * arg)3302 isc__socket_accept(isc_socket_t *sock,
3303 		   isc_task_t *task, isc_taskaction_t action, const void *arg)
3304 {
3305 	isc_socket_newconnev_t *adev;
3306 	isc_socketmgr_t *manager;
3307 	isc_task_t *ntask = NULL;
3308 	isc_socket_t *nsock;
3309 	isc_result_t result;
3310 	IoCompletionInfo *lpo;
3311 
3312 	REQUIRE(VALID_SOCKET(sock));
3313 
3314 	manager = sock->manager;
3315 	REQUIRE(VALID_MANAGER(manager));
3316 
3317 	LOCK(&sock->lock);
3318 	CONSISTENT(sock);
3319 
3320 	/*
3321 	 * make sure that the socket's not closed
3322 	 */
3323 	if (sock->fd == INVALID_SOCKET) {
3324 		UNLOCK(&sock->lock);
3325 		return (ISC_R_CONNREFUSED);
3326 	}
3327 
3328 	REQUIRE(sock->listener);
3329 
3330 	/*
3331 	 * Sender field is overloaded here with the task we will be sending
3332 	 * this event to.  Just before the actual event is delivered the
3333 	 * actual ev_sender will be touched up to be the socket.
3334 	 */
3335 	adev = (isc_socket_newconnev_t *)
3336 		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3337 				   action, arg, sizeof(*adev));
3338 	if (adev == NULL) {
3339 		UNLOCK(&sock->lock);
3340 		return (ISC_R_NOMEMORY);
3341 	}
3342 	ISC_LINK_INIT(adev, ev_link);
3343 
3344 	result = allocate_socket(manager, sock->type, &nsock);
3345 	if (result != ISC_R_SUCCESS) {
3346 		isc_event_free((isc_event_t **)&adev);
3347 		UNLOCK(&sock->lock);
3348 		return (result);
3349 	}
3350 
3351 	/*
3352 	 * AcceptEx() requires we pass in a socket.
3353 	 */
3354 	nsock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
3355 	if (nsock->fd == INVALID_SOCKET) {
3356 		free_socket(&nsock, __LINE__);
3357 		isc_event_free((isc_event_t **)&adev);
3358 		UNLOCK(&sock->lock);
3359 		return (ISC_R_FAILURE); // XXXMLG need real error message
3360 	}
3361 
3362 	/*
3363 	 * Attach to socket and to task.
3364 	 */
3365 	isc_task_attach(task, &ntask);
3366 	if (isc_task_exiting(ntask)) {
3367 		free_socket(&nsock, __LINE__);
3368 		isc_task_detach(&ntask);
3369 		isc_event_free(ISC_EVENT_PTR(&adev));
3370 		UNLOCK(&sock->lock);
3371 		return (ISC_R_SHUTTINGDOWN);
3372 	}
3373 	nsock->references++;
3374 
3375 	adev->ev_sender = ntask;
3376 	adev->newsocket = nsock;
3377 	_set_state(nsock, SOCK_ACCEPT);
3378 
3379 	/*
3380 	 * Queue io completion for an accept().
3381 	 */
3382 	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3383 					    HEAP_ZERO_MEMORY,
3384 					    sizeof(IoCompletionInfo));
3385 	RUNTIME_CHECK(lpo != NULL);
3386 	lpo->acceptbuffer = (void *)HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
3387 		(sizeof(SOCKADDR_STORAGE) + 16) * 2);
3388 	RUNTIME_CHECK(lpo->acceptbuffer != NULL);
3389 
3390 	lpo->adev = adev;
3391 	lpo->request_type = SOCKET_ACCEPT;
3392 
3393 	ISCAcceptEx(sock->fd,
3394 		    nsock->fd,				/* Accepted Socket */
3395 		    lpo->acceptbuffer,			/* Buffer for initial Recv */
3396 		    0,					/* Length of Buffer */
3397 		    sizeof(SOCKADDR_STORAGE) + 16,		/* Local address length + 16 */
3398 		    sizeof(SOCKADDR_STORAGE) + 16,		/* Remote address lengh + 16 */
3399 		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
3400 		    (LPOVERLAPPED)lpo			/* Overlapped structure */
3401 		    );
3402 	iocompletionport_update(nsock);
3403 
3404 	socket_log(__LINE__, sock, NULL, TRACE,
3405 		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND,
3406 		   "accepting for nsock %p fd %d", nsock, nsock->fd);
3407 
3408 	/*
3409 	 * Enqueue the event
3410 	 */
3411 	ISC_LIST_ENQUEUE(sock->accept_list, adev, ev_link);
3412 	sock->pending_accept++;
3413 	sock->pending_iocp++;
3414 
3415 	UNLOCK(&sock->lock);
3416 	return (ISC_R_SUCCESS);
3417 }
3418 
3419 isc_result_t
isc__socket_connect(isc_socket_t * sock,isc_sockaddr_t * addr,isc_task_t * task,isc_taskaction_t action,const void * arg)3420 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3421 		    isc_task_t *task, isc_taskaction_t action, const void *arg)
3422 {
3423 	char strbuf[ISC_STRERRORSIZE];
3424 	isc_socket_connev_t *cdev;
3425 	isc_task_t *ntask = NULL;
3426 	isc_socketmgr_t *manager;
3427 	IoCompletionInfo *lpo;
3428 	int bind_errno;
3429 
3430 	REQUIRE(VALID_SOCKET(sock));
3431 	REQUIRE(addr != NULL);
3432 	REQUIRE(task != NULL);
3433 	REQUIRE(action != NULL);
3434 
3435 	manager = sock->manager;
3436 	REQUIRE(VALID_MANAGER(manager));
3437 	REQUIRE(addr != NULL);
3438 
3439 	if (isc_sockaddr_ismulticast(addr))
3440 		return (ISC_R_MULTICAST);
3441 
3442 	LOCK(&sock->lock);
3443 	CONSISTENT(sock);
3444 
3445 	/*
3446 	 * make sure that the socket's not closed
3447 	 */
3448 	if (sock->fd == INVALID_SOCKET) {
3449 		UNLOCK(&sock->lock);
3450 		return (ISC_R_CONNREFUSED);
3451 	}
3452 
3453 	/*
3454 	 * Windows sockets won't connect unless the socket is bound.
3455 	 */
3456 	if (!sock->bound) {
3457 		isc_sockaddr_t any;
3458 
3459 		isc_sockaddr_anyofpf(&any, isc_sockaddr_pf(addr));
3460 		if (bind(sock->fd, &any.type.sa, any.length) < 0) {
3461 			bind_errno = WSAGetLastError();
3462 			UNLOCK(&sock->lock);
3463 			switch (bind_errno) {
3464 			case WSAEACCES:
3465 				return (ISC_R_NOPERM);
3466 			case WSAEADDRNOTAVAIL:
3467 				return (ISC_R_ADDRNOTAVAIL);
3468 			case WSAEADDRINUSE:
3469 				return (ISC_R_ADDRINUSE);
3470 			case WSAEINVAL:
3471 				return (ISC_R_BOUND);
3472 			default:
3473 				isc__strerror(bind_errno, strbuf,
3474 					      sizeof(strbuf));
3475 				UNEXPECTED_ERROR(__FILE__, __LINE__,
3476 						 "bind: %s", strbuf);
3477 				return (ISC_R_UNEXPECTED);
3478 			}
3479 		}
3480 		sock->bound = 1;
3481 	}
3482 
3483 	REQUIRE(!sock->pending_connect);
3484 
3485 	cdev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3486 							ISC_SOCKEVENT_CONNECT,
3487 							action,	arg,
3488 							sizeof(*cdev));
3489 	if (cdev == NULL) {
3490 		UNLOCK(&sock->lock);
3491 		return (ISC_R_NOMEMORY);
3492 	}
3493 	ISC_LINK_INIT(cdev, ev_link);
3494 
3495 	if (sock->type == isc_sockettype_tcp) {
3496 		/*
3497 		 * Queue io completion for an accept().
3498 		 */
3499 		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3500 						    HEAP_ZERO_MEMORY,
3501 						    sizeof(IoCompletionInfo));
3502 		lpo->cdev = cdev;
3503 		lpo->request_type = SOCKET_CONNECT;
3504 
3505 		sock->address = *addr;
3506 		ISCConnectEx(sock->fd, &addr->type.sa, addr->length,
3507 			NULL, 0, NULL, (LPOVERLAPPED)lpo);
3508 
3509 		/*
3510 		 * Attach to task.
3511 		 */
3512 		isc_task_attach(task, &ntask);
3513 		cdev->ev_sender = ntask;
3514 
3515 		sock->pending_connect = 1;
3516 		_set_state(sock, SOCK_CONNECT);
3517 
3518 		/*
3519 		 * Enqueue the request.
3520 		 */
3521 		sock->connect_ev = cdev;
3522 		sock->pending_iocp++;
3523 	} else {
3524 		WSAConnect(sock->fd, &addr->type.sa, addr->length, NULL, NULL, NULL, NULL);
3525 		cdev->result = ISC_R_SUCCESS;
3526 		isc_task_send(task, (isc_event_t **)&cdev);
3527 	}
3528 	CONSISTENT(sock);
3529 	UNLOCK(&sock->lock);
3530 
3531 	return (ISC_R_SUCCESS);
3532 }
3533 
3534 isc_result_t
isc__socket_getpeername(isc_socket_t * sock,isc_sockaddr_t * addressp)3535 isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3536 	isc_result_t result;
3537 
3538 	REQUIRE(VALID_SOCKET(sock));
3539 	REQUIRE(addressp != NULL);
3540 
3541 	LOCK(&sock->lock);
3542 	CONSISTENT(sock);
3543 
3544 	/*
3545 	 * make sure that the socket's not closed
3546 	 */
3547 	if (sock->fd == INVALID_SOCKET) {
3548 		UNLOCK(&sock->lock);
3549 		return (ISC_R_CONNREFUSED);
3550 	}
3551 
3552 	if (sock->connected) {
3553 		*addressp = sock->address;
3554 		result = ISC_R_SUCCESS;
3555 	} else {
3556 		result = ISC_R_NOTCONNECTED;
3557 	}
3558 
3559 	UNLOCK(&sock->lock);
3560 
3561 	return (result);
3562 }
3563 
3564 isc_result_t
isc__socket_getsockname(isc_socket_t * sock,isc_sockaddr_t * addressp)3565 isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3566 	ISC_SOCKADDR_LEN_T len;
3567 	isc_result_t result;
3568 	char strbuf[ISC_STRERRORSIZE];
3569 
3570 	REQUIRE(VALID_SOCKET(sock));
3571 	REQUIRE(addressp != NULL);
3572 
3573 	LOCK(&sock->lock);
3574 	CONSISTENT(sock);
3575 
3576 	/*
3577 	 * make sure that the socket's not closed
3578 	 */
3579 	if (sock->fd == INVALID_SOCKET) {
3580 		UNLOCK(&sock->lock);
3581 		return (ISC_R_CONNREFUSED);
3582 	}
3583 
3584 	if (!sock->bound) {
3585 		result = ISC_R_NOTBOUND;
3586 		goto out;
3587 	}
3588 
3589 	result = ISC_R_SUCCESS;
3590 
3591 	len = sizeof(addressp->type);
3592 	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3593 		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3594 		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3595 				 strbuf);
3596 		result = ISC_R_UNEXPECTED;
3597 		goto out;
3598 	}
3599 	addressp->length = (unsigned int)len;
3600 
3601  out:
3602 	UNLOCK(&sock->lock);
3603 
3604 	return (result);
3605 }
3606 
3607 /*
3608  * Run through the list of events on this socket, and cancel the ones
3609  * queued for task "task" of type "how".  "how" is a bitmask.
3610  */
3611 void
isc__socket_cancel(isc_socket_t * sock,isc_task_t * task,unsigned int how)3612 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3613 
3614 	REQUIRE(VALID_SOCKET(sock));
3615 
3616 	/*
3617 	 * Quick exit if there is nothing to do.  Don't even bother locking
3618 	 * in this case.
3619 	 */
3620 	if (how == 0)
3621 		return;
3622 
3623 	LOCK(&sock->lock);
3624 	CONSISTENT(sock);
3625 
3626 	/*
3627 	 * make sure that the socket's not closed
3628 	 */
3629 	if (sock->fd == INVALID_SOCKET) {
3630 		UNLOCK(&sock->lock);
3631 		return;
3632 	}
3633 
3634 	/*
3635 	 * All of these do the same thing, more or less.
3636 	 * Each will:
3637 	 *	o If the internal event is marked as "posted" try to
3638 	 *	  remove it from the task's queue.  If this fails, mark it
3639 	 *	  as canceled instead, and let the task clean it up later.
3640 	 *	o For each I/O request for that task of that type, post
3641 	 *	  its done event with status of "ISC_R_CANCELED".
3642 	 *	o Reset any state needed.
3643 	 */
3644 
3645 	if ((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) {
3646 		isc_socketevent_t      *dev;
3647 		isc_socketevent_t      *next;
3648 		isc_task_t	       *current_task;
3649 
3650 		dev = ISC_LIST_HEAD(sock->recv_list);
3651 		while (dev != NULL) {
3652 			current_task = dev->ev_sender;
3653 			next = ISC_LIST_NEXT(dev, ev_link);
3654 			if ((task == NULL) || (task == current_task)) {
3655 				dev->result = ISC_R_CANCELED;
3656 				send_recvdone_event(sock, &dev);
3657 			}
3658 			dev = next;
3659 		}
3660 	}
3661 	how &= ~ISC_SOCKCANCEL_RECV;
3662 
3663 	if ((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) {
3664 		isc_socketevent_t      *dev;
3665 		isc_socketevent_t      *next;
3666 		isc_task_t	       *current_task;
3667 
3668 		dev = ISC_LIST_HEAD(sock->send_list);
3669 
3670 		while (dev != NULL) {
3671 			current_task = dev->ev_sender;
3672 			next = ISC_LIST_NEXT(dev, ev_link);
3673 			if ((task == NULL) || (task == current_task)) {
3674 				dev->result = ISC_R_CANCELED;
3675 				send_senddone_event(sock, &dev);
3676 			}
3677 			dev = next;
3678 		}
3679 	}
3680 	how &= ~ISC_SOCKCANCEL_SEND;
3681 
3682 	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3683 	    && !ISC_LIST_EMPTY(sock->accept_list)) {
3684 		isc_socket_newconnev_t *dev;
3685 		isc_socket_newconnev_t *next;
3686 		isc_task_t	       *current_task;
3687 
3688 		dev = ISC_LIST_HEAD(sock->accept_list);
3689 		while (dev != NULL) {
3690 			current_task = dev->ev_sender;
3691 			next = ISC_LIST_NEXT(dev, ev_link);
3692 
3693 			if ((task == NULL) || (task == current_task)) {
3694 
3695 				dev->newsocket->references--;
3696 				closesocket(dev->newsocket->fd);
3697 				dev->newsocket->fd = INVALID_SOCKET;
3698 				free_socket(&dev->newsocket, __LINE__);
3699 
3700 				dev->result = ISC_R_CANCELED;
3701 				send_acceptdone_event(sock, &dev);
3702 			}
3703 
3704 			dev = next;
3705 		}
3706 	}
3707 	how &= ~ISC_SOCKCANCEL_ACCEPT;
3708 
3709 	/*
3710 	 * Connecting is not a list.
3711 	 */
3712 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3713 	    && sock->connect_ev != NULL) {
3714 		isc_socket_connev_t    *dev;
3715 		isc_task_t	       *current_task;
3716 
3717 		INSIST(sock->pending_connect);
3718 
3719 		dev = sock->connect_ev;
3720 		current_task = dev->ev_sender;
3721 
3722 		if ((task == NULL) || (task == current_task)) {
3723 			closesocket(sock->fd);
3724 			sock->fd = INVALID_SOCKET;
3725 			_set_state(sock, SOCK_CLOSED);
3726 
3727 			sock->connect_ev = NULL;
3728 			dev->result = ISC_R_CANCELED;
3729 			send_connectdone_event(sock, &dev);
3730 		}
3731 	}
3732 	how &= ~ISC_SOCKCANCEL_CONNECT;
3733 
3734 	maybe_free_socket(&sock, __LINE__);
3735 }
3736 
3737 isc_sockettype_t
isc__socket_gettype(isc_socket_t * sock)3738 isc__socket_gettype(isc_socket_t *sock) {
3739 	isc_sockettype_t type;
3740 
3741 	REQUIRE(VALID_SOCKET(sock));
3742 
3743 	LOCK(&sock->lock);
3744 
3745 	/*
3746 	 * make sure that the socket's not closed
3747 	 */
3748 	if (sock->fd == INVALID_SOCKET) {
3749 		UNLOCK(&sock->lock);
3750 		return (ISC_R_CONNREFUSED);
3751 	}
3752 
3753 	type = sock->type;
3754 	UNLOCK(&sock->lock);
3755 	return (type);
3756 }
3757 
3758 isc_boolean_t
isc__socket_isbound(isc_socket_t * sock)3759 isc__socket_isbound(isc_socket_t *sock) {
3760 	isc_boolean_t val;
3761 
3762 	REQUIRE(VALID_SOCKET(sock));
3763 
3764 	LOCK(&sock->lock);
3765 	CONSISTENT(sock);
3766 
3767 	/*
3768 	 * make sure that the socket's not closed
3769 	 */
3770 	if (sock->fd == INVALID_SOCKET) {
3771 		UNLOCK(&sock->lock);
3772 		return (ISC_FALSE);
3773 	}
3774 
3775 	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3776 	UNLOCK(&sock->lock);
3777 
3778 	return (val);
3779 }
3780 
3781 void
isc__socket_ipv6only(isc_socket_t * sock,isc_boolean_t yes)3782 isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3783 #if defined(IPV6_V6ONLY)
3784 	int onoff = yes ? 1 : 0;
3785 #else
3786 	UNUSED(yes);
3787 #endif
3788 
3789 	REQUIRE(VALID_SOCKET(sock));
3790 
3791 #ifdef IPV6_V6ONLY
3792 	if (sock->pf == AF_INET6) {
3793 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3794 				 (char *)&onoff, sizeof(onoff));
3795 	}
3796 #endif
3797 }
3798 
3799 void
isc__socket_cleanunix(isc_sockaddr_t * addr,isc_boolean_t active)3800 isc__socket_cleanunix(isc_sockaddr_t *addr, isc_boolean_t active) {
3801 	UNUSED(addr);
3802 	UNUSED(active);
3803 }
3804 
3805 isc_result_t
isc__socket_permunix(isc_sockaddr_t * addr,isc_uint32_t perm,isc_uint32_t owner,isc_uint32_t group)3806 isc__socket_permunix(isc_sockaddr_t *addr, isc_uint32_t perm,
3807 		     isc_uint32_t owner,	isc_uint32_t group)
3808 {
3809 	UNUSED(addr);
3810 	UNUSED(perm);
3811 	UNUSED(owner);
3812 	UNUSED(group);
3813 	return (ISC_R_NOTIMPLEMENTED);
3814 }
3815 
3816 void
isc__socket_setname(isc_socket_t * socket,const char * name,void * tag)3817 isc__socket_setname(isc_socket_t *socket, const char *name, void *tag) {
3818 
3819 	/*
3820 	 * Name 'socket'.
3821 	 */
3822 
3823 	REQUIRE(VALID_SOCKET(socket));
3824 
3825 	LOCK(&socket->lock);
3826 	memset(socket->name, 0, sizeof(socket->name));
3827 	strncpy(socket->name, name, sizeof(socket->name) - 1);
3828 	socket->tag = tag;
3829 	UNLOCK(&socket->lock);
3830 }
3831 
3832 const char *
isc__socket_getname(isc_socket_t * socket)3833 isc__socket_getname(isc_socket_t *socket) {
3834 	return (socket->name);
3835 }
3836 
3837 void *
isc__socket_gettag(isc_socket_t * socket)3838 isc__socket_gettag(isc_socket_t *socket) {
3839 	return (socket->tag);
3840 }
3841 
3842 int
isc__socket_getfd(isc_socket_t * socket)3843 isc__socket_getfd(isc_socket_t *socket) {
3844 	return ((short) socket->fd);
3845 }
3846 
3847 void
isc__socketmgr_setreserved(isc_socketmgr_t * manager,isc_uint32_t reserved)3848 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3849 	UNUSED(manager);
3850 	UNUSED(reserved);
3851 }
3852 
3853 void
isc___socketmgr_maxudp(isc_socketmgr_t * manager,int maxudp)3854 isc___socketmgr_maxudp(isc_socketmgr_t *manager, int maxudp) {
3855 
3856 	UNUSED(manager);
3857 	UNUSED(maxudp);
3858 }
3859 
3860 #ifdef HAVE_LIBXML2
3861 
3862 static const char *
_socktype(isc_sockettype_t type)3863 _socktype(isc_sockettype_t type)
3864 {
3865 	if (type == isc_sockettype_udp)
3866 		return ("udp");
3867 	else if (type == isc_sockettype_tcp)
3868 		return ("tcp");
3869 	else if (type == isc_sockettype_unix)
3870 		return ("unix");
3871 	else if (type == isc_sockettype_fdwatch)
3872 		return ("fdwatch");
3873 	else
3874 		return ("not-initialized");
3875 }
3876 
3877 void
isc_socketmgr_renderxml(isc_socketmgr_t * mgr,xmlTextWriterPtr writer)3878 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
3879 {
3880 	isc_socket_t *sock;
3881 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3882 	isc_sockaddr_t addr;
3883 	ISC_SOCKADDR_LEN_T len;
3884 
3885 	LOCK(&mgr->lock);
3886 
3887 #ifndef ISC_PLATFORM_USETHREADS
3888 	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3889 	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
3890 	xmlTextWriterEndElement(writer);
3891 #endif
3892 
3893 	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
3894 	sock = ISC_LIST_HEAD(mgr->socklist);
3895 	while (sock != NULL) {
3896 		LOCK(&sock->lock);
3897 		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
3898 
3899 		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
3900 		xmlTextWriterWriteFormatString(writer, "%p", sock);
3901 		xmlTextWriterEndElement(writer);
3902 
3903 		if (sock->name[0] != 0) {
3904 			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
3905 			xmlTextWriterWriteFormatString(writer, "%s",
3906 						       sock->name);
3907 			xmlTextWriterEndElement(writer); /* name */
3908 		}
3909 
3910 		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3911 		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
3912 		xmlTextWriterEndElement(writer);
3913 
3914 		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
3915 					  ISC_XMLCHAR _socktype(sock->type));
3916 
3917 		if (sock->connected) {
3918 			isc_sockaddr_format(&sock->address, peerbuf,
3919 					    sizeof(peerbuf));
3920 			xmlTextWriterWriteElement(writer,
3921 						  ISC_XMLCHAR "peer-address",
3922 						  ISC_XMLCHAR peerbuf);
3923 		}
3924 
3925 		len = sizeof(addr);
3926 		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
3927 			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
3928 			xmlTextWriterWriteElement(writer,
3929 						  ISC_XMLCHAR "local-address",
3930 						  ISC_XMLCHAR peerbuf);
3931 		}
3932 
3933 		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
3934 		if (sock->pending_recv)
3935 			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3936 						ISC_XMLCHAR "pending-receive");
3937 		if (sock->pending_send)
3938 			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3939 						  ISC_XMLCHAR "pending-send");
3940 		if (sock->pending_accept)
3941 			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3942 						 ISC_XMLCHAR "pending_accept");
3943 		if (sock->listener)
3944 			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3945 						  ISC_XMLCHAR "listener");
3946 		if (sock->connected)
3947 			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3948 						  ISC_XMLCHAR "connected");
3949 		if (sock->pending_connect)
3950 			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3951 						  ISC_XMLCHAR "connecting");
3952 		if (sock->bound)
3953 			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3954 						  ISC_XMLCHAR "bound");
3955 
3956 		xmlTextWriterEndElement(writer); /* states */
3957 
3958 		xmlTextWriterEndElement(writer); /* socket */
3959 
3960 		UNLOCK(&sock->lock);
3961 		sock = ISC_LIST_NEXT(sock, link);
3962 	}
3963 	xmlTextWriterEndElement(writer); /* sockets */
3964 
3965 	UNLOCK(&mgr->lock);
3966 }
3967 #endif /* HAVE_LIBXML2 */
3968