1// Copyright 2017 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package poll
6
7import (
8	"errors"
9	"internal/race"
10	"internal/syscall/windows"
11	"io"
12	"sync"
13	"syscall"
14	"unicode/utf16"
15	"unicode/utf8"
16	"unsafe"
17)
18
var (
	// initErr holds the error returned by WSAStartup in init, if any.
	// FD.Init returns it to its caller when it is non-nil.
	initErr error
	// ioSync is the address handed to race.Acquire and race.ReleaseMerge
	// around network reads and writes when the race detector is enabled.
	ioSync  uint64
)
23
// This package uses the SetFileCompletionNotificationModes Windows
// API to skip calling GetQueuedCompletionStatus if an IO operation
// completes synchronously. There is a known bug where
// SetFileCompletionNotificationModes crashes on some systems (see
// https://support.microsoft.com/kb/2568167 for details).

var useSetFileCompletionNotificationModes bool // determines if SetFileCompletionNotificationModes is present and safe to use
31
32// checkSetFileCompletionNotificationModes verifies that
33// SetFileCompletionNotificationModes Windows API is present
34// on the system and is safe to use.
35// See https://support.microsoft.com/kb/2568167 for details.
36func checkSetFileCompletionNotificationModes() {
37	err := syscall.LoadSetFileCompletionNotificationModes()
38	if err != nil {
39		return
40	}
41	protos := [2]int32{syscall.IPPROTO_TCP, 0}
42	var buf [32]syscall.WSAProtocolInfo
43	len := uint32(unsafe.Sizeof(buf))
44	n, err := syscall.WSAEnumProtocols(&protos[0], &buf[0], &len)
45	if err != nil {
46		return
47	}
48	for i := int32(0); i < n; i++ {
49		if buf[i].ServiceFlags1&syscall.XP1_IFS_HANDLES == 0 {
50			return
51		}
52	}
53	useSetFileCompletionNotificationModes = true
54}
55
56func init() {
57	var d syscall.WSAData
58	e := syscall.WSAStartup(uint32(0x202), &d)
59	if e != nil {
60		initErr = e
61	}
62	checkSetFileCompletionNotificationModes()
63}
64
// operation contains a superset of the data necessary to perform all async IO.
// Each FD owns two of them: one for reads (rop) and one for writes (wop).
type operation struct {
	// Used by IOCP interface, it must be the first field
	// of the struct, as our code relies on it.
	o syscall.Overlapped

	// fields used by runtime.netpoll
	runtimeCtx uintptr // copied from fd.pd.runtimeCtx by FD.Init
	mode       int32   // 'r' or 'w', set by FD.Init
	errno      int32   // non-zero when the completed operation failed; read by execIO
	qty        uint32  // byte count of the completed operation; read by execIO

	// fields used only by net package
	fd     *FD                     // owning descriptor
	buf    syscall.WSABuf          // single buffer, filled in by InitBuf
	msg    windows.WSAMsg          // message header, filled in by InitMsg
	sa     syscall.Sockaddr        // peer address for WSASendto and ConnectEx
	rsa    *syscall.RawSockaddrAny // receives the source address in ReadFrom/ReadMsg
	rsan   int32                   // size in bytes of the raw address buffer
	handle syscall.Handle          // accepted socket handle passed to AcceptFunc
	flags  uint32                  // flags passed by pointer to WSARecv/WSARecvFrom
	bufs   []syscall.WSABuf        // buffer list for Writev, filled in by InitBufs
}
88
89func (o *operation) InitBuf(buf []byte) {
90	o.buf.Len = uint32(len(buf))
91	o.buf.Buf = nil
92	if len(buf) != 0 {
93		o.buf.Buf = &buf[0]
94	}
95}
96
97func (o *operation) InitBufs(buf *[][]byte) {
98	if o.bufs == nil {
99		o.bufs = make([]syscall.WSABuf, 0, len(*buf))
100	} else {
101		o.bufs = o.bufs[:0]
102	}
103	for _, b := range *buf {
104		if len(b) == 0 {
105			o.bufs = append(o.bufs, syscall.WSABuf{})
106			continue
107		}
108		for len(b) > maxRW {
109			o.bufs = append(o.bufs, syscall.WSABuf{Len: maxRW, Buf: &b[0]})
110			b = b[maxRW:]
111		}
112		if len(b) > 0 {
113			o.bufs = append(o.bufs, syscall.WSABuf{Len: uint32(len(b)), Buf: &b[0]})
114		}
115	}
116}
117
118// ClearBufs clears all pointers to Buffers parameter captured
119// by InitBufs, so it can be released by garbage collector.
120func (o *operation) ClearBufs() {
121	for i := range o.bufs {
122		o.bufs[i].Buf = nil
123	}
124	o.bufs = o.bufs[:0]
125}
126
127func (o *operation) InitMsg(p []byte, oob []byte) {
128	o.InitBuf(p)
129	o.msg.Buffers = &o.buf
130	o.msg.BufferCount = 1
131
132	o.msg.Name = nil
133	o.msg.Namelen = 0
134
135	o.msg.Flags = 0
136	o.msg.Control.Len = uint32(len(oob))
137	o.msg.Control.Buf = nil
138	if len(oob) != 0 {
139		o.msg.Control.Buf = &oob[0]
140	}
141}
142
// execIO executes a single IO operation o. It registers the operation
// with the runtime poller, starts it by calling submit, and then waits
// for the completion notification. If the wait is interrupted by a
// close or a deadline, the request is cancelled with CancelIoEx and
// execIO waits for the cancellation to finish.
//
// It returns the number of bytes transferred (o.qty) and an error.
// The byte count is also returned alongside ERROR_MORE_DATA and
// WSAEMSGSIZE; for every other error the count is 0.
// execIO fails immediately if the descriptor was not registered with
// the poller (fd.pd.runtimeCtx == 0).
func execIO(o *operation, submit func(o *operation) error) (int, error) {
	if o.fd.pd.runtimeCtx == 0 {
		return 0, errors.New("internal error: polling on unsupported descriptor type")
	}

	fd := o.fd
	// Notify runtime netpoll about starting IO.
	err := fd.pd.prepare(int(o.mode), fd.isFile)
	if err != nil {
		return 0, err
	}
	// Start IO.
	err = submit(o)
	switch err {
	case nil:
		// IO completed immediately
		if o.fd.skipSyncNotif {
			// No completion message will follow, so return immediately.
			return int(o.qty), nil
		}
		// Need to get our completion message anyway.
	case syscall.ERROR_IO_PENDING:
		// IO started, and we have to wait for its completion.
		err = nil
	default:
		// The request could not be started at all.
		return 0, err
	}
	// Wait for our request to complete.
	err = fd.pd.wait(int(o.mode), fd.isFile)
	if err == nil {
		// All is good. Extract our IO results and return.
		if o.errno != 0 {
			err = syscall.Errno(o.errno)
			// More data available. Return back the size of received data.
			if err == syscall.ERROR_MORE_DATA || err == windows.WSAEMSGSIZE {
				return int(o.qty), err
			}
			return 0, err
		}
		return int(o.qty), nil
	}
	// IO is interrupted by "close" or "timeout"
	netpollErr := err
	switch netpollErr {
	case ErrNetClosing, ErrFileClosing, ErrDeadlineExceeded:
		// will deal with those.
	default:
		// Any other error from the poller is treated as a programming error.
		panic("unexpected runtime.netpoll error: " + netpollErr.Error())
	}
	// Cancel our request.
	err = syscall.CancelIoEx(fd.Sysfd, &o.o)
	// Assuming ERROR_NOT_FOUND is returned, if IO is completed.
	if err != nil && err != syscall.ERROR_NOT_FOUND {
		// TODO(brainman): maybe do something else, but panic.
		panic(err)
	}
	// Wait for cancellation to complete.
	fd.pd.waitCanceled(int(o.mode))
	if o.errno != 0 {
		err = syscall.Errno(o.errno)
		if err == syscall.ERROR_OPERATION_ABORTED { // IO Canceled
			// Report the reason for the cancellation (close or deadline)
			// rather than the generic "aborted" errno.
			err = netpollErr
		}
		return 0, err
	}
	// We issued a cancellation request. But, it seems, IO operation succeeded
	// before the cancellation request run. We need to treat the IO operation as
	// succeeded (the bytes are actually sent/recv from network).
	return int(o.qty), nil
}
217
// FD is a file descriptor. The net and os packages embed this type in
// a larger type representing a network connection or OS file.
type FD struct {
	// Lock sysfd and serialize access to Read and Write methods.
	fdmu fdMutex

	// System file descriptor. Immutable until Close.
	Sysfd syscall.Handle

	// Read operation.
	rop operation
	// Write operation.
	wop operation

	// I/O poller.
	pd pollDesc

	// Used to implement pread/pwrite. It is also held by Read and
	// Write on files and by Seek, so that the file position is not
	// changed underneath them.
	l sync.Mutex

	// For console I/O.
	lastbits       []byte   // first few bytes of the last incomplete rune in last write
	readuint16     []uint16 // buffer to hold uint16s obtained with ReadConsole
	readbyte       []byte   // buffer to hold decoding of readuint16 from utf16 to utf8
	readbyteOffset int      // readbyte[readbyteOffset:] is yet to be consumed with file.Read

	// Semaphore signaled when file is closed.
	// It is released by destroy and acquired by Close.
	csema uint32

	// Whether FILE_SKIP_COMPLETION_PORT_ON_SUCCESS was successfully
	// enabled by Init. When true, execIO returns right away after an
	// operation that completed synchronously instead of waiting for a
	// completion notification.
	skipSyncNotif bool

	// Whether this is a streaming descriptor, as opposed to a
	// packet-based descriptor like a UDP socket.
	IsStream bool

	// Whether a zero byte read indicates EOF. This is false for a
	// message based socket connection.
	ZeroReadIsEOF bool

	// Whether this is a file rather than a network socket.
	// Init sets it to true for every kind except kindNet.
	isFile bool

	// The kind of this file.
	kind fileKind
}
263
// fileKind describes the kind of file.
// FD.Init derives it from the network name it is given.
type fileKind byte

const (
	kindNet fileKind = iota // a network socket ("tcp", "udp", "ip", "unix", ...)
	kindFile                // "file"
	kindConsole             // "console"
	kindDir                 // "dir"; closed with FindClose
	kindPipe                // "pipe"; Close cancels its pending I/O with CancelIoEx
)
274
// logInitFD is set by tests to enable file descriptor initialization logging.
// When non-nil, FD.Init calls it with the network name, the FD and the
// error (possibly nil) produced by registering the FD with the poller.
var logInitFD func(net string, fd *FD, err error)
277
278// Init initializes the FD. The Sysfd field should already be set.
279// This can be called multiple times on a single FD.
280// The net argument is a network name from the net package (e.g., "tcp"),
281// or "file" or "console" or "dir".
282// Set pollable to true if fd should be managed by runtime netpoll.
283func (fd *FD) Init(net string, pollable bool) (string, error) {
284	if initErr != nil {
285		return "", initErr
286	}
287
288	switch net {
289	case "file":
290		fd.kind = kindFile
291	case "console":
292		fd.kind = kindConsole
293	case "dir":
294		fd.kind = kindDir
295	case "pipe":
296		fd.kind = kindPipe
297	case "tcp", "tcp4", "tcp6",
298		"udp", "udp4", "udp6",
299		"ip", "ip4", "ip6",
300		"unix", "unixgram", "unixpacket":
301		fd.kind = kindNet
302	default:
303		return "", errors.New("internal error: unknown network type " + net)
304	}
305	fd.isFile = fd.kind != kindNet
306
307	var err error
308	if pollable {
309		// Only call init for a network socket.
310		// This means that we don't add files to the runtime poller.
311		// Adding files to the runtime poller can confuse matters
312		// if the user is doing their own overlapped I/O.
313		// See issue #21172.
314		//
315		// In general the code below avoids calling the execIO
316		// function for non-network sockets. If some method does
317		// somehow call execIO, then execIO, and therefore the
318		// calling method, will return an error, because
319		// fd.pd.runtimeCtx will be 0.
320		err = fd.pd.init(fd)
321	}
322	if logInitFD != nil {
323		logInitFD(net, fd, err)
324	}
325	if err != nil {
326		return "", err
327	}
328	if pollable && useSetFileCompletionNotificationModes {
329		// We do not use events, so we can skip them always.
330		flags := uint8(syscall.FILE_SKIP_SET_EVENT_ON_HANDLE)
331		// It's not safe to skip completion notifications for UDP:
332		// https://docs.microsoft.com/en-us/archive/blogs/winserverperformance/designing-applications-for-high-performance-part-iii
333		if net == "tcp" {
334			flags |= syscall.FILE_SKIP_COMPLETION_PORT_ON_SUCCESS
335		}
336		err := syscall.SetFileCompletionNotificationModes(fd.Sysfd, flags)
337		if err == nil && flags&syscall.FILE_SKIP_COMPLETION_PORT_ON_SUCCESS != 0 {
338			fd.skipSyncNotif = true
339		}
340	}
341	// Disable SIO_UDP_CONNRESET behavior.
342	// http://support.microsoft.com/kb/263823
343	switch net {
344	case "udp", "udp4", "udp6":
345		ret := uint32(0)
346		flag := uint32(0)
347		size := uint32(unsafe.Sizeof(flag))
348		err := syscall.WSAIoctl(fd.Sysfd, syscall.SIO_UDP_CONNRESET, (*byte)(unsafe.Pointer(&flag)), size, nil, 0, &ret, nil, 0)
349		if err != nil {
350			return "wsaioctl", err
351		}
352	}
353	fd.rop.mode = 'r'
354	fd.wop.mode = 'w'
355	fd.rop.fd = fd
356	fd.wop.fd = fd
357	fd.rop.runtimeCtx = fd.pd.runtimeCtx
358	fd.wop.runtimeCtx = fd.pd.runtimeCtx
359	return "", nil
360}
361
362func (fd *FD) destroy() error {
363	if fd.Sysfd == syscall.InvalidHandle {
364		return syscall.EINVAL
365	}
366	// Poller may want to unregister fd in readiness notification mechanism,
367	// so this must be executed before fd.CloseFunc.
368	fd.pd.close()
369	var err error
370	switch fd.kind {
371	case kindNet:
372		// The net package uses the CloseFunc variable for testing.
373		err = CloseFunc(fd.Sysfd)
374	case kindDir:
375		err = syscall.FindClose(fd.Sysfd)
376	default:
377		err = syscall.CloseHandle(fd.Sysfd)
378	}
379	fd.Sysfd = syscall.InvalidHandle
380	runtime_Semrelease(&fd.csema)
381	return err
382}
383
384// Close closes the FD. The underlying file descriptor is closed by
385// the destroy method when there are no remaining references.
386func (fd *FD) Close() error {
387	if !fd.fdmu.increfAndClose() {
388		return errClosing(fd.isFile)
389	}
390	if fd.kind == kindPipe {
391		syscall.CancelIoEx(fd.Sysfd, nil)
392	}
393	// unblock pending reader and writer
394	fd.pd.evict()
395	err := fd.decref()
396	// Wait until the descriptor is closed. If this was the only
397	// reference, it is already closed.
398	runtime_Semacquire(&fd.csema)
399	return err
400}
401
// Windows ReadFile and WSARecv use a DWORD (uint32) parameter to pass the buffer length.
// This prevents us from reading blocks larger than 4GB.
// See golang.org/issue/26923.
// maxRW is the largest number of bytes handed to a single read or write call.
const maxRW = 1 << 30 // 1GB is large enough and keeps subsequent reads aligned
406
407// Read implements io.Reader.
408func (fd *FD) Read(buf []byte) (int, error) {
409	if err := fd.readLock(); err != nil {
410		return 0, err
411	}
412	defer fd.readUnlock()
413
414	if len(buf) > maxRW {
415		buf = buf[:maxRW]
416	}
417
418	var n int
419	var err error
420	if fd.isFile {
421		fd.l.Lock()
422		defer fd.l.Unlock()
423		switch fd.kind {
424		case kindConsole:
425			n, err = fd.readConsole(buf)
426		default:
427			n, err = syscall.Read(fd.Sysfd, buf)
428			if fd.kind == kindPipe && err == syscall.ERROR_OPERATION_ABORTED {
429				// Close uses CancelIoEx to interrupt concurrent I/O for pipes.
430				// If the fd is a pipe and the Read was interrupted by CancelIoEx,
431				// we assume it is interrupted by Close.
432				err = ErrFileClosing
433			}
434		}
435		if err != nil {
436			n = 0
437		}
438	} else {
439		o := &fd.rop
440		o.InitBuf(buf)
441		n, err = execIO(o, func(o *operation) error {
442			return syscall.WSARecv(o.fd.Sysfd, &o.buf, 1, &o.qty, &o.flags, &o.o, nil)
443		})
444		if race.Enabled {
445			race.Acquire(unsafe.Pointer(&ioSync))
446		}
447	}
448	if len(buf) != 0 {
449		err = fd.eofError(n, err)
450	}
451	return n, err
452}
453
// ReadConsole is the function readConsole uses to read UTF-16 code
// units from a console handle. It is a variable so that tests can
// substitute their own implementation.
var ReadConsole = syscall.ReadConsole // changed for testing
455
// readConsole reads utf16 characters from console File,
// encodes them into utf8 and stores them in buffer b.
// It returns the number of utf8 bytes read and an error, if any.
//
// Decoded bytes that do not fit into b stay in fd.readbyte and are
// handed out by later calls before the console is read again.
// A Ctrl-Z byte (0x1A) stops the copy; when it is the first pending
// byte it is consumed and 0 bytes are returned.
func (fd *FD) readConsole(b []byte) (int, error) {
	if len(b) == 0 {
		return 0, nil
	}

	if fd.readuint16 == nil {
		// Note: syscall.ReadConsole fails for very large buffers.
		// The limit is somewhere around (but not exactly) 16384.
		// Stay well below.
		fd.readuint16 = make([]uint16, 0, 10000)
		// A single uint16 is encoded into at most 4 bytes here.
		fd.readbyte = make([]byte, 0, 4*cap(fd.readuint16))
	}

	// Refill readbyte only once everything decoded earlier has been consumed.
	for fd.readbyteOffset >= len(fd.readbyte) {
		// Ask for no more code units than there is room for, and no
		// more than the caller's buffer length.
		n := cap(fd.readuint16) - len(fd.readuint16)
		if n > len(b) {
			n = len(b)
		}
		var nw uint32
		// Read into the first free slot of readuint16, right after a
		// half surrogate saved by a previous iteration (if any).
		err := ReadConsole(fd.Sysfd, &fd.readuint16[:len(fd.readuint16)+1][len(fd.readuint16)], uint32(n), &nw, nil)
		if err != nil {
			return 0, err
		}
		// uint16s covers the saved code unit (if any) plus the nw new ones.
		uint16s := fd.readuint16[:len(fd.readuint16)+int(nw)]
		fd.readuint16 = fd.readuint16[:0]
		buf := fd.readbyte[:0]
		for i := 0; i < len(uint16s); i++ {
			r := rune(uint16s[i])
			if utf16.IsSurrogate(r) {
				if i+1 == len(uint16s) {
					if nw > 0 {
						// Save half surrogate pair for next time.
						fd.readuint16 = fd.readuint16[:1]
						fd.readuint16[0] = uint16(r)
						break
					}
					// Nothing more was read, so the pair can never be
					// completed: emit the replacement character.
					r = utf8.RuneError
				} else {
					r = utf16.DecodeRune(r, rune(uint16s[i+1]))
					if r != utf8.RuneError {
						// Valid pair: skip its second half.
						i++
					}
				}
			}
			n := utf8.EncodeRune(buf[len(buf):cap(buf)], r)
			buf = buf[:len(buf)+n]
		}
		fd.readbyte = buf
		fd.readbyteOffset = 0
		if nw == 0 {
			// The console returned no data; stop instead of looping.
			break
		}
	}

	// Copy pending bytes to the caller, stopping at Ctrl-Z.
	src := fd.readbyte[fd.readbyteOffset:]
	var i int
	for i = 0; i < len(src) && i < len(b); i++ {
		x := src[i]
		if x == 0x1A { // Ctrl-Z
			if i == 0 {
				// Consume a leading Ctrl-Z so the next call moves past it.
				fd.readbyteOffset++
			}
			break
		}
		b[i] = x
	}
	fd.readbyteOffset += i
	return i, nil
}
528
529// Pread emulates the Unix pread system call.
530func (fd *FD) Pread(b []byte, off int64) (int, error) {
531	// Call incref, not readLock, because since pread specifies the
532	// offset it is independent from other reads.
533	if err := fd.incref(); err != nil {
534		return 0, err
535	}
536	defer fd.decref()
537
538	if len(b) > maxRW {
539		b = b[:maxRW]
540	}
541
542	fd.l.Lock()
543	defer fd.l.Unlock()
544	curoffset, e := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent)
545	if e != nil {
546		return 0, e
547	}
548	defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
549	o := syscall.Overlapped{
550		OffsetHigh: uint32(off >> 32),
551		Offset:     uint32(off),
552	}
553	var done uint32
554	e = syscall.ReadFile(fd.Sysfd, b, &done, &o)
555	if e != nil {
556		done = 0
557		if e == syscall.ERROR_HANDLE_EOF {
558			e = io.EOF
559		}
560	}
561	if len(b) != 0 {
562		e = fd.eofError(int(done), e)
563	}
564	return int(done), e
565}
566
567// ReadFrom wraps the recvfrom network call.
568func (fd *FD) ReadFrom(buf []byte) (int, syscall.Sockaddr, error) {
569	if len(buf) == 0 {
570		return 0, nil, nil
571	}
572	if len(buf) > maxRW {
573		buf = buf[:maxRW]
574	}
575	if err := fd.readLock(); err != nil {
576		return 0, nil, err
577	}
578	defer fd.readUnlock()
579	o := &fd.rop
580	o.InitBuf(buf)
581	n, err := execIO(o, func(o *operation) error {
582		if o.rsa == nil {
583			o.rsa = new(syscall.RawSockaddrAny)
584		}
585		o.rsan = int32(unsafe.Sizeof(*o.rsa))
586		return syscall.WSARecvFrom(o.fd.Sysfd, &o.buf, 1, &o.qty, &o.flags, o.rsa, &o.rsan, &o.o, nil)
587	})
588	err = fd.eofError(n, err)
589	if err != nil {
590		return n, nil, err
591	}
592	sa, _ := o.rsa.Sockaddr()
593	return n, sa, nil
594}
595
596// Write implements io.Writer.
597func (fd *FD) Write(buf []byte) (int, error) {
598	if err := fd.writeLock(); err != nil {
599		return 0, err
600	}
601	defer fd.writeUnlock()
602	if fd.isFile {
603		fd.l.Lock()
604		defer fd.l.Unlock()
605	}
606
607	ntotal := 0
608	for len(buf) > 0 {
609		b := buf
610		if len(b) > maxRW {
611			b = b[:maxRW]
612		}
613		var n int
614		var err error
615		if fd.isFile {
616			switch fd.kind {
617			case kindConsole:
618				n, err = fd.writeConsole(b)
619			default:
620				n, err = syscall.Write(fd.Sysfd, b)
621				if fd.kind == kindPipe && err == syscall.ERROR_OPERATION_ABORTED {
622					// Close uses CancelIoEx to interrupt concurrent I/O for pipes.
623					// If the fd is a pipe and the Write was interrupted by CancelIoEx,
624					// we assume it is interrupted by Close.
625					err = ErrFileClosing
626				}
627			}
628			if err != nil {
629				n = 0
630			}
631		} else {
632			if race.Enabled {
633				race.ReleaseMerge(unsafe.Pointer(&ioSync))
634			}
635			o := &fd.wop
636			o.InitBuf(b)
637			n, err = execIO(o, func(o *operation) error {
638				return syscall.WSASend(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, &o.o, nil)
639			})
640		}
641		ntotal += n
642		if err != nil {
643			return ntotal, err
644		}
645		buf = buf[n:]
646	}
647	return ntotal, nil
648}
649
650// writeConsole writes len(b) bytes to the console File.
651// It returns the number of bytes written and an error, if any.
652func (fd *FD) writeConsole(b []byte) (int, error) {
653	n := len(b)
654	runes := make([]rune, 0, 256)
655	if len(fd.lastbits) > 0 {
656		b = append(fd.lastbits, b...)
657		fd.lastbits = nil
658
659	}
660	for len(b) >= utf8.UTFMax || utf8.FullRune(b) {
661		r, l := utf8.DecodeRune(b)
662		runes = append(runes, r)
663		b = b[l:]
664	}
665	if len(b) > 0 {
666		fd.lastbits = make([]byte, len(b))
667		copy(fd.lastbits, b)
668	}
669	// syscall.WriteConsole seems to fail, if given large buffer.
670	// So limit the buffer to 16000 characters. This number was
671	// discovered by experimenting with syscall.WriteConsole.
672	const maxWrite = 16000
673	for len(runes) > 0 {
674		m := len(runes)
675		if m > maxWrite {
676			m = maxWrite
677		}
678		chunk := runes[:m]
679		runes = runes[m:]
680		uint16s := utf16.Encode(chunk)
681		for len(uint16s) > 0 {
682			var written uint32
683			err := syscall.WriteConsole(fd.Sysfd, &uint16s[0], uint32(len(uint16s)), &written, nil)
684			if err != nil {
685				return 0, err
686			}
687			uint16s = uint16s[written:]
688		}
689	}
690	return n, nil
691}
692
693// Pwrite emulates the Unix pwrite system call.
694func (fd *FD) Pwrite(buf []byte, off int64) (int, error) {
695	// Call incref, not writeLock, because since pwrite specifies the
696	// offset it is independent from other writes.
697	if err := fd.incref(); err != nil {
698		return 0, err
699	}
700	defer fd.decref()
701
702	fd.l.Lock()
703	defer fd.l.Unlock()
704	curoffset, e := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent)
705	if e != nil {
706		return 0, e
707	}
708	defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
709
710	ntotal := 0
711	for len(buf) > 0 {
712		b := buf
713		if len(b) > maxRW {
714			b = b[:maxRW]
715		}
716		var n uint32
717		o := syscall.Overlapped{
718			OffsetHigh: uint32(off >> 32),
719			Offset:     uint32(off),
720		}
721		e = syscall.WriteFile(fd.Sysfd, b, &n, &o)
722		ntotal += int(n)
723		if e != nil {
724			return ntotal, e
725		}
726		buf = buf[n:]
727		off += int64(n)
728	}
729	return ntotal, nil
730}
731
732// Writev emulates the Unix writev system call.
733func (fd *FD) Writev(buf *[][]byte) (int64, error) {
734	if len(*buf) == 0 {
735		return 0, nil
736	}
737	if err := fd.writeLock(); err != nil {
738		return 0, err
739	}
740	defer fd.writeUnlock()
741	if race.Enabled {
742		race.ReleaseMerge(unsafe.Pointer(&ioSync))
743	}
744	o := &fd.wop
745	o.InitBufs(buf)
746	n, err := execIO(o, func(o *operation) error {
747		return syscall.WSASend(o.fd.Sysfd, &o.bufs[0], uint32(len(o.bufs)), &o.qty, 0, &o.o, nil)
748	})
749	o.ClearBufs()
750	TestHookDidWritev(n)
751	consume(buf, int64(n))
752	return int64(n), err
753}
754
755// WriteTo wraps the sendto network call.
756func (fd *FD) WriteTo(buf []byte, sa syscall.Sockaddr) (int, error) {
757	if err := fd.writeLock(); err != nil {
758		return 0, err
759	}
760	defer fd.writeUnlock()
761
762	if len(buf) == 0 {
763		// handle zero-byte payload
764		o := &fd.wop
765		o.InitBuf(buf)
766		o.sa = sa
767		n, err := execIO(o, func(o *operation) error {
768			return syscall.WSASendto(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, o.sa, &o.o, nil)
769		})
770		return n, err
771	}
772
773	ntotal := 0
774	for len(buf) > 0 {
775		b := buf
776		if len(b) > maxRW {
777			b = b[:maxRW]
778		}
779		o := &fd.wop
780		o.InitBuf(b)
781		o.sa = sa
782		n, err := execIO(o, func(o *operation) error {
783			return syscall.WSASendto(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, o.sa, &o.o, nil)
784		})
785		ntotal += int(n)
786		if err != nil {
787			return ntotal, err
788		}
789		buf = buf[n:]
790	}
791	return ntotal, nil
792}
793
794// Call ConnectEx. This doesn't need any locking, since it is only
795// called when the descriptor is first created. This is here rather
796// than in the net package so that it can use fd.wop.
797func (fd *FD) ConnectEx(ra syscall.Sockaddr) error {
798	o := &fd.wop
799	o.sa = ra
800	_, err := execIO(o, func(o *operation) error {
801		return ConnectExFunc(o.fd.Sysfd, o.sa, nil, 0, nil, &o.o)
802	})
803	return err
804}
805
806func (fd *FD) acceptOne(s syscall.Handle, rawsa []syscall.RawSockaddrAny, o *operation) (string, error) {
807	// Submit accept request.
808	o.handle = s
809	o.rsan = int32(unsafe.Sizeof(rawsa[0]))
810	_, err := execIO(o, func(o *operation) error {
811		return AcceptFunc(o.fd.Sysfd, o.handle, (*byte)(unsafe.Pointer(&rawsa[0])), 0, uint32(o.rsan), uint32(o.rsan), &o.qty, &o.o)
812	})
813	if err != nil {
814		CloseFunc(s)
815		return "acceptex", err
816	}
817
818	// Inherit properties of the listening socket.
819	err = syscall.Setsockopt(s, syscall.SOL_SOCKET, syscall.SO_UPDATE_ACCEPT_CONTEXT, (*byte)(unsafe.Pointer(&fd.Sysfd)), int32(unsafe.Sizeof(fd.Sysfd)))
820	if err != nil {
821		CloseFunc(s)
822		return "setsockopt", err
823	}
824
825	return "", nil
826}
827
828// Accept handles accepting a socket. The sysSocket parameter is used
829// to allocate the net socket.
830func (fd *FD) Accept(sysSocket func() (syscall.Handle, error)) (syscall.Handle, []syscall.RawSockaddrAny, uint32, string, error) {
831	if err := fd.readLock(); err != nil {
832		return syscall.InvalidHandle, nil, 0, "", err
833	}
834	defer fd.readUnlock()
835
836	o := &fd.rop
837	var rawsa [2]syscall.RawSockaddrAny
838	for {
839		s, err := sysSocket()
840		if err != nil {
841			return syscall.InvalidHandle, nil, 0, "", err
842		}
843
844		errcall, err := fd.acceptOne(s, rawsa[:], o)
845		if err == nil {
846			return s, rawsa[:], uint32(o.rsan), "", nil
847		}
848
849		// Sometimes we see WSAECONNRESET and ERROR_NETNAME_DELETED is
850		// returned here. These happen if connection reset is received
851		// before AcceptEx could complete. These errors relate to new
852		// connection, not to AcceptEx, so ignore broken connection and
853		// try AcceptEx again for more connections.
854		errno, ok := err.(syscall.Errno)
855		if !ok {
856			return syscall.InvalidHandle, nil, 0, errcall, err
857		}
858		switch errno {
859		case syscall.ERROR_NETNAME_DELETED, syscall.WSAECONNRESET:
860			// ignore these and try again
861		default:
862			return syscall.InvalidHandle, nil, 0, errcall, err
863		}
864	}
865}
866
867// Seek wraps syscall.Seek.
868func (fd *FD) Seek(offset int64, whence int) (int64, error) {
869	if err := fd.incref(); err != nil {
870		return 0, err
871	}
872	defer fd.decref()
873
874	fd.l.Lock()
875	defer fd.l.Unlock()
876
877	return syscall.Seek(fd.Sysfd, offset, whence)
878}
879
880// FindNextFile wraps syscall.FindNextFile.
881func (fd *FD) FindNextFile(data *syscall.Win32finddata) error {
882	if err := fd.incref(); err != nil {
883		return err
884	}
885	defer fd.decref()
886	return syscall.FindNextFile(fd.Sysfd, data)
887}
888
889// Fchmod updates syscall.ByHandleFileInformation.Fileattributes when needed.
890func (fd *FD) Fchmod(mode uint32) error {
891	if err := fd.incref(); err != nil {
892		return err
893	}
894	defer fd.decref()
895
896	var d syscall.ByHandleFileInformation
897	if err := syscall.GetFileInformationByHandle(fd.Sysfd, &d); err != nil {
898		return err
899	}
900	attrs := d.FileAttributes
901	if mode&syscall.S_IWRITE != 0 {
902		attrs &^= syscall.FILE_ATTRIBUTE_READONLY
903	} else {
904		attrs |= syscall.FILE_ATTRIBUTE_READONLY
905	}
906	if attrs == d.FileAttributes {
907		return nil
908	}
909
910	var du windows.FILE_BASIC_INFO
911	du.FileAttributes = attrs
912	l := uint32(unsafe.Sizeof(d))
913	return windows.SetFileInformationByHandle(fd.Sysfd, windows.FileBasicInfo, uintptr(unsafe.Pointer(&du)), l)
914}
915
916// Fchdir wraps syscall.Fchdir.
917func (fd *FD) Fchdir() error {
918	if err := fd.incref(); err != nil {
919		return err
920	}
921	defer fd.decref()
922	return syscall.Fchdir(fd.Sysfd)
923}
924
925// GetFileType wraps syscall.GetFileType.
926func (fd *FD) GetFileType() (uint32, error) {
927	if err := fd.incref(); err != nil {
928		return 0, err
929	}
930	defer fd.decref()
931	return syscall.GetFileType(fd.Sysfd)
932}
933
934// GetFileInformationByHandle wraps GetFileInformationByHandle.
935func (fd *FD) GetFileInformationByHandle(data *syscall.ByHandleFileInformation) error {
936	if err := fd.incref(); err != nil {
937		return err
938	}
939	defer fd.decref()
940	return syscall.GetFileInformationByHandle(fd.Sysfd, data)
941}
942
943// RawRead invokes the user-defined function f for a read operation.
944func (fd *FD) RawRead(f func(uintptr) bool) error {
945	if err := fd.readLock(); err != nil {
946		return err
947	}
948	defer fd.readUnlock()
949	for {
950		if f(uintptr(fd.Sysfd)) {
951			return nil
952		}
953
954		// Use a zero-byte read as a way to get notified when this
955		// socket is readable. h/t https://stackoverflow.com/a/42019668/332798
956		o := &fd.rop
957		o.InitBuf(nil)
958		if !fd.IsStream {
959			o.flags |= windows.MSG_PEEK
960		}
961		_, err := execIO(o, func(o *operation) error {
962			return syscall.WSARecv(o.fd.Sysfd, &o.buf, 1, &o.qty, &o.flags, &o.o, nil)
963		})
964		if err == windows.WSAEMSGSIZE {
965			// expected with a 0-byte peek, ignore.
966		} else if err != nil {
967			return err
968		}
969	}
970}
971
972// RawWrite invokes the user-defined function f for a write operation.
973func (fd *FD) RawWrite(f func(uintptr) bool) error {
974	if err := fd.writeLock(); err != nil {
975		return err
976	}
977	defer fd.writeUnlock()
978
979	if f(uintptr(fd.Sysfd)) {
980		return nil
981	}
982
983	// TODO(tmm1): find a way to detect socket writability
984	return syscall.EWINDOWS
985}
986
987func sockaddrToRaw(sa syscall.Sockaddr) (unsafe.Pointer, int32, error) {
988	switch sa := sa.(type) {
989	case *syscall.SockaddrInet4:
990		var raw syscall.RawSockaddrInet4
991		raw.Family = syscall.AF_INET
992		p := (*[2]byte)(unsafe.Pointer(&raw.Port))
993		p[0] = byte(sa.Port >> 8)
994		p[1] = byte(sa.Port)
995		for i := 0; i < len(sa.Addr); i++ {
996			raw.Addr[i] = sa.Addr[i]
997		}
998		return unsafe.Pointer(&raw), int32(unsafe.Sizeof(raw)), nil
999	case *syscall.SockaddrInet6:
1000		var raw syscall.RawSockaddrInet6
1001		raw.Family = syscall.AF_INET6
1002		p := (*[2]byte)(unsafe.Pointer(&raw.Port))
1003		p[0] = byte(sa.Port >> 8)
1004		p[1] = byte(sa.Port)
1005		raw.Scope_id = sa.ZoneId
1006		for i := 0; i < len(sa.Addr); i++ {
1007			raw.Addr[i] = sa.Addr[i]
1008		}
1009		return unsafe.Pointer(&raw), int32(unsafe.Sizeof(raw)), nil
1010	default:
1011		return nil, 0, syscall.EWINDOWS
1012	}
1013}
1014
1015// ReadMsg wraps the WSARecvMsg network call.
1016func (fd *FD) ReadMsg(p []byte, oob []byte) (int, int, int, syscall.Sockaddr, error) {
1017	if err := fd.readLock(); err != nil {
1018		return 0, 0, 0, nil, err
1019	}
1020	defer fd.readUnlock()
1021
1022	if len(p) > maxRW {
1023		p = p[:maxRW]
1024	}
1025
1026	o := &fd.rop
1027	o.InitMsg(p, oob)
1028	o.rsa = new(syscall.RawSockaddrAny)
1029	o.msg.Name = (syscall.Pointer)(unsafe.Pointer(o.rsa))
1030	o.msg.Namelen = int32(unsafe.Sizeof(*o.rsa))
1031	n, err := execIO(o, func(o *operation) error {
1032		return windows.WSARecvMsg(o.fd.Sysfd, &o.msg, &o.qty, &o.o, nil)
1033	})
1034	err = fd.eofError(n, err)
1035	var sa syscall.Sockaddr
1036	if err == nil {
1037		sa, err = o.rsa.Sockaddr()
1038	}
1039	return n, int(o.msg.Control.Len), int(o.msg.Flags), sa, err
1040}
1041
1042// WriteMsg wraps the WSASendMsg network call.
1043func (fd *FD) WriteMsg(p []byte, oob []byte, sa syscall.Sockaddr) (int, int, error) {
1044	if len(p) > maxRW {
1045		return 0, 0, errors.New("packet is too large (only 1GB is allowed)")
1046	}
1047
1048	if err := fd.writeLock(); err != nil {
1049		return 0, 0, err
1050	}
1051	defer fd.writeUnlock()
1052
1053	o := &fd.wop
1054	o.InitMsg(p, oob)
1055	if sa != nil {
1056		rsa, len, err := sockaddrToRaw(sa)
1057		if err != nil {
1058			return 0, 0, err
1059		}
1060		o.msg.Name = (syscall.Pointer)(rsa)
1061		o.msg.Namelen = len
1062	}
1063	n, err := execIO(o, func(o *operation) error {
1064		return windows.WSASendMsg(o.fd.Sysfd, &o.msg, 0, &o.qty, &o.o, nil)
1065	})
1066	return n, int(o.msg.Control.Len), err
1067}
1068