1// Copyright 2017 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build aix darwin dragonfly freebsd js,wasm linux netbsd openbsd solaris
6
7package poll
8
9import (
10	"io"
11	"runtime"
12	"sync/atomic"
13	"syscall"
14)
15
16// FD is a file descriptor. The net and os packages use this type as a
17// field of a larger type representing a network connection or OS file.
18type FD struct {
19	// Lock sysfd and serialize access to Read and Write methods.
20	fdmu fdMutex
21
22	// System file descriptor. Immutable until Close.
23	Sysfd int
24
25	// I/O poller.
26	pd pollDesc
27
28	// Writev cache.
29	iovecs *[]syscall.Iovec
30
31	// Semaphore signaled when file is closed.
32	csema uint32
33
34	// Non-zero if this file has been set to blocking mode.
35	isBlocking uint32
36
37	// Whether this is a streaming descriptor, as opposed to a
38	// packet-based descriptor like a UDP socket. Immutable.
39	IsStream bool
40
41	// Whether a zero byte read indicates EOF. This is false for a
42	// message based socket connection.
43	ZeroReadIsEOF bool
44
45	// Whether this is a file rather than a network socket.
46	isFile bool
47}
48
49// Init initializes the FD. The Sysfd field should already be set.
50// This can be called multiple times on a single FD.
51// The net argument is a network name from the net package (e.g., "tcp"),
52// or "file".
53// Set pollable to true if fd should be managed by runtime netpoll.
54func (fd *FD) Init(net string, pollable bool) error {
55	// We don't actually care about the various network types.
56	if net == "file" {
57		fd.isFile = true
58	}
59	if !pollable {
60		fd.isBlocking = 1
61		return nil
62	}
63	err := fd.pd.init(fd)
64	if err != nil {
65		// If we could not initialize the runtime poller,
66		// assume we are using blocking mode.
67		fd.isBlocking = 1
68	}
69	return err
70}
71
72// Destroy closes the file descriptor. This is called when there are
73// no remaining references.
74func (fd *FD) destroy() error {
75	// Poller may want to unregister fd in readiness notification mechanism,
76	// so this must be executed before CloseFunc.
77	fd.pd.close()
78	err := CloseFunc(fd.Sysfd)
79	fd.Sysfd = -1
80	runtime_Semrelease(&fd.csema)
81	return err
82}
83
84// Close closes the FD. The underlying file descriptor is closed by the
85// destroy method when there are no remaining references.
86func (fd *FD) Close() error {
87	if !fd.fdmu.increfAndClose() {
88		return errClosing(fd.isFile)
89	}
90
91	// Unblock any I/O.  Once it all unblocks and returns,
92	// so that it cannot be referring to fd.sysfd anymore,
93	// the final decref will close fd.sysfd. This should happen
94	// fairly quickly, since all the I/O is non-blocking, and any
95	// attempts to block in the pollDesc will return errClosing(fd.isFile).
96	fd.pd.evict()
97
98	// The call to decref will call destroy if there are no other
99	// references.
100	err := fd.decref()
101
102	// Wait until the descriptor is closed. If this was the only
103	// reference, it is already closed. Only wait if the file has
104	// not been set to blocking mode, as otherwise any current I/O
105	// may be blocking, and that would block the Close.
106	// No need for an atomic read of isBlocking, increfAndClose means
107	// we have exclusive access to fd.
108	if fd.isBlocking == 0 {
109		runtime_Semacquire(&fd.csema)
110	}
111
112	return err
113}
114
115// Shutdown wraps the shutdown network call.
116func (fd *FD) Shutdown(how int) error {
117	if err := fd.incref(); err != nil {
118		return err
119	}
120	defer fd.decref()
121	return syscall.Shutdown(fd.Sysfd, how)
122}
123
124// SetBlocking puts the file into blocking mode.
125func (fd *FD) SetBlocking() error {
126	if err := fd.incref(); err != nil {
127		return err
128	}
129	defer fd.decref()
130	// Atomic store so that concurrent calls to SetBlocking
131	// do not cause a race condition. isBlocking only ever goes
132	// from 0 to 1 so there is no real race here.
133	atomic.StoreUint32(&fd.isBlocking, 1)
134	return syscall.SetNonblock(fd.Sysfd, false)
135}
136
// maxRW caps the size of a single read or write on a stream descriptor.
// Darwin and FreeBSD cannot read or write 2GB+ at a time, even on
// 64-bit systems, and the same holds for socket implementations on many
// systems. See golang.org/issue/7812 and golang.org/issue/16266.
// The cap is 1GB rather than, say, 2GB-1 so that subsequent reads stay
// aligned.
const maxRW = 1024 * 1024 * 1024
143
144// Read implements io.Reader.
145func (fd *FD) Read(p []byte) (int, error) {
146	if err := fd.readLock(); err != nil {
147		return 0, err
148	}
149	defer fd.readUnlock()
150	if len(p) == 0 {
151		// If the caller wanted a zero byte read, return immediately
152		// without trying (but after acquiring the readLock).
153		// Otherwise syscall.Read returns 0, nil which looks like
154		// io.EOF.
155		// TODO(bradfitz): make it wait for readability? (Issue 15735)
156		return 0, nil
157	}
158	if err := fd.pd.prepareRead(fd.isFile); err != nil {
159		return 0, err
160	}
161	if fd.IsStream && len(p) > maxRW {
162		p = p[:maxRW]
163	}
164	for {
165		n, err := syscall.Read(fd.Sysfd, p)
166		if err != nil {
167			n = 0
168			if err == syscall.EAGAIN && fd.pd.pollable() {
169				if err = fd.pd.waitRead(fd.isFile); err == nil {
170					continue
171				}
172			}
173
174			// On MacOS we can see EINTR here if the user
175			// pressed ^Z.  See issue #22838.
176			if runtime.GOOS == "darwin" && err == syscall.EINTR {
177				continue
178			}
179		}
180		err = fd.eofError(n, err)
181		return n, err
182	}
183}
184
185// Pread wraps the pread system call.
186func (fd *FD) Pread(p []byte, off int64) (int, error) {
187	// Call incref, not readLock, because since pread specifies the
188	// offset it is independent from other reads.
189	// Similarly, using the poller doesn't make sense for pread.
190	if err := fd.incref(); err != nil {
191		return 0, err
192	}
193	if fd.IsStream && len(p) > maxRW {
194		p = p[:maxRW]
195	}
196	n, err := syscall.Pread(fd.Sysfd, p, off)
197	if err != nil {
198		n = 0
199	}
200	fd.decref()
201	err = fd.eofError(n, err)
202	return n, err
203}
204
205// ReadFrom wraps the recvfrom network call.
206func (fd *FD) ReadFrom(p []byte) (int, syscall.Sockaddr, error) {
207	if err := fd.readLock(); err != nil {
208		return 0, nil, err
209	}
210	defer fd.readUnlock()
211	if err := fd.pd.prepareRead(fd.isFile); err != nil {
212		return 0, nil, err
213	}
214	for {
215		n, sa, err := syscall.Recvfrom(fd.Sysfd, p, 0)
216		if err != nil {
217			n = 0
218			if err == syscall.EAGAIN && fd.pd.pollable() {
219				if err = fd.pd.waitRead(fd.isFile); err == nil {
220					continue
221				}
222			}
223		}
224		err = fd.eofError(n, err)
225		return n, sa, err
226	}
227}
228
229// ReadMsg wraps the recvmsg network call.
230func (fd *FD) ReadMsg(p []byte, oob []byte) (int, int, int, syscall.Sockaddr, error) {
231	if err := fd.readLock(); err != nil {
232		return 0, 0, 0, nil, err
233	}
234	defer fd.readUnlock()
235	if err := fd.pd.prepareRead(fd.isFile); err != nil {
236		return 0, 0, 0, nil, err
237	}
238	for {
239		n, oobn, flags, sa, err := syscall.Recvmsg(fd.Sysfd, p, oob, 0)
240		if err != nil {
241			// TODO(dfc) should n and oobn be set to 0
242			if err == syscall.EAGAIN && fd.pd.pollable() {
243				if err = fd.pd.waitRead(fd.isFile); err == nil {
244					continue
245				}
246			}
247		}
248		err = fd.eofError(n, err)
249		return n, oobn, flags, sa, err
250	}
251}
252
253// Write implements io.Writer.
254func (fd *FD) Write(p []byte) (int, error) {
255	if err := fd.writeLock(); err != nil {
256		return 0, err
257	}
258	defer fd.writeUnlock()
259	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
260		return 0, err
261	}
262	var nn int
263	for {
264		max := len(p)
265		if fd.IsStream && max-nn > maxRW {
266			max = nn + maxRW
267		}
268		n, err := syscall.Write(fd.Sysfd, p[nn:max])
269		if n > 0 {
270			nn += n
271		}
272		if nn == len(p) {
273			return nn, err
274		}
275		if err == syscall.EAGAIN && fd.pd.pollable() {
276			if err = fd.pd.waitWrite(fd.isFile); err == nil {
277				continue
278			}
279		}
280		if err != nil {
281			return nn, err
282		}
283		if n == 0 {
284			return nn, io.ErrUnexpectedEOF
285		}
286	}
287}
288
289// Pwrite wraps the pwrite system call.
290func (fd *FD) Pwrite(p []byte, off int64) (int, error) {
291	// Call incref, not writeLock, because since pwrite specifies the
292	// offset it is independent from other writes.
293	// Similarly, using the poller doesn't make sense for pwrite.
294	if err := fd.incref(); err != nil {
295		return 0, err
296	}
297	defer fd.decref()
298	var nn int
299	for {
300		max := len(p)
301		if fd.IsStream && max-nn > maxRW {
302			max = nn + maxRW
303		}
304		n, err := syscall.Pwrite(fd.Sysfd, p[nn:max], off+int64(nn))
305		if n > 0 {
306			nn += n
307		}
308		if nn == len(p) {
309			return nn, err
310		}
311		if err != nil {
312			return nn, err
313		}
314		if n == 0 {
315			return nn, io.ErrUnexpectedEOF
316		}
317	}
318}
319
320// WriteTo wraps the sendto network call.
321func (fd *FD) WriteTo(p []byte, sa syscall.Sockaddr) (int, error) {
322	if err := fd.writeLock(); err != nil {
323		return 0, err
324	}
325	defer fd.writeUnlock()
326	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
327		return 0, err
328	}
329	for {
330		err := syscall.Sendto(fd.Sysfd, p, 0, sa)
331		if err == syscall.EAGAIN && fd.pd.pollable() {
332			if err = fd.pd.waitWrite(fd.isFile); err == nil {
333				continue
334			}
335		}
336		if err != nil {
337			return 0, err
338		}
339		return len(p), nil
340	}
341}
342
343// WriteMsg wraps the sendmsg network call.
344func (fd *FD) WriteMsg(p []byte, oob []byte, sa syscall.Sockaddr) (int, int, error) {
345	if err := fd.writeLock(); err != nil {
346		return 0, 0, err
347	}
348	defer fd.writeUnlock()
349	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
350		return 0, 0, err
351	}
352	for {
353		n, err := syscall.SendmsgN(fd.Sysfd, p, oob, sa, 0)
354		if err == syscall.EAGAIN && fd.pd.pollable() {
355			if err = fd.pd.waitWrite(fd.isFile); err == nil {
356				continue
357			}
358		}
359		if err != nil {
360			return n, 0, err
361		}
362		return n, len(oob), err
363	}
364}
365
366// Accept wraps the accept network call.
367func (fd *FD) Accept() (int, syscall.Sockaddr, string, error) {
368	if err := fd.readLock(); err != nil {
369		return -1, nil, "", err
370	}
371	defer fd.readUnlock()
372
373	if err := fd.pd.prepareRead(fd.isFile); err != nil {
374		return -1, nil, "", err
375	}
376	for {
377		s, rsa, errcall, err := accept(fd.Sysfd)
378		if err == nil {
379			return s, rsa, "", err
380		}
381		switch err {
382		case syscall.EAGAIN:
383			if fd.pd.pollable() {
384				if err = fd.pd.waitRead(fd.isFile); err == nil {
385					continue
386				}
387			}
388		case syscall.ECONNABORTED:
389			// This means that a socket on the listen
390			// queue was closed before we Accept()ed it;
391			// it's a silly error, so try again.
392			continue
393		}
394		return -1, nil, errcall, err
395	}
396}
397
398// Seek wraps syscall.Seek.
399func (fd *FD) Seek(offset int64, whence int) (int64, error) {
400	if err := fd.incref(); err != nil {
401		return 0, err
402	}
403	defer fd.decref()
404	return syscall.Seek(fd.Sysfd, offset, whence)
405}
406
407// ReadDirent wraps syscall.ReadDirent.
408// We treat this like an ordinary system call rather than a call
409// that tries to fill the buffer.
410func (fd *FD) ReadDirent(buf []byte) (int, error) {
411	if err := fd.incref(); err != nil {
412		return 0, err
413	}
414	defer fd.decref()
415	for {
416		n, err := syscall.ReadDirent(fd.Sysfd, buf)
417		if err != nil {
418			n = 0
419			if err == syscall.EAGAIN && fd.pd.pollable() {
420				if err = fd.pd.waitRead(fd.isFile); err == nil {
421					continue
422				}
423			}
424		}
425		// Do not call eofError; caller does not expect to see io.EOF.
426		return n, err
427	}
428}
429
430// Fchdir wraps syscall.Fchdir.
431func (fd *FD) Fchdir() error {
432	if err := fd.incref(); err != nil {
433		return err
434	}
435	defer fd.decref()
436	return syscall.Fchdir(fd.Sysfd)
437}
438
439// Fstat wraps syscall.Fstat
440func (fd *FD) Fstat(s *syscall.Stat_t) error {
441	if err := fd.incref(); err != nil {
442		return err
443	}
444	defer fd.decref()
445	return syscall.Fstat(fd.Sysfd, s)
446}
447
// tryDupCloexec records whether F_DUPFD_CLOEXEC should be attempted.
// It starts at 1 and is set to 0 once the kernel is found not to
// support it. It is read and written with sync/atomic.
var tryDupCloexec int32 = 1
451
452// DupCloseOnExec dups fd and marks it close-on-exec.
453func DupCloseOnExec(fd int) (int, string, error) {
454	if syscall.F_DUPFD_CLOEXEC != 0 && atomic.LoadInt32(&tryDupCloexec) == 1 {
455		r0, e1 := fcntl(fd, syscall.F_DUPFD_CLOEXEC, 0)
456		if e1 == nil {
457			return r0, "", nil
458		}
459		switch e1.(syscall.Errno) {
460		case syscall.EINVAL, syscall.ENOSYS:
461			// Old kernel, or js/wasm (which returns
462			// ENOSYS). Fall back to the portable way from
463			// now on.
464			atomic.StoreInt32(&tryDupCloexec, 0)
465		default:
466			return -1, "fcntl", e1
467		}
468	}
469	return dupCloseOnExecOld(fd)
470}
471
// dupCloseOnExecOld is the traditional way to dup an fd and set its
// close-on-exec flag, using two system calls. syscall.ForkLock is held
// for reading across both calls. It returns the new descriptor, the
// name of the failing call ("dup") or an empty string, and the error.
func dupCloseOnExecOld(fd int) (int, string, error) {
	syscall.ForkLock.RLock()
	defer syscall.ForkLock.RUnlock()
	dupfd, err := syscall.Dup(fd)
	if err == nil {
		syscall.CloseOnExec(dupfd)
		return dupfd, "", nil
	}
	return -1, "dup", err
}
484
485// Dup duplicates the file descriptor.
486func (fd *FD) Dup() (int, string, error) {
487	if err := fd.incref(); err != nil {
488		return -1, "", err
489	}
490	defer fd.decref()
491	return DupCloseOnExec(fd.Sysfd)
492}
493
494// On Unix variants only, expose the IO event for the net code.
495
496// WaitWrite waits until data can be read from fd.
497func (fd *FD) WaitWrite() error {
498	return fd.pd.waitWrite(fd.isFile)
499}
500
501// WriteOnce is for testing only. It makes a single write call.
502func (fd *FD) WriteOnce(p []byte) (int, error) {
503	if err := fd.writeLock(); err != nil {
504		return 0, err
505	}
506	defer fd.writeUnlock()
507	return syscall.Write(fd.Sysfd, p)
508}
509
510// RawControl invokes the user-defined function f for a non-IO
511// operation.
512func (fd *FD) RawControl(f func(uintptr)) error {
513	if err := fd.incref(); err != nil {
514		return err
515	}
516	defer fd.decref()
517	f(uintptr(fd.Sysfd))
518	return nil
519}
520
521// RawRead invokes the user-defined function f for a read operation.
522func (fd *FD) RawRead(f func(uintptr) bool) error {
523	if err := fd.readLock(); err != nil {
524		return err
525	}
526	defer fd.readUnlock()
527	if err := fd.pd.prepareRead(fd.isFile); err != nil {
528		return err
529	}
530	for {
531		if f(uintptr(fd.Sysfd)) {
532			return nil
533		}
534		if err := fd.pd.waitRead(fd.isFile); err != nil {
535			return err
536		}
537	}
538}
539
540// RawWrite invokes the user-defined function f for a write operation.
541func (fd *FD) RawWrite(f func(uintptr) bool) error {
542	if err := fd.writeLock(); err != nil {
543		return err
544	}
545	defer fd.writeUnlock()
546	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
547		return err
548	}
549	for {
550		if f(uintptr(fd.Sysfd)) {
551			return nil
552		}
553		if err := fd.pd.waitWrite(fd.isFile); err != nil {
554			return err
555		}
556	}
557}
558