1// Copyright 2017 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build aix || darwin || dragonfly || freebsd || hurd || (js && wasm) || linux || netbsd || openbsd || solaris
6// +build aix darwin dragonfly freebsd hurd js,wasm linux netbsd openbsd solaris
7
8package poll
9
10import (
11	"io"
12	"sync/atomic"
13	"syscall"
14)
15
// FD is a file descriptor. The net and os packages use this type as a
// field of a larger type representing a network connection or OS file.
type FD struct {
	// Lock sysfd and serialize access to Read and Write methods.
	fdmu fdMutex

	// System file descriptor. Immutable until Close.
	// destroy resets it to -1 once the descriptor has been closed.
	Sysfd int

	// I/O poller.
	pd pollDesc

	// Writev cache.
	iovecs *[]syscall.Iovec

	// Semaphore signaled when file is closed.
	// destroy releases it; Close acquires it when the file is not in
	// blocking mode, to wait for the descriptor to be closed.
	csema uint32

	// Non-zero if this file has been set to blocking mode.
	// Init sets it when the FD is not pollable or when poller
	// initialization fails; SetBlocking sets it atomically.
	// In this file it only ever changes from 0 to 1.
	isBlocking uint32

	// Whether this is a streaming descriptor, as opposed to a
	// packet-based descriptor like a UDP socket. Immutable.
	// When set, Read, Pread, Write and Pwrite limit each system call
	// to maxRW bytes.
	IsStream bool

	// Whether a zero byte read indicates EOF. This is false for a
	// message based socket connection.
	ZeroReadIsEOF bool

	// Whether this is a file rather than a network socket.
	// Init sets it when its net argument is "file".
	isFile bool
}
48
49// Init initializes the FD. The Sysfd field should already be set.
50// This can be called multiple times on a single FD.
51// The net argument is a network name from the net package (e.g., "tcp"),
52// or "file".
53// Set pollable to true if fd should be managed by runtime netpoll.
54func (fd *FD) Init(net string, pollable bool) error {
55	// We don't actually care about the various network types.
56	if net == "file" {
57		fd.isFile = true
58	}
59	if !pollable {
60		fd.isBlocking = 1
61		return nil
62	}
63	err := fd.pd.init(fd)
64	if err != nil {
65		// If we could not initialize the runtime poller,
66		// assume we are using blocking mode.
67		fd.isBlocking = 1
68	}
69	return err
70}
71
// destroy closes the file descriptor. This is called when there are
// no remaining references.
// It returns the error reported by CloseFunc.
func (fd *FD) destroy() error {
	// Poller may want to unregister fd in readiness notification mechanism,
	// so this must be executed before CloseFunc.
	fd.pd.close()

	// We don't use ignoringEINTR here because POSIX does not define
	// whether the descriptor is closed if close returns EINTR.
	// If the descriptor is indeed closed, using a loop would race
	// with some other goroutine opening a new descriptor.
	// (The Linux kernel guarantees that it is closed on an EINTR error.)
	err := CloseFunc(fd.Sysfd)

	// Invalidate the stored descriptor and release csema, which Close
	// acquires when it waits for the descriptor to be closed.
	fd.Sysfd = -1
	runtime_Semrelease(&fd.csema)
	return err
}
90
91// Close closes the FD. The underlying file descriptor is closed by the
92// destroy method when there are no remaining references.
93func (fd *FD) Close() error {
94	if !fd.fdmu.increfAndClose() {
95		return errClosing(fd.isFile)
96	}
97
98	// Unblock any I/O.  Once it all unblocks and returns,
99	// so that it cannot be referring to fd.sysfd anymore,
100	// the final decref will close fd.sysfd. This should happen
101	// fairly quickly, since all the I/O is non-blocking, and any
102	// attempts to block in the pollDesc will return errClosing(fd.isFile).
103	fd.pd.evict()
104
105	// The call to decref will call destroy if there are no other
106	// references.
107	err := fd.decref()
108
109	// Wait until the descriptor is closed. If this was the only
110	// reference, it is already closed. Only wait if the file has
111	// not been set to blocking mode, as otherwise any current I/O
112	// may be blocking, and that would block the Close.
113	// No need for an atomic read of isBlocking, increfAndClose means
114	// we have exclusive access to fd.
115	if fd.isBlocking == 0 {
116		runtime_Semacquire(&fd.csema)
117	}
118
119	return err
120}
121
122// SetBlocking puts the file into blocking mode.
123func (fd *FD) SetBlocking() error {
124	if err := fd.incref(); err != nil {
125		return err
126	}
127	defer fd.decref()
128	// Atomic store so that concurrent calls to SetBlocking
129	// do not cause a race condition. isBlocking only ever goes
130	// from 0 to 1 so there is no real race here.
131	atomic.StoreUint32(&fd.isBlocking, 1)
132	return syscall.SetNonblock(fd.Sysfd, false)
133}
134
// maxRW is the largest number of bytes passed to a single read or write
// system call on a streaming descriptor (one with IsStream set).
//
// Darwin and FreeBSD can't read or write 2GB+ files at a time,
// even on 64-bit systems.
// The same is true of socket implementations on many systems.
// See golang.org/issue/7812 and golang.org/issue/16266.
// Use 1GB instead of, say, 2GB-1, to keep subsequent reads aligned.
const maxRW = 1 << 30
141
142// Read implements io.Reader.
143func (fd *FD) Read(p []byte) (int, error) {
144	if err := fd.readLock(); err != nil {
145		return 0, err
146	}
147	defer fd.readUnlock()
148	if len(p) == 0 {
149		// If the caller wanted a zero byte read, return immediately
150		// without trying (but after acquiring the readLock).
151		// Otherwise syscall.Read returns 0, nil which looks like
152		// io.EOF.
153		// TODO(bradfitz): make it wait for readability? (Issue 15735)
154		return 0, nil
155	}
156	if err := fd.pd.prepareRead(fd.isFile); err != nil {
157		return 0, err
158	}
159	if fd.IsStream && len(p) > maxRW {
160		p = p[:maxRW]
161	}
162	for {
163		n, err := ignoringEINTRIO(syscall.Read, fd.Sysfd, p)
164		if err != nil {
165			n = 0
166			if err == syscall.EAGAIN && fd.pd.pollable() {
167				if err = fd.pd.waitRead(fd.isFile); err == nil {
168					continue
169				}
170			}
171		}
172		err = fd.eofError(n, err)
173		return n, err
174	}
175}
176
177// Pread wraps the pread system call.
178func (fd *FD) Pread(p []byte, off int64) (int, error) {
179	// Call incref, not readLock, because since pread specifies the
180	// offset it is independent from other reads.
181	// Similarly, using the poller doesn't make sense for pread.
182	if err := fd.incref(); err != nil {
183		return 0, err
184	}
185	if fd.IsStream && len(p) > maxRW {
186		p = p[:maxRW]
187	}
188	var (
189		n   int
190		err error
191	)
192	for {
193		n, err = syscall.Pread(fd.Sysfd, p, off)
194		if err != syscall.EINTR {
195			break
196		}
197	}
198	if err != nil {
199		n = 0
200	}
201	fd.decref()
202	err = fd.eofError(n, err)
203	return n, err
204}
205
206// ReadFrom wraps the recvfrom network call.
207func (fd *FD) ReadFrom(p []byte) (int, syscall.Sockaddr, error) {
208	if err := fd.readLock(); err != nil {
209		return 0, nil, err
210	}
211	defer fd.readUnlock()
212	if err := fd.pd.prepareRead(fd.isFile); err != nil {
213		return 0, nil, err
214	}
215	for {
216		n, sa, err := syscall.Recvfrom(fd.Sysfd, p, 0)
217		if err != nil {
218			if err == syscall.EINTR {
219				continue
220			}
221			n = 0
222			if err == syscall.EAGAIN && fd.pd.pollable() {
223				if err = fd.pd.waitRead(fd.isFile); err == nil {
224					continue
225				}
226			}
227		}
228		err = fd.eofError(n, err)
229		return n, sa, err
230	}
231}
232
233// ReadMsg wraps the recvmsg network call.
234func (fd *FD) ReadMsg(p []byte, oob []byte, flags int) (int, int, int, syscall.Sockaddr, error) {
235	if err := fd.readLock(); err != nil {
236		return 0, 0, 0, nil, err
237	}
238	defer fd.readUnlock()
239	if err := fd.pd.prepareRead(fd.isFile); err != nil {
240		return 0, 0, 0, nil, err
241	}
242	for {
243		n, oobn, sysflags, sa, err := syscall.Recvmsg(fd.Sysfd, p, oob, flags)
244		if err != nil {
245			if err == syscall.EINTR {
246				continue
247			}
248			// TODO(dfc) should n and oobn be set to 0
249			if err == syscall.EAGAIN && fd.pd.pollable() {
250				if err = fd.pd.waitRead(fd.isFile); err == nil {
251					continue
252				}
253			}
254		}
255		err = fd.eofError(n, err)
256		return n, oobn, sysflags, sa, err
257	}
258}
259
260// Write implements io.Writer.
261func (fd *FD) Write(p []byte) (int, error) {
262	if err := fd.writeLock(); err != nil {
263		return 0, err
264	}
265	defer fd.writeUnlock()
266	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
267		return 0, err
268	}
269	var nn int
270	for {
271		max := len(p)
272		if fd.IsStream && max-nn > maxRW {
273			max = nn + maxRW
274		}
275		n, err := ignoringEINTRIO(syscall.Write, fd.Sysfd, p[nn:max])
276		if n > 0 {
277			nn += n
278		}
279		if nn == len(p) {
280			return nn, err
281		}
282		if err == syscall.EAGAIN && fd.pd.pollable() {
283			if err = fd.pd.waitWrite(fd.isFile); err == nil {
284				continue
285			}
286		}
287		if err != nil {
288			return nn, err
289		}
290		if n == 0 {
291			return nn, io.ErrUnexpectedEOF
292		}
293	}
294}
295
296// Pwrite wraps the pwrite system call.
297func (fd *FD) Pwrite(p []byte, off int64) (int, error) {
298	// Call incref, not writeLock, because since pwrite specifies the
299	// offset it is independent from other writes.
300	// Similarly, using the poller doesn't make sense for pwrite.
301	if err := fd.incref(); err != nil {
302		return 0, err
303	}
304	defer fd.decref()
305	var nn int
306	for {
307		max := len(p)
308		if fd.IsStream && max-nn > maxRW {
309			max = nn + maxRW
310		}
311		n, err := syscall.Pwrite(fd.Sysfd, p[nn:max], off+int64(nn))
312		if err == syscall.EINTR {
313			continue
314		}
315		if n > 0 {
316			nn += n
317		}
318		if nn == len(p) {
319			return nn, err
320		}
321		if err != nil {
322			return nn, err
323		}
324		if n == 0 {
325			return nn, io.ErrUnexpectedEOF
326		}
327	}
328}
329
330// WriteTo wraps the sendto network call.
331func (fd *FD) WriteTo(p []byte, sa syscall.Sockaddr) (int, error) {
332	if err := fd.writeLock(); err != nil {
333		return 0, err
334	}
335	defer fd.writeUnlock()
336	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
337		return 0, err
338	}
339	for {
340		err := syscall.Sendto(fd.Sysfd, p, 0, sa)
341		if err == syscall.EINTR {
342			continue
343		}
344		if err == syscall.EAGAIN && fd.pd.pollable() {
345			if err = fd.pd.waitWrite(fd.isFile); err == nil {
346				continue
347			}
348		}
349		if err != nil {
350			return 0, err
351		}
352		return len(p), nil
353	}
354}
355
356// WriteMsg wraps the sendmsg network call.
357func (fd *FD) WriteMsg(p []byte, oob []byte, sa syscall.Sockaddr) (int, int, error) {
358	if err := fd.writeLock(); err != nil {
359		return 0, 0, err
360	}
361	defer fd.writeUnlock()
362	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
363		return 0, 0, err
364	}
365	for {
366		n, err := syscall.SendmsgN(fd.Sysfd, p, oob, sa, 0)
367		if err == syscall.EINTR {
368			continue
369		}
370		if err == syscall.EAGAIN && fd.pd.pollable() {
371			if err = fd.pd.waitWrite(fd.isFile); err == nil {
372				continue
373			}
374		}
375		if err != nil {
376			return n, 0, err
377		}
378		return n, len(oob), err
379	}
380}
381
382// Accept wraps the accept network call.
383func (fd *FD) Accept() (int, syscall.Sockaddr, string, error) {
384	if err := fd.readLock(); err != nil {
385		return -1, nil, "", err
386	}
387	defer fd.readUnlock()
388
389	if err := fd.pd.prepareRead(fd.isFile); err != nil {
390		return -1, nil, "", err
391	}
392	for {
393		s, rsa, errcall, err := accept(fd.Sysfd)
394		if err == nil {
395			return s, rsa, "", err
396		}
397		switch err {
398		case syscall.EINTR:
399			continue
400		case syscall.EAGAIN:
401			if fd.pd.pollable() {
402				if err = fd.pd.waitRead(fd.isFile); err == nil {
403					continue
404				}
405			}
406		case syscall.ECONNABORTED:
407			// This means that a socket on the listen
408			// queue was closed before we Accept()ed it;
409			// it's a silly error, so try again.
410			continue
411		}
412		return -1, nil, errcall, err
413	}
414}
415
416// Seek wraps syscall.Seek.
417func (fd *FD) Seek(offset int64, whence int) (int64, error) {
418	if err := fd.incref(); err != nil {
419		return 0, err
420	}
421	defer fd.decref()
422	return syscall.Seek(fd.Sysfd, offset, whence)
423}
424
425// ReadDirent wraps syscall.ReadDirent.
426// We treat this like an ordinary system call rather than a call
427// that tries to fill the buffer.
428func (fd *FD) ReadDirent(buf []byte) (int, error) {
429	if err := fd.incref(); err != nil {
430		return 0, err
431	}
432	defer fd.decref()
433	for {
434		n, err := ignoringEINTRIO(syscall.ReadDirent, fd.Sysfd, buf)
435		if err != nil {
436			n = 0
437			if err == syscall.EAGAIN && fd.pd.pollable() {
438				if err = fd.pd.waitRead(fd.isFile); err == nil {
439					continue
440				}
441			}
442		}
443		// Do not call eofError; caller does not expect to see io.EOF.
444		return n, err
445	}
446}
447
448// Fchmod wraps syscall.Fchmod.
449func (fd *FD) Fchmod(mode uint32) error {
450	if err := fd.incref(); err != nil {
451		return err
452	}
453	defer fd.decref()
454	return ignoringEINTR(func() error {
455		return syscall.Fchmod(fd.Sysfd, mode)
456	})
457}
458
459// Fchdir wraps syscall.Fchdir.
460func (fd *FD) Fchdir() error {
461	if err := fd.incref(); err != nil {
462		return err
463	}
464	defer fd.decref()
465	return syscall.Fchdir(fd.Sysfd)
466}
467
468// Fstat wraps syscall.Fstat
469func (fd *FD) Fstat(s *syscall.Stat_t) error {
470	if err := fd.incref(); err != nil {
471		return err
472	}
473	defer fd.decref()
474	return ignoringEINTR(func() error {
475		return syscall.Fstat(fd.Sysfd, s)
476	})
477}
478
// tryDupCloexec indicates whether F_DUPFD_CLOEXEC should be used.
// If the kernel doesn't support it, this is set to 0.
// DupCloseOnExec reads and writes it with atomic operations, and
// clears it when fcntl fails with EINVAL or ENOSYS.
var tryDupCloexec = int32(1)
482
483// DupCloseOnExec dups fd and marks it close-on-exec.
484func DupCloseOnExec(fd int) (int, string, error) {
485	if syscall.F_DUPFD_CLOEXEC != 0 && atomic.LoadInt32(&tryDupCloexec) == 1 {
486		r0, e1 := fcntl(fd, syscall.F_DUPFD_CLOEXEC, 0)
487		if e1 == nil {
488			return r0, "", nil
489		}
490		switch e1.(syscall.Errno) {
491		case syscall.EINVAL, syscall.ENOSYS:
492			// Old kernel, or js/wasm (which returns
493			// ENOSYS). Fall back to the portable way from
494			// now on.
495			atomic.StoreInt32(&tryDupCloexec, 0)
496		default:
497			return -1, "fcntl", e1
498		}
499	}
500	return dupCloseOnExecOld(fd)
501}
502
// dupCloseOnExecOld is the traditional way to dup an fd and
// set its O_CLOEXEC bit, using two system calls.
// syscall.ForkLock is read-locked across both calls.
// It returns the new descriptor, the name of the call that failed
// ("" on success) and the error.
func dupCloseOnExecOld(fd int) (int, string, error) {
	syscall.ForkLock.RLock()
	defer syscall.ForkLock.RUnlock()
	dup, err := syscall.Dup(fd)
	if err == nil {
		syscall.CloseOnExec(dup)
		return dup, "", nil
	}
	return -1, "dup", err
}
515
516// Dup duplicates the file descriptor.
517func (fd *FD) Dup() (int, string, error) {
518	if err := fd.incref(); err != nil {
519		return -1, "", err
520	}
521	defer fd.decref()
522	return DupCloseOnExec(fd.Sysfd)
523}
524
525// On Unix variants only, expose the IO event for the net code.
526
// WaitWrite waits until fd is ready for writing.
// It delegates to the poller's waitWrite and returns its error.
func (fd *FD) WaitWrite() error {
	return fd.pd.waitWrite(fd.isFile)
}
531
532// WriteOnce is for testing only. It makes a single write call.
533func (fd *FD) WriteOnce(p []byte) (int, error) {
534	if err := fd.writeLock(); err != nil {
535		return 0, err
536	}
537	defer fd.writeUnlock()
538	return ignoringEINTRIO(syscall.Write, fd.Sysfd, p)
539}
540
541// RawRead invokes the user-defined function f for a read operation.
542func (fd *FD) RawRead(f func(uintptr) bool) error {
543	if err := fd.readLock(); err != nil {
544		return err
545	}
546	defer fd.readUnlock()
547	if err := fd.pd.prepareRead(fd.isFile); err != nil {
548		return err
549	}
550	for {
551		if f(uintptr(fd.Sysfd)) {
552			return nil
553		}
554		if err := fd.pd.waitRead(fd.isFile); err != nil {
555			return err
556		}
557	}
558}
559
560// RawWrite invokes the user-defined function f for a write operation.
561func (fd *FD) RawWrite(f func(uintptr) bool) error {
562	if err := fd.writeLock(); err != nil {
563		return err
564	}
565	defer fd.writeUnlock()
566	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
567		return err
568	}
569	for {
570		if f(uintptr(fd.Sysfd)) {
571			return nil
572		}
573		if err := fd.pd.waitWrite(fd.isFile); err != nil {
574			return err
575		}
576	}
577}
578
// ignoringEINTRIO is like ignoringEINTR, but just for IO calls.
// It calls fn(fd, p) and keeps calling it for as long as the returned
// error is syscall.EINTR, then returns the count and error of the last
// call unchanged.
func ignoringEINTRIO(fn func(fd int, p []byte) (int, error), fd int, p []byte) (int, error) {
	n, err := fn(fd, p)
	for err == syscall.EINTR {
		n, err = fn(fd, p)
	}
	return n, err
}
588