1// Copyright 2017 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build aix darwin dragonfly freebsd hurd js,wasm linux netbsd openbsd solaris
6
7package poll
8
9import (
10	"io"
11	"sync/atomic"
12	"syscall"
13)
14
// FD is a file descriptor. The net and os packages use this type as a
// field of a larger type representing a network connection or OS file.
type FD struct {
	// fdmu locks Sysfd and serializes access to the Read and Write methods.
	fdmu fdMutex

	// Sysfd is the system file descriptor. Immutable until Close;
	// destroy resets it to -1 after the descriptor has been closed.
	Sysfd int

	// pd is the I/O poller state for this descriptor.
	pd pollDesc

	// iovecs is the writev cache.
	iovecs *[]syscall.Iovec

	// csema is a semaphore signaled when the file is closed: destroy
	// releases it, and Close waits on it unless the file is in
	// blocking mode.
	csema uint32

	// isBlocking is non-zero if this file has been set to blocking mode.
	// Init sets it when the FD is not pollable or the poller cannot be
	// initialized; SetBlocking sets it unconditionally.
	isBlocking uint32

	// IsStream reports whether this is a streaming descriptor, as
	// opposed to a packet-based descriptor like a UDP socket. Immutable.
	IsStream bool

	// ZeroReadIsEOF reports whether a zero byte read indicates EOF.
	// This is false for a message based socket connection.
	ZeroReadIsEOF bool

	// isFile reports whether this is a file rather than a network
	// socket. Init sets it when called with net == "file".
	isFile bool
}
47
48// Init initializes the FD. The Sysfd field should already be set.
49// This can be called multiple times on a single FD.
50// The net argument is a network name from the net package (e.g., "tcp"),
51// or "file".
52// Set pollable to true if fd should be managed by runtime netpoll.
53func (fd *FD) Init(net string, pollable bool) error {
54	// We don't actually care about the various network types.
55	if net == "file" {
56		fd.isFile = true
57	}
58	if !pollable {
59		fd.isBlocking = 1
60		return nil
61	}
62	err := fd.pd.init(fd)
63	if err != nil {
64		// If we could not initialize the runtime poller,
65		// assume we are using blocking mode.
66		fd.isBlocking = 1
67	}
68	return err
69}
70
// destroy closes the file descriptor. This is called when there are
// no remaining references.
// It returns the error, if any, reported by CloseFunc.
func (fd *FD) destroy() error {
	// Poller may want to unregister fd in readiness notification mechanism,
	// so this must be executed before CloseFunc.
	fd.pd.close()

	// We don't use ignoringEINTR here because POSIX does not define
	// whether the descriptor is closed if close returns EINTR.
	// If the descriptor is indeed closed, using a loop would race
	// with some other goroutine opening a new descriptor.
	// (The Linux kernel guarantees that it is closed on an EINTR error.)
	err := CloseFunc(fd.Sysfd)

	// Invalidate the stored descriptor and signal csema so that a Close
	// call waiting on the semaphore can proceed.
	fd.Sysfd = -1
	runtime_Semrelease(&fd.csema)
	return err
}
89
// Close closes the FD. The underlying file descriptor is closed by the
// destroy method when there are no remaining references.
// If increfAndClose fails, Close returns errClosing(fd.isFile) without
// doing anything else; otherwise it returns the result of the decref call.
func (fd *FD) Close() error {
	if !fd.fdmu.increfAndClose() {
		return errClosing(fd.isFile)
	}

	// Unblock any I/O. Once it all unblocks and returns,
	// so that it cannot be referring to fd.Sysfd anymore,
	// the final decref will close fd.Sysfd. This should happen
	// fairly quickly, since all the I/O is non-blocking, and any
	// attempts to block in the pollDesc will return errClosing(fd.isFile).
	fd.pd.evict()

	// The call to decref will call destroy if there are no other
	// references.
	err := fd.decref()

	// Wait until the descriptor is closed. If this was the only
	// reference, it is already closed. Only wait if the file has
	// not been set to blocking mode, as otherwise any current I/O
	// may be blocking, and that would block the Close.
	// No need for an atomic read of isBlocking, increfAndClose means
	// we have exclusive access to fd.
	if fd.isBlocking == 0 {
		runtime_Semacquire(&fd.csema)
	}

	return err
}
120
121// SetBlocking puts the file into blocking mode.
122func (fd *FD) SetBlocking() error {
123	if err := fd.incref(); err != nil {
124		return err
125	}
126	defer fd.decref()
127	// Atomic store so that concurrent calls to SetBlocking
128	// do not cause a race condition. isBlocking only ever goes
129	// from 0 to 1 so there is no real race here.
130	atomic.StoreUint32(&fd.isBlocking, 1)
131	return syscall.SetNonblock(fd.Sysfd, false)
132}
133
// maxRW is the largest number of bytes handed to a single read or write
// system call on a stream descriptor; Read, Pread, Write and Pwrite
// clamp their buffers to it.
//
// Darwin and FreeBSD can't read or write 2GB+ files at a time,
// even on 64-bit systems.
// The same is true of socket implementations on many systems.
// See golang.org/issue/7812 and golang.org/issue/16266.
// Use 1GB instead of, say, 2GB-1, to keep subsequent reads aligned.
const maxRW = 1 << 30
140
141// Read implements io.Reader.
142func (fd *FD) Read(p []byte) (int, error) {
143	if err := fd.readLock(); err != nil {
144		return 0, err
145	}
146	defer fd.readUnlock()
147	if len(p) == 0 {
148		// If the caller wanted a zero byte read, return immediately
149		// without trying (but after acquiring the readLock).
150		// Otherwise syscall.Read returns 0, nil which looks like
151		// io.EOF.
152		// TODO(bradfitz): make it wait for readability? (Issue 15735)
153		return 0, nil
154	}
155	if err := fd.pd.prepareRead(fd.isFile); err != nil {
156		return 0, err
157	}
158	if fd.IsStream && len(p) > maxRW {
159		p = p[:maxRW]
160	}
161	for {
162		n, err := ignoringEINTRIO(syscall.Read, fd.Sysfd, p)
163		if err != nil {
164			n = 0
165			if err == syscall.EAGAIN && fd.pd.pollable() {
166				if err = fd.pd.waitRead(fd.isFile); err == nil {
167					continue
168				}
169			}
170		}
171		err = fd.eofError(n, err)
172		return n, err
173	}
174}
175
176// Pread wraps the pread system call.
177func (fd *FD) Pread(p []byte, off int64) (int, error) {
178	// Call incref, not readLock, because since pread specifies the
179	// offset it is independent from other reads.
180	// Similarly, using the poller doesn't make sense for pread.
181	if err := fd.incref(); err != nil {
182		return 0, err
183	}
184	if fd.IsStream && len(p) > maxRW {
185		p = p[:maxRW]
186	}
187	var (
188		n   int
189		err error
190	)
191	for {
192		n, err = syscall.Pread(fd.Sysfd, p, off)
193		if err != syscall.EINTR {
194			break
195		}
196	}
197	if err != nil {
198		n = 0
199	}
200	fd.decref()
201	err = fd.eofError(n, err)
202	return n, err
203}
204
205// ReadFrom wraps the recvfrom network call.
206func (fd *FD) ReadFrom(p []byte) (int, syscall.Sockaddr, error) {
207	if err := fd.readLock(); err != nil {
208		return 0, nil, err
209	}
210	defer fd.readUnlock()
211	if err := fd.pd.prepareRead(fd.isFile); err != nil {
212		return 0, nil, err
213	}
214	for {
215		n, sa, err := syscall.Recvfrom(fd.Sysfd, p, 0)
216		if err != nil {
217			if err == syscall.EINTR {
218				continue
219			}
220			n = 0
221			if err == syscall.EAGAIN && fd.pd.pollable() {
222				if err = fd.pd.waitRead(fd.isFile); err == nil {
223					continue
224				}
225			}
226		}
227		err = fd.eofError(n, err)
228		return n, sa, err
229	}
230}
231
232// ReadMsg wraps the recvmsg network call.
233func (fd *FD) ReadMsg(p []byte, oob []byte) (int, int, int, syscall.Sockaddr, error) {
234	if err := fd.readLock(); err != nil {
235		return 0, 0, 0, nil, err
236	}
237	defer fd.readUnlock()
238	if err := fd.pd.prepareRead(fd.isFile); err != nil {
239		return 0, 0, 0, nil, err
240	}
241	for {
242		n, oobn, flags, sa, err := syscall.Recvmsg(fd.Sysfd, p, oob, 0)
243		if err != nil {
244			if err == syscall.EINTR {
245				continue
246			}
247			// TODO(dfc) should n and oobn be set to 0
248			if err == syscall.EAGAIN && fd.pd.pollable() {
249				if err = fd.pd.waitRead(fd.isFile); err == nil {
250					continue
251				}
252			}
253		}
254		err = fd.eofError(n, err)
255		return n, oobn, flags, sa, err
256	}
257}
258
259// Write implements io.Writer.
260func (fd *FD) Write(p []byte) (int, error) {
261	if err := fd.writeLock(); err != nil {
262		return 0, err
263	}
264	defer fd.writeUnlock()
265	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
266		return 0, err
267	}
268	var nn int
269	for {
270		max := len(p)
271		if fd.IsStream && max-nn > maxRW {
272			max = nn + maxRW
273		}
274		n, err := ignoringEINTRIO(syscall.Write, fd.Sysfd, p[nn:max])
275		if n > 0 {
276			nn += n
277		}
278		if nn == len(p) {
279			return nn, err
280		}
281		if err == syscall.EAGAIN && fd.pd.pollable() {
282			if err = fd.pd.waitWrite(fd.isFile); err == nil {
283				continue
284			}
285		}
286		if err != nil {
287			return nn, err
288		}
289		if n == 0 {
290			return nn, io.ErrUnexpectedEOF
291		}
292	}
293}
294
295// Pwrite wraps the pwrite system call.
296func (fd *FD) Pwrite(p []byte, off int64) (int, error) {
297	// Call incref, not writeLock, because since pwrite specifies the
298	// offset it is independent from other writes.
299	// Similarly, using the poller doesn't make sense for pwrite.
300	if err := fd.incref(); err != nil {
301		return 0, err
302	}
303	defer fd.decref()
304	var nn int
305	for {
306		max := len(p)
307		if fd.IsStream && max-nn > maxRW {
308			max = nn + maxRW
309		}
310		n, err := syscall.Pwrite(fd.Sysfd, p[nn:max], off+int64(nn))
311		if err == syscall.EINTR {
312			continue
313		}
314		if n > 0 {
315			nn += n
316		}
317		if nn == len(p) {
318			return nn, err
319		}
320		if err != nil {
321			return nn, err
322		}
323		if n == 0 {
324			return nn, io.ErrUnexpectedEOF
325		}
326	}
327}
328
329// WriteTo wraps the sendto network call.
330func (fd *FD) WriteTo(p []byte, sa syscall.Sockaddr) (int, error) {
331	if err := fd.writeLock(); err != nil {
332		return 0, err
333	}
334	defer fd.writeUnlock()
335	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
336		return 0, err
337	}
338	for {
339		err := syscall.Sendto(fd.Sysfd, p, 0, sa)
340		if err == syscall.EINTR {
341			continue
342		}
343		if err == syscall.EAGAIN && fd.pd.pollable() {
344			if err = fd.pd.waitWrite(fd.isFile); err == nil {
345				continue
346			}
347		}
348		if err != nil {
349			return 0, err
350		}
351		return len(p), nil
352	}
353}
354
355// WriteMsg wraps the sendmsg network call.
356func (fd *FD) WriteMsg(p []byte, oob []byte, sa syscall.Sockaddr) (int, int, error) {
357	if err := fd.writeLock(); err != nil {
358		return 0, 0, err
359	}
360	defer fd.writeUnlock()
361	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
362		return 0, 0, err
363	}
364	for {
365		n, err := syscall.SendmsgN(fd.Sysfd, p, oob, sa, 0)
366		if err == syscall.EINTR {
367			continue
368		}
369		if err == syscall.EAGAIN && fd.pd.pollable() {
370			if err = fd.pd.waitWrite(fd.isFile); err == nil {
371				continue
372			}
373		}
374		if err != nil {
375			return n, 0, err
376		}
377		return n, len(oob), err
378	}
379}
380
381// Accept wraps the accept network call.
382func (fd *FD) Accept() (int, syscall.Sockaddr, string, error) {
383	if err := fd.readLock(); err != nil {
384		return -1, nil, "", err
385	}
386	defer fd.readUnlock()
387
388	if err := fd.pd.prepareRead(fd.isFile); err != nil {
389		return -1, nil, "", err
390	}
391	for {
392		s, rsa, errcall, err := accept(fd.Sysfd)
393		if err == nil {
394			return s, rsa, "", err
395		}
396		switch err {
397		case syscall.EINTR:
398			continue
399		case syscall.EAGAIN:
400			if fd.pd.pollable() {
401				if err = fd.pd.waitRead(fd.isFile); err == nil {
402					continue
403				}
404			}
405		case syscall.ECONNABORTED:
406			// This means that a socket on the listen
407			// queue was closed before we Accept()ed it;
408			// it's a silly error, so try again.
409			continue
410		}
411		return -1, nil, errcall, err
412	}
413}
414
415// Seek wraps syscall.Seek.
416func (fd *FD) Seek(offset int64, whence int) (int64, error) {
417	if err := fd.incref(); err != nil {
418		return 0, err
419	}
420	defer fd.decref()
421	return syscall.Seek(fd.Sysfd, offset, whence)
422}
423
424// ReadDirent wraps syscall.ReadDirent.
425// We treat this like an ordinary system call rather than a call
426// that tries to fill the buffer.
427func (fd *FD) ReadDirent(buf []byte) (int, error) {
428	if err := fd.incref(); err != nil {
429		return 0, err
430	}
431	defer fd.decref()
432	for {
433		n, err := ignoringEINTRIO(syscall.ReadDirent, fd.Sysfd, buf)
434		if err != nil {
435			n = 0
436			if err == syscall.EAGAIN && fd.pd.pollable() {
437				if err = fd.pd.waitRead(fd.isFile); err == nil {
438					continue
439				}
440			}
441		}
442		// Do not call eofError; caller does not expect to see io.EOF.
443		return n, err
444	}
445}
446
447// Fchmod wraps syscall.Fchmod.
448func (fd *FD) Fchmod(mode uint32) error {
449	if err := fd.incref(); err != nil {
450		return err
451	}
452	defer fd.decref()
453	return ignoringEINTR(func() error {
454		return syscall.Fchmod(fd.Sysfd, mode)
455	})
456}
457
458// Fchdir wraps syscall.Fchdir.
459func (fd *FD) Fchdir() error {
460	if err := fd.incref(); err != nil {
461		return err
462	}
463	defer fd.decref()
464	return syscall.Fchdir(fd.Sysfd)
465}
466
467// Fstat wraps syscall.Fstat
468func (fd *FD) Fstat(s *syscall.Stat_t) error {
469	if err := fd.incref(); err != nil {
470		return err
471	}
472	defer fd.decref()
473	return ignoringEINTR(func() error {
474		return syscall.Fstat(fd.Sysfd, s)
475	})
476}
477
// tryDupCloexec indicates whether F_DUPFD_CLOEXEC should be used.
// If the kernel doesn't support it, this is set to 0.
// It is read and written atomically by DupCloseOnExec, which clears it
// after fcntl fails with EINVAL or ENOSYS.
var tryDupCloexec = int32(1)
481
482// DupCloseOnExec dups fd and marks it close-on-exec.
483func DupCloseOnExec(fd int) (int, string, error) {
484	if syscall.F_DUPFD_CLOEXEC != 0 && atomic.LoadInt32(&tryDupCloexec) == 1 {
485		r0, e1 := fcntl(fd, syscall.F_DUPFD_CLOEXEC, 0)
486		if e1 == nil {
487			return r0, "", nil
488		}
489		switch e1.(syscall.Errno) {
490		case syscall.EINVAL, syscall.ENOSYS:
491			// Old kernel, or js/wasm (which returns
492			// ENOSYS). Fall back to the portable way from
493			// now on.
494			atomic.StoreInt32(&tryDupCloexec, 0)
495		default:
496			return -1, "fcntl", e1
497		}
498	}
499	return dupCloseOnExecOld(fd)
500}
501
// dupCloseOnExecOld is the traditional way to dup an fd and mark the
// copy close-on-exec, using two system calls. syscall.ForkLock is
// read-locked across both calls.
// On failure it returns -1, "dup" and the error from syscall.Dup.
func dupCloseOnExecOld(fd int) (int, string, error) {
	syscall.ForkLock.RLock()
	defer syscall.ForkLock.RUnlock()

	dupfd, err := syscall.Dup(fd)
	if err == nil {
		syscall.CloseOnExec(dupfd)
		return dupfd, "", nil
	}
	return -1, "dup", err
}
514
515// Dup duplicates the file descriptor.
516func (fd *FD) Dup() (int, string, error) {
517	if err := fd.incref(); err != nil {
518		return -1, "", err
519	}
520	defer fd.decref()
521	return DupCloseOnExec(fd.Sysfd)
522}
523
524// On Unix variants only, expose the IO event for the net code.
525
// WaitWrite waits until data can be written to fd.
// It returns the result of the poller's waitWrite call.
func (fd *FD) WaitWrite() error {
	return fd.pd.waitWrite(fd.isFile)
}
530
531// WriteOnce is for testing only. It makes a single write call.
532func (fd *FD) WriteOnce(p []byte) (int, error) {
533	if err := fd.writeLock(); err != nil {
534		return 0, err
535	}
536	defer fd.writeUnlock()
537	return ignoringEINTRIO(syscall.Write, fd.Sysfd, p)
538}
539
540// RawRead invokes the user-defined function f for a read operation.
541func (fd *FD) RawRead(f func(uintptr) bool) error {
542	if err := fd.readLock(); err != nil {
543		return err
544	}
545	defer fd.readUnlock()
546	if err := fd.pd.prepareRead(fd.isFile); err != nil {
547		return err
548	}
549	for {
550		if f(uintptr(fd.Sysfd)) {
551			return nil
552		}
553		if err := fd.pd.waitRead(fd.isFile); err != nil {
554			return err
555		}
556	}
557}
558
559// RawWrite invokes the user-defined function f for a write operation.
560func (fd *FD) RawWrite(f func(uintptr) bool) error {
561	if err := fd.writeLock(); err != nil {
562		return err
563	}
564	defer fd.writeUnlock()
565	if err := fd.pd.prepareWrite(fd.isFile); err != nil {
566		return err
567	}
568	for {
569		if f(uintptr(fd.Sysfd)) {
570			return nil
571		}
572		if err := fd.pd.waitWrite(fd.isFile); err != nil {
573			return err
574		}
575	}
576}
577
// ignoringEINTRIO is like ignoringEINTR, but just for IO calls.
// It calls fn(fd, p) repeatedly for as long as fn reports syscall.EINTR
// and returns the count and error of the first call that does not.
func ignoringEINTRIO(fn func(fd int, p []byte) (int, error), fd int, p []byte) (int, error) {
	n, err := fn(fd, p)
	for err == syscall.EINTR {
		n, err = fn(fd, p)
	}
	return n, err
}
587