// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build aix || darwin || dragonfly || freebsd || hurd || (js && wasm) || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd hurd js,wasm linux netbsd openbsd solaris

package poll

import (
	"io"
	"sync/atomic"
	"syscall"
)

// FD is a file descriptor. The net and os packages use this type as a
// field of a larger type representing a network connection or OS file.
type FD struct {
	// Lock sysfd and serialize access to Read and Write methods.
	fdmu fdMutex

	// System file descriptor. Immutable until Close.
	Sysfd int

	// I/O poller.
	pd pollDesc

	// Writev cache.
	iovecs *[]syscall.Iovec

	// Semaphore signaled when file is closed.
	// It is released by destroy and acquired by Close.
	csema uint32

	// Non-zero if this file has been set to blocking mode.
	isBlocking uint32

	// Whether this is a streaming descriptor, as opposed to a
	// packet-based descriptor like a UDP socket. Immutable.
	IsStream bool

	// Whether a zero byte read indicates EOF. This is false for a
	// message based socket connection.
	ZeroReadIsEOF bool

	// Whether this is a file rather than a network socket.
	isFile bool
}

// Init initializes the FD. The Sysfd field should already be set.
// This can be called multiple times on a single FD.
// The net argument is a network name from the net package (e.g., "tcp"),
// or "file".
// Set pollable to true if fd should be managed by runtime netpoll.
//
// When pollable is false, Init marks the FD as blocking and returns nil
// without touching the poller. Otherwise it returns the result of
// initializing the poller; if that fails, the FD is also marked as
// blocking.
func (fd *FD) Init(net string, pollable bool) error {
	// We don't actually care about the various network types.
	if net == "file" {
		fd.isFile = true
	}
	if !pollable {
		fd.isBlocking = 1
		return nil
	}
	err := fd.pd.init(fd)
	if err != nil {
		// If we could not initialize the runtime poller,
		// assume we are using blocking mode.
		fd.isBlocking = 1
	}
	return err
}

// destroy closes the file descriptor. This is called when there are
// no remaining references.
// It returns the error from CloseFunc, resets Sysfd to -1, and releases
// csema, the semaphore that Close acquires.
func (fd *FD) destroy() error {
	// Poller may want to unregister fd in readiness notification mechanism,
	// so this must be executed before CloseFunc.
	fd.pd.close()

	// We don't use ignoringEINTR here because POSIX does not define
	// whether the descriptor is closed if close returns EINTR.
	// If the descriptor is indeed closed, using a loop would race
	// with some other goroutine opening a new descriptor.
	// (The Linux kernel guarantees that it is closed on an EINTR error.)
	err := CloseFunc(fd.Sysfd)

	fd.Sysfd = -1
	runtime_Semrelease(&fd.csema)
	return err
}

// Close closes the FD. The underlying file descriptor is closed by the
// destroy method when there are no remaining references.
// If increfAndClose reports failure, Close returns errClosing(fd.isFile)
// without doing anything else; otherwise it returns the result of decref.
func (fd *FD) Close() error {
	if !fd.fdmu.increfAndClose() {
		return errClosing(fd.isFile)
	}

	// Unblock any I/O. Once it all unblocks and returns,
	// so that it cannot be referring to fd.Sysfd anymore,
	// the final decref will close fd.Sysfd. This should happen
	// fairly quickly, since all the I/O is non-blocking, and any
	// attempts to block in the pollDesc will return errClosing(fd.isFile).
	fd.pd.evict()

	// The call to decref will call destroy if there are no other
	// references.
	err := fd.decref()

	// Wait until the descriptor is closed. If this was the only
	// reference, it is already closed. Only wait if the file has
	// not been set to blocking mode, as otherwise any current I/O
	// may be blocking, and that would block the Close.
	// No need for an atomic read of isBlocking, increfAndClose means
	// we have exclusive access to fd.
	if fd.isBlocking == 0 {
		runtime_Semacquire(&fd.csema)
	}

	return err
}

// SetBlocking puts the file into blocking mode.
123func (fd *FD) SetBlocking() error { 124 if err := fd.incref(); err != nil { 125 return err 126 } 127 defer fd.decref() 128 // Atomic store so that concurrent calls to SetBlocking 129 // do not cause a race condition. isBlocking only ever goes 130 // from 0 to 1 so there is no real race here. 131 atomic.StoreUint32(&fd.isBlocking, 1) 132 return syscall.SetNonblock(fd.Sysfd, false) 133} 134 135// Darwin and FreeBSD can't read or write 2GB+ files at a time, 136// even on 64-bit systems. 137// The same is true of socket implementations on many systems. 138// See golang.org/issue/7812 and golang.org/issue/16266. 139// Use 1GB instead of, say, 2GB-1, to keep subsequent reads aligned. 140const maxRW = 1 << 30 141 142// Read implements io.Reader. 143func (fd *FD) Read(p []byte) (int, error) { 144 if err := fd.readLock(); err != nil { 145 return 0, err 146 } 147 defer fd.readUnlock() 148 if len(p) == 0 { 149 // If the caller wanted a zero byte read, return immediately 150 // without trying (but after acquiring the readLock). 151 // Otherwise syscall.Read returns 0, nil which looks like 152 // io.EOF. 153 // TODO(bradfitz): make it wait for readability? (Issue 15735) 154 return 0, nil 155 } 156 if err := fd.pd.prepareRead(fd.isFile); err != nil { 157 return 0, err 158 } 159 if fd.IsStream && len(p) > maxRW { 160 p = p[:maxRW] 161 } 162 for { 163 n, err := ignoringEINTRIO(syscall.Read, fd.Sysfd, p) 164 if err != nil { 165 n = 0 166 if err == syscall.EAGAIN && fd.pd.pollable() { 167 if err = fd.pd.waitRead(fd.isFile); err == nil { 168 continue 169 } 170 } 171 } 172 err = fd.eofError(n, err) 173 return n, err 174 } 175} 176 177// Pread wraps the pread system call. 178func (fd *FD) Pread(p []byte, off int64) (int, error) { 179 // Call incref, not readLock, because since pread specifies the 180 // offset it is independent from other reads. 181 // Similarly, using the poller doesn't make sense for pread. 
182 if err := fd.incref(); err != nil { 183 return 0, err 184 } 185 if fd.IsStream && len(p) > maxRW { 186 p = p[:maxRW] 187 } 188 var ( 189 n int 190 err error 191 ) 192 for { 193 n, err = syscall.Pread(fd.Sysfd, p, off) 194 if err != syscall.EINTR { 195 break 196 } 197 } 198 if err != nil { 199 n = 0 200 } 201 fd.decref() 202 err = fd.eofError(n, err) 203 return n, err 204} 205 206// ReadFrom wraps the recvfrom network call. 207func (fd *FD) ReadFrom(p []byte) (int, syscall.Sockaddr, error) { 208 if err := fd.readLock(); err != nil { 209 return 0, nil, err 210 } 211 defer fd.readUnlock() 212 if err := fd.pd.prepareRead(fd.isFile); err != nil { 213 return 0, nil, err 214 } 215 for { 216 n, sa, err := syscall.Recvfrom(fd.Sysfd, p, 0) 217 if err != nil { 218 if err == syscall.EINTR { 219 continue 220 } 221 n = 0 222 if err == syscall.EAGAIN && fd.pd.pollable() { 223 if err = fd.pd.waitRead(fd.isFile); err == nil { 224 continue 225 } 226 } 227 } 228 err = fd.eofError(n, err) 229 return n, sa, err 230 } 231} 232 233// ReadMsg wraps the recvmsg network call. 234func (fd *FD) ReadMsg(p []byte, oob []byte, flags int) (int, int, int, syscall.Sockaddr, error) { 235 if err := fd.readLock(); err != nil { 236 return 0, 0, 0, nil, err 237 } 238 defer fd.readUnlock() 239 if err := fd.pd.prepareRead(fd.isFile); err != nil { 240 return 0, 0, 0, nil, err 241 } 242 for { 243 n, oobn, sysflags, sa, err := syscall.Recvmsg(fd.Sysfd, p, oob, flags) 244 if err != nil { 245 if err == syscall.EINTR { 246 continue 247 } 248 // TODO(dfc) should n and oobn be set to 0 249 if err == syscall.EAGAIN && fd.pd.pollable() { 250 if err = fd.pd.waitRead(fd.isFile); err == nil { 251 continue 252 } 253 } 254 } 255 err = fd.eofError(n, err) 256 return n, oobn, sysflags, sa, err 257 } 258} 259 260// Write implements io.Writer. 
261func (fd *FD) Write(p []byte) (int, error) { 262 if err := fd.writeLock(); err != nil { 263 return 0, err 264 } 265 defer fd.writeUnlock() 266 if err := fd.pd.prepareWrite(fd.isFile); err != nil { 267 return 0, err 268 } 269 var nn int 270 for { 271 max := len(p) 272 if fd.IsStream && max-nn > maxRW { 273 max = nn + maxRW 274 } 275 n, err := ignoringEINTRIO(syscall.Write, fd.Sysfd, p[nn:max]) 276 if n > 0 { 277 nn += n 278 } 279 if nn == len(p) { 280 return nn, err 281 } 282 if err == syscall.EAGAIN && fd.pd.pollable() { 283 if err = fd.pd.waitWrite(fd.isFile); err == nil { 284 continue 285 } 286 } 287 if err != nil { 288 return nn, err 289 } 290 if n == 0 { 291 return nn, io.ErrUnexpectedEOF 292 } 293 } 294} 295 296// Pwrite wraps the pwrite system call. 297func (fd *FD) Pwrite(p []byte, off int64) (int, error) { 298 // Call incref, not writeLock, because since pwrite specifies the 299 // offset it is independent from other writes. 300 // Similarly, using the poller doesn't make sense for pwrite. 301 if err := fd.incref(); err != nil { 302 return 0, err 303 } 304 defer fd.decref() 305 var nn int 306 for { 307 max := len(p) 308 if fd.IsStream && max-nn > maxRW { 309 max = nn + maxRW 310 } 311 n, err := syscall.Pwrite(fd.Sysfd, p[nn:max], off+int64(nn)) 312 if err == syscall.EINTR { 313 continue 314 } 315 if n > 0 { 316 nn += n 317 } 318 if nn == len(p) { 319 return nn, err 320 } 321 if err != nil { 322 return nn, err 323 } 324 if n == 0 { 325 return nn, io.ErrUnexpectedEOF 326 } 327 } 328} 329 330// WriteTo wraps the sendto network call. 
331func (fd *FD) WriteTo(p []byte, sa syscall.Sockaddr) (int, error) { 332 if err := fd.writeLock(); err != nil { 333 return 0, err 334 } 335 defer fd.writeUnlock() 336 if err := fd.pd.prepareWrite(fd.isFile); err != nil { 337 return 0, err 338 } 339 for { 340 err := syscall.Sendto(fd.Sysfd, p, 0, sa) 341 if err == syscall.EINTR { 342 continue 343 } 344 if err == syscall.EAGAIN && fd.pd.pollable() { 345 if err = fd.pd.waitWrite(fd.isFile); err == nil { 346 continue 347 } 348 } 349 if err != nil { 350 return 0, err 351 } 352 return len(p), nil 353 } 354} 355 356// WriteMsg wraps the sendmsg network call. 357func (fd *FD) WriteMsg(p []byte, oob []byte, sa syscall.Sockaddr) (int, int, error) { 358 if err := fd.writeLock(); err != nil { 359 return 0, 0, err 360 } 361 defer fd.writeUnlock() 362 if err := fd.pd.prepareWrite(fd.isFile); err != nil { 363 return 0, 0, err 364 } 365 for { 366 n, err := syscall.SendmsgN(fd.Sysfd, p, oob, sa, 0) 367 if err == syscall.EINTR { 368 continue 369 } 370 if err == syscall.EAGAIN && fd.pd.pollable() { 371 if err = fd.pd.waitWrite(fd.isFile); err == nil { 372 continue 373 } 374 } 375 if err != nil { 376 return n, 0, err 377 } 378 return n, len(oob), err 379 } 380} 381 382// Accept wraps the accept network call. 383func (fd *FD) Accept() (int, syscall.Sockaddr, string, error) { 384 if err := fd.readLock(); err != nil { 385 return -1, nil, "", err 386 } 387 defer fd.readUnlock() 388 389 if err := fd.pd.prepareRead(fd.isFile); err != nil { 390 return -1, nil, "", err 391 } 392 for { 393 s, rsa, errcall, err := accept(fd.Sysfd) 394 if err == nil { 395 return s, rsa, "", err 396 } 397 switch err { 398 case syscall.EINTR: 399 continue 400 case syscall.EAGAIN: 401 if fd.pd.pollable() { 402 if err = fd.pd.waitRead(fd.isFile); err == nil { 403 continue 404 } 405 } 406 case syscall.ECONNABORTED: 407 // This means that a socket on the listen 408 // queue was closed before we Accept()ed it; 409 // it's a silly error, so try again. 
410 continue 411 } 412 return -1, nil, errcall, err 413 } 414} 415 416// Seek wraps syscall.Seek. 417func (fd *FD) Seek(offset int64, whence int) (int64, error) { 418 if err := fd.incref(); err != nil { 419 return 0, err 420 } 421 defer fd.decref() 422 return syscall.Seek(fd.Sysfd, offset, whence) 423} 424 425// ReadDirent wraps syscall.ReadDirent. 426// We treat this like an ordinary system call rather than a call 427// that tries to fill the buffer. 428func (fd *FD) ReadDirent(buf []byte) (int, error) { 429 if err := fd.incref(); err != nil { 430 return 0, err 431 } 432 defer fd.decref() 433 for { 434 n, err := ignoringEINTRIO(syscall.ReadDirent, fd.Sysfd, buf) 435 if err != nil { 436 n = 0 437 if err == syscall.EAGAIN && fd.pd.pollable() { 438 if err = fd.pd.waitRead(fd.isFile); err == nil { 439 continue 440 } 441 } 442 } 443 // Do not call eofError; caller does not expect to see io.EOF. 444 return n, err 445 } 446} 447 448// Fchmod wraps syscall.Fchmod. 449func (fd *FD) Fchmod(mode uint32) error { 450 if err := fd.incref(); err != nil { 451 return err 452 } 453 defer fd.decref() 454 return ignoringEINTR(func() error { 455 return syscall.Fchmod(fd.Sysfd, mode) 456 }) 457} 458 459// Fchdir wraps syscall.Fchdir. 460func (fd *FD) Fchdir() error { 461 if err := fd.incref(); err != nil { 462 return err 463 } 464 defer fd.decref() 465 return syscall.Fchdir(fd.Sysfd) 466} 467 468// Fstat wraps syscall.Fstat 469func (fd *FD) Fstat(s *syscall.Stat_t) error { 470 if err := fd.incref(); err != nil { 471 return err 472 } 473 defer fd.decref() 474 return ignoringEINTR(func() error { 475 return syscall.Fstat(fd.Sysfd, s) 476 }) 477} 478 479// tryDupCloexec indicates whether F_DUPFD_CLOEXEC should be used. 480// If the kernel doesn't support it, this is set to 0. 481var tryDupCloexec = int32(1) 482 483// DupCloseOnExec dups fd and marks it close-on-exec. 
484func DupCloseOnExec(fd int) (int, string, error) { 485 if syscall.F_DUPFD_CLOEXEC != 0 && atomic.LoadInt32(&tryDupCloexec) == 1 { 486 r0, e1 := fcntl(fd, syscall.F_DUPFD_CLOEXEC, 0) 487 if e1 == nil { 488 return r0, "", nil 489 } 490 switch e1.(syscall.Errno) { 491 case syscall.EINVAL, syscall.ENOSYS: 492 // Old kernel, or js/wasm (which returns 493 // ENOSYS). Fall back to the portable way from 494 // now on. 495 atomic.StoreInt32(&tryDupCloexec, 0) 496 default: 497 return -1, "fcntl", e1 498 } 499 } 500 return dupCloseOnExecOld(fd) 501} 502 503// dupCloseOnExecOld is the traditional way to dup an fd and 504// set its O_CLOEXEC bit, using two system calls. 505func dupCloseOnExecOld(fd int) (int, string, error) { 506 syscall.ForkLock.RLock() 507 defer syscall.ForkLock.RUnlock() 508 newfd, err := syscall.Dup(fd) 509 if err != nil { 510 return -1, "dup", err 511 } 512 syscall.CloseOnExec(newfd) 513 return newfd, "", nil 514} 515 516// Dup duplicates the file descriptor. 517func (fd *FD) Dup() (int, string, error) { 518 if err := fd.incref(); err != nil { 519 return -1, "", err 520 } 521 defer fd.decref() 522 return DupCloseOnExec(fd.Sysfd) 523} 524 525// On Unix variants only, expose the IO event for the net code. 526 527// WaitWrite waits until data can be read from fd. 528func (fd *FD) WaitWrite() error { 529 return fd.pd.waitWrite(fd.isFile) 530} 531 532// WriteOnce is for testing only. It makes a single write call. 533func (fd *FD) WriteOnce(p []byte) (int, error) { 534 if err := fd.writeLock(); err != nil { 535 return 0, err 536 } 537 defer fd.writeUnlock() 538 return ignoringEINTRIO(syscall.Write, fd.Sysfd, p) 539} 540 541// RawRead invokes the user-defined function f for a read operation. 
542func (fd *FD) RawRead(f func(uintptr) bool) error { 543 if err := fd.readLock(); err != nil { 544 return err 545 } 546 defer fd.readUnlock() 547 if err := fd.pd.prepareRead(fd.isFile); err != nil { 548 return err 549 } 550 for { 551 if f(uintptr(fd.Sysfd)) { 552 return nil 553 } 554 if err := fd.pd.waitRead(fd.isFile); err != nil { 555 return err 556 } 557 } 558} 559 560// RawWrite invokes the user-defined function f for a write operation. 561func (fd *FD) RawWrite(f func(uintptr) bool) error { 562 if err := fd.writeLock(); err != nil { 563 return err 564 } 565 defer fd.writeUnlock() 566 if err := fd.pd.prepareWrite(fd.isFile); err != nil { 567 return err 568 } 569 for { 570 if f(uintptr(fd.Sysfd)) { 571 return nil 572 } 573 if err := fd.pd.waitWrite(fd.isFile); err != nil { 574 return err 575 } 576 } 577} 578 579// ignoringEINTRIO is like ignoringEINTR, but just for IO calls. 580func ignoringEINTRIO(fn func(fd int, p []byte) (int, error), fd int, p []byte) (int, error) { 581 for { 582 n, err := fn(fd, p) 583 if err != syscall.EINTR { 584 return n, err 585 } 586 } 587} 588