1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build aix darwin dragonfly freebsd hurd js,wasm linux netbsd openbsd solaris windows
6
7package runtime
8
9import (
10	"runtime/internal/atomic"
11	"unsafe"
12)
13
14// Export temporarily for gccgo's C code to call:
15//go:linkname netpoll
16
17// Integrated network poller (platform-independent part).
18// A particular implementation (epoll/kqueue/port/AIX/Windows)
19// must define the following functions:
20//
21// func netpollinit()
22//     Initialize the poller. Only called once.
23//
24// func netpollopen(fd uintptr, pd *pollDesc) int32
25//     Arm edge-triggered notifications for fd. The pd argument is to pass
26//     back to netpollready when fd is ready. Return an errno value.
27//
28// func netpoll(delta int64) gList
29//     Poll the network. If delta < 0, block indefinitely. If delta == 0,
30//     poll without blocking. If delta > 0, block for up to delta nanoseconds.
31//     Return a list of goroutines built by calling netpollready.
32//
33// func netpollBreak()
34//     Wake up the network poller, assumed to be blocked in netpoll.
35//
36// func netpollIsPollDescriptor(fd uintptr) bool
37//     Reports whether fd is a file descriptor used by the poller.
38
// pollDesc contains 2 binary semaphores, rg and wg, to park reader and writer
// goroutines respectively. Each semaphore can be in one of the following states:
// pdReady - io readiness notification is pending;
//           a goroutine consumes the notification by changing the state to nil.
// pdWait - a goroutine prepares to park on the semaphore, but is not yet parked;
//          the goroutine commits to park by changing the state to G pointer,
//          or, alternatively, concurrent io notification changes the state to pdReady,
//          or, alternatively, concurrent timeout/close changes the state to nil.
// G pointer - the goroutine is blocked on the semaphore;
//             io notification or timeout/close changes the state to pdReady or nil respectively
//             and unparks the goroutine.
// nil - nothing of the above (stored as 0).
const (
	// pdReady: an io readiness notification is pending on the semaphore.
	pdReady uintptr = iota + 1
	// pdWait: a goroutine is preparing to park on the semaphore.
	pdWait
)
55
// pollBlockSize is the target size, in bytes, of each chunk of pollDesc
// objects that pollCache.alloc requests when its free list is empty.
const pollBlockSize = 4 << 10
57
// Network poller descriptor.
//
// One pollDesc tracks the poller state of a single fd: the read and write
// semaphores (rg/wg), the read and write deadlines (rd/wd) with their timers
// (rt/wt), and the sequence numbers (rseq/wseq) used to discard stale timers.
//
// No heap pointers.
//
//go:notinheap
type pollDesc struct {
	link *pollDesc // in pollcache, protected by pollcache.lock

	// The lock protects pollOpen, pollSetDeadline, pollUnblock and deadlineimpl operations.
	// This fully covers the rseq, wseq, rt and wt variables. fd is constant throughout the pollDesc lifetime.
	// pollReset, pollWait, pollWaitCanceled and netpollready (IO readiness notification)
	// proceed w/o taking the lock. So closing, everr, rg, rd, wg and wd are manipulated
	// in a lock-free way by all operations.
	// NOTE(dvyukov): the following code uses uintptr to store *g (rg/wg),
	// that will blow up when GC starts moving objects.
	lock    mutex // protects the following fields (subject to the lock-free exceptions above)
	fd      uintptr
	closing bool
	everr   bool    // marks event scanning error happened
	user    uint32  // user settable cookie
	rseq    uintptr // protects from stale read timers
	rg      uintptr // pdReady, pdWait, G waiting for read or nil
	rt      timer   // read deadline timer (set if rt.f != nil)
	rd      int64   // read deadline
	wseq    uintptr // protects from stale write timers
	wg      uintptr // pdReady, pdWait, G waiting for write or nil
	wt      timer   // write deadline timer (set if wt.f != nil)
	wd      int64   // write deadline
}
87
// pollCache is a free list of pollDesc objects. first is the head of the
// list, descriptors are chained through pollDesc.link, and lock guards both.
type pollCache struct {
	lock  mutex
	first *pollDesc
	// PollDesc objects must be type-stable,
	// because we can get ready notification from epoll/kqueue
	// after the descriptor is closed/reused.
	// Stale notifications are detected using the seq variables (rseq/wseq),
	// which are incremented when deadlines are changed or the descriptor is reused.
}
97
98var (
99	netpollInitLock mutex
100	netpollInited   uint32
101
102	pollcache      pollCache
103	netpollWaiters uint32
104)
105
// poll_runtime_pollServerInit is exported under the linkname below as
// runtime_pollServerInit. It makes sure the network poller is initialized by
// delegating to netpollGenericInit, which is a no-op after the first call.
//
//go:linkname poll_runtime_pollServerInit internal..z2fpoll.runtime_pollServerInit
func poll_runtime_pollServerInit() {
	netpollGenericInit()
}
110
111func netpollGenericInit() {
112	if atomic.Load(&netpollInited) == 0 {
113		lock(&netpollInitLock)
114		if netpollInited == 0 {
115			netpollinit()
116			atomic.Store(&netpollInited, 1)
117		}
118		unlock(&netpollInitLock)
119	}
120}
121
// netpollinited reports whether the network poller has been initialized,
// i.e. whether netpollGenericInit has already stored a non-zero value into
// netpollInited.
func netpollinited() bool {
	return atomic.Load(&netpollInited) != 0
}
125
//go:linkname poll_runtime_isPollServerDescriptor internal..z2fpoll.runtime_isPollServerDescriptor

// poll_runtime_isPollServerDescriptor reports whether fd is a
// descriptor being used by netpoll. It forwards to the platform-specific
// netpollIsPollDescriptor and is exported under the linkname above.
func poll_runtime_isPollServerDescriptor(fd uintptr) bool {
	return netpollIsPollDescriptor(fd)
}
133
134//go:linkname poll_runtime_pollOpen internal..z2fpoll.runtime_pollOpen
135func poll_runtime_pollOpen(fd uintptr) (uintptr, int) {
136	pd := pollcache.alloc()
137	lock(&pd.lock)
138	if pd.wg != 0 && pd.wg != pdReady {
139		throw("runtime: blocked write on free polldesc")
140	}
141	if pd.rg != 0 && pd.rg != pdReady {
142		throw("runtime: blocked read on free polldesc")
143	}
144	pd.fd = fd
145	pd.closing = false
146	pd.everr = false
147	pd.rseq++
148	pd.rg = 0
149	pd.rd = 0
150	pd.wseq++
151	pd.wg = 0
152	pd.wd = 0
153	unlock(&pd.lock)
154
155	var errno int32
156	errno = netpollopen(fd, pd)
157	return uintptr(unsafe.Pointer(pd)), int(errno)
158}
159
160//go:linkname poll_runtime_pollClose internal..z2fpoll.runtime_pollClose
161func poll_runtime_pollClose(ctx uintptr) {
162	pd := (*pollDesc)(unsafe.Pointer(ctx))
163	if !pd.closing {
164		throw("runtime: close polldesc w/o unblock")
165	}
166	if pd.wg != 0 && pd.wg != pdReady {
167		throw("runtime: blocked write on closing polldesc")
168	}
169	if pd.rg != 0 && pd.rg != pdReady {
170		throw("runtime: blocked read on closing polldesc")
171	}
172	netpollclose(pd.fd)
173	pollcache.free(pd)
174}
175
// free returns pd to the cache by pushing it onto the front of the free
// list. The descriptor's memory is not released; alloc hands it out again.
func (c *pollCache) free(pd *pollDesc) {
	lock(&c.lock)
	pd.link = c.first // chain to the old head before making pd the new head
	c.first = pd
	unlock(&c.lock)
}
182
183//go:linkname poll_runtime_pollReset internal..z2fpoll.runtime_pollReset
184func poll_runtime_pollReset(ctx uintptr, mode int) int {
185	pd := (*pollDesc)(unsafe.Pointer(ctx))
186	err := netpollcheckerr(pd, int32(mode))
187	if err != 0 {
188		return err
189	}
190	if mode == 'r' {
191		pd.rg = 0
192	} else if mode == 'w' {
193		pd.wg = 0
194	}
195	return 0
196}
197
198//go:linkname poll_runtime_pollWait internal..z2fpoll.runtime_pollWait
199func poll_runtime_pollWait(ctx uintptr, mode int) int {
200	pd := (*pollDesc)(unsafe.Pointer(ctx))
201	err := netpollcheckerr(pd, int32(mode))
202	if err != 0 {
203		return err
204	}
205	// As for now only Solaris, illumos, and AIX use level-triggered IO.
206	if GOOS == "solaris" || GOOS == "illumos" || GOOS == "aix" || GOOS == "hurd" {
207		netpollarm(pd, mode)
208	}
209	for !netpollblock(pd, int32(mode), false) {
210		err = netpollcheckerr(pd, int32(mode))
211		if err != 0 {
212			return err
213		}
214		// Can happen if timeout has fired and unblocked us,
215		// but before we had a chance to run, timeout has been reset.
216		// Pretend it has not happened and retry.
217	}
218	return 0
219}
220
221//go:linkname poll_runtime_pollWaitCanceled internal..z2fpoll.runtime_pollWaitCanceled
222func poll_runtime_pollWaitCanceled(ctx uintptr, mode int) {
223	pd := (*pollDesc)(unsafe.Pointer(ctx))
224	// This function is used only on windows after a failed attempt to cancel
225	// a pending async IO operation. Wait for ioready, ignore closing or timeouts.
226	for !netpollblock(pd, int32(mode), true) {
227	}
228}
229
// poll_runtime_pollSetDeadline updates the read and/or write deadline of the
// descriptor behind ctx and (re)arms, modifies or deletes the deadline timers.
//
// d is interpreted relative to the current nanotime: d > 0 sets a deadline d
// in the future (saturating at the maximum int64 on overflow), d == 0 leaves
// the affected side with no deadline, and d < 0 marks the deadline as already
// expired, which unblocks any goroutine currently waiting in that direction.
// mode is 'r', 'w' or 'r'+'w' and selects which deadline(s) to change.
// The call is a no-op if the descriptor is already closing.
//
//go:linkname poll_runtime_pollSetDeadline internal..z2fpoll.runtime_pollSetDeadline
func poll_runtime_pollSetDeadline(ctx uintptr, d int64, mode int) {
	pd := (*pollDesc)(unsafe.Pointer(ctx))
	lock(&pd.lock)
	if pd.closing {
		unlock(&pd.lock)
		return
	}
	// Remember the previous deadlines, and whether a single combined timer
	// served both directions, so that unchanged timers can be left alone.
	rd0, wd0 := pd.rd, pd.wd
	combo0 := rd0 > 0 && rd0 == wd0
	if d > 0 {
		d += nanotime()
		if d <= 0 {
			// If the user has a deadline in the future, but the delay calculation
			// overflows, then set the deadline to the maximum possible value.
			d = 1<<63 - 1
		}
	}
	if mode == 'r' || mode == 'r'+'w' {
		pd.rd = d
	}
	if mode == 'w' || mode == 'r'+'w' {
		pd.wd = d
	}
	// When both deadlines are set and equal, the read timer alone handles
	// both directions (netpollDeadline) and no write timer is armed.
	combo := pd.rd > 0 && pd.rd == pd.wd
	rtf := netpollReadDeadline
	if combo {
		rtf = netpollDeadline
	}
	if pd.rt.f == nil {
		// No read timer is currently armed.
		if pd.rd > 0 {
			pd.rt.f = rtf
			// Copy current seq into the timer arg.
			// Timer func will check the seq against current descriptor seq,
			// if they differ the descriptor was reused or timers were reset.
			pd.rt.arg = pd
			pd.rt.seq = pd.rseq
			resettimer(&pd.rt, pd.rd)
		}
	} else if pd.rd != rd0 || combo != combo0 {
		pd.rseq++ // invalidate current timers
		if pd.rd > 0 {
			modtimer(&pd.rt, pd.rd, 0, rtf, pd, pd.rseq)
		} else {
			deltimer(&pd.rt)
			pd.rt.f = nil
		}
	}
	if pd.wt.f == nil {
		// No write timer is currently armed; arm one only if the combined
		// timer is not already covering the write deadline.
		if pd.wd > 0 && !combo {
			pd.wt.f = netpollWriteDeadline
			pd.wt.arg = pd
			pd.wt.seq = pd.wseq
			resettimer(&pd.wt, pd.wd)
		}
	} else if pd.wd != wd0 || combo != combo0 {
		pd.wseq++ // invalidate current timers
		if pd.wd > 0 && !combo {
			modtimer(&pd.wt, pd.wd, 0, netpollWriteDeadline, pd, pd.wseq)
		} else {
			deltimer(&pd.wt)
			pd.wt.f = nil
		}
	}
	// If we set the new deadline in the past, unblock currently pending IO if any.
	var rg, wg *g
	if pd.rd < 0 || pd.wd < 0 {
		atomic.StorepNoWB(noescape(unsafe.Pointer(&wg)), nil) // full memory barrier between stores to rd/wd and load of rg/wg in netpollunblock
		if pd.rd < 0 {
			rg = netpollunblock(pd, 'r', false)
		}
		if pd.wd < 0 {
			wg = netpollunblock(pd, 'w', false)
		}
	}
	unlock(&pd.lock)
	// Ready the unblocked goroutines only after pd.lock has been released.
	if rg != nil {
		netpollgoready(rg, 3)
	}
	if wg != nil {
		netpollgoready(wg, 3)
	}
}
313
// poll_runtime_pollUnblock marks the descriptor behind ctx as closing, wakes
// any goroutine blocked on its read or write semaphore, and deletes both
// deadline timers. Calling it on a descriptor that is already closing is a
// fatal runtime error.
//
//go:linkname poll_runtime_pollUnblock internal..z2fpoll.runtime_pollUnblock
func poll_runtime_pollUnblock(ctx uintptr) {
	pd := (*pollDesc)(unsafe.Pointer(ctx))
	lock(&pd.lock)
	if pd.closing {
		throw("runtime: unblock on closing polldesc")
	}
	pd.closing = true
	// Bump both sequence numbers so that timers already armed are treated as
	// stale by netpolldeadlineimpl.
	pd.rseq++
	pd.wseq++
	var rg, wg *g
	atomic.StorepNoWB(noescape(unsafe.Pointer(&rg)), nil) // full memory barrier between store to closing and read of rg/wg in netpollunblock
	rg = netpollunblock(pd, 'r', false)
	wg = netpollunblock(pd, 'w', false)
	if pd.rt.f != nil {
		deltimer(&pd.rt)
		pd.rt.f = nil
	}
	if pd.wt.f != nil {
		deltimer(&pd.wt)
		pd.wt.f = nil
	}
	unlock(&pd.lock)
	// Ready the unblocked goroutines only after pd.lock has been released.
	if rg != nil {
		netpollgoready(rg, 3)
	}
	if wg != nil {
		netpollgoready(wg, 3)
	}
}
344
345// netpollready is called by the platform-specific netpoll function.
346// It declares that the fd associated with pd is ready for I/O.
347// The toRun argument is used to build a list of goroutines to return
348// from netpoll. The mode argument is 'r', 'w', or 'r'+'w' to indicate
349// whether the fd is ready for reading or writing or both.
350//
351// This may run while the world is stopped, so write barriers are not allowed.
352//go:nowritebarrier
353func netpollready(toRun *gList, pd *pollDesc, mode int32) {
354	var rg, wg *g
355	if mode == 'r' || mode == 'r'+'w' {
356		rg = netpollunblock(pd, 'r', true)
357	}
358	if mode == 'w' || mode == 'r'+'w' {
359		wg = netpollunblock(pd, 'w', true)
360	}
361	if rg != nil {
362		toRun.push(rg)
363	}
364	if wg != nil {
365		toRun.push(wg)
366	}
367}
368
369func netpollcheckerr(pd *pollDesc, mode int32) int {
370	if pd.closing {
371		return 1 // ErrFileClosing or ErrNetClosing
372	}
373	if (mode == 'r' && pd.rd < 0) || (mode == 'w' && pd.wd < 0) {
374		return 2 // ErrTimeout
375	}
376	// Report an event scanning error only on a read event.
377	// An error on a write event will be captured in a subsequent
378	// write call that is able to report a more specific error.
379	if mode == 'r' && pd.everr {
380		return 3 // ErrNotPollable
381	}
382	return 0
383}
384
385func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool {
386	r := atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
387	if r {
388		// Bump the count of goroutines waiting for the poller.
389		// The scheduler uses this to decide whether to block
390		// waiting for the poller if there is nothing else to do.
391		atomic.Xadd(&netpollWaiters, 1)
392	}
393	return r
394}
395
// netpollgoready makes gp runnable again via goready. Before doing so it
// decrements netpollWaiters, undoing the increment made in netpollblockcommit.
// traceskip is forwarded to goready incremented by one
// (presumably to account for this wrapper's own frame — confirm against goready).
func netpollgoready(gp *g, traceskip int) {
	atomic.Xadd(&netpollWaiters, -1)
	goready(gp, traceskip+1)
}
400
// netpollblock parks the calling goroutine on pd's read semaphore, or on the
// write semaphore when mode is 'w', until it is woken.
//
// returns true if IO is ready, or false if timedout or closed
// waitio - wait only for completed IO, ignore errors
//
// A pending pdReady notification is consumed without parking. Finding any
// other non-zero state on entry means a second waiter and is a fatal error.
func netpollblock(pd *pollDesc, mode int32, waitio bool) bool {
	gpp := &pd.rg
	if mode == 'w' {
		gpp = &pd.wg
	}

	// set the gpp semaphore to WAIT
	for {
		old := *gpp
		if old == pdReady {
			// Consume the pending notification and report readiness.
			*gpp = 0
			return true
		}
		if old != 0 {
			throw("runtime: double wait")
		}
		if atomic.Casuintptr(gpp, 0, pdWait) {
			break
		}
	}

	// need to recheck error states after setting gpp to WAIT
	// this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl
	// do the opposite: store to closing/rd/wd, membarrier, load of rg/wg
	if waitio || netpollcheckerr(pd, mode) == 0 {
		// netpollblockcommit swaps pdWait for the goroutine pointer; gpp is
		// passed through as its argument.
		gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5)
	}
	// be careful to not lose concurrent READY notification
	old := atomic.Xchguintptr(gpp, 0)
	if old > pdWait {
		// Anything above pdWait would be a goroutine pointer still stored here.
		throw("runtime: corrupted polldesc")
	}
	return old == pdReady
}
437
// netpollunblock releases pd's read semaphore, or the write semaphore when
// mode is 'w', and returns the goroutine that was parked on it, if any.
//
// With ioready set the semaphore is left in pdReady so that the notification
// is not lost; otherwise (timeout/close) it is reset to 0. The result is nil
// when no goroutine was parked: the semaphore was already pdReady, was 0 with
// ioready unset, or held pdWait. The caller is responsible for readying the
// returned goroutine.
func netpollunblock(pd *pollDesc, mode int32, ioready bool) *g {
	gpp := &pd.rg
	if mode == 'w' {
		gpp = &pd.wg
	}

	for {
		old := *gpp
		if old == pdReady {
			return nil
		}
		if old == 0 && !ioready {
			// Only set READY for ioready. runtime_pollWait
			// will check for timeout/cancel before waiting.
			return nil
		}
		var new uintptr // stays 0 for timeout/close
		if ioready {
			new = pdReady
		}
		// Retry from the top if the semaphore changed since it was read.
		if atomic.Casuintptr(gpp, old, new) {
			// old cannot be pdReady here because of the early return above;
			// pdWait means the goroutine had not parked yet, so there is
			// nothing to wake.
			if old == pdReady || old == pdWait {
				old = 0
			}
			return (*g)(unsafe.Pointer(old))
		}
	}
}
466
// netpolldeadlineimpl is the shared body of the deadline timer callbacks.
// seq is the sequence number captured when the timer was armed; read and
// write select which deadline(s) the firing timer stands for.
//
// If seq no longer matches the descriptor's current sequence number the
// event is stale and is ignored. Otherwise each selected deadline is marked
// expired (set to -1), its timer function is cleared, and a goroutine blocked
// in that direction is woken.
func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) {
	lock(&pd.lock)
	// Seq arg is seq when the timer was set.
	// If it's stale, ignore the timer event.
	// The read sequence is used whenever read is set, including for the
	// combined read+write timer.
	currentSeq := pd.rseq
	if !read {
		currentSeq = pd.wseq
	}
	if seq != currentSeq {
		// The descriptor was reused or timers were reset.
		unlock(&pd.lock)
		return
	}
	var rg *g
	if read {
		if pd.rd <= 0 || pd.rt.f == nil {
			throw("runtime: inconsistent read deadline")
		}
		pd.rd = -1
		atomic.StorepNoWB(unsafe.Pointer(&pd.rt.f), nil) // full memory barrier between store to rd and load of rg in netpollunblock
		rg = netpollunblock(pd, 'r', false)
	}
	var wg *g
	if write {
		// && binds tighter than ||: a missing write timer is only an error
		// when this is not the combined timer, which uses rt instead of wt.
		if pd.wd <= 0 || pd.wt.f == nil && !read {
			throw("runtime: inconsistent write deadline")
		}
		pd.wd = -1
		atomic.StorepNoWB(unsafe.Pointer(&pd.wt.f), nil) // full memory barrier between store to wd and load of wg in netpollunblock
		wg = netpollunblock(pd, 'w', false)
	}
	unlock(&pd.lock)
	// Ready the unblocked goroutines only after pd.lock has been released.
	if rg != nil {
		netpollgoready(rg, 0)
	}
	if wg != nil {
		netpollgoready(wg, 0)
	}
}
506
507func netpollDeadline(arg interface{}, seq uintptr) {
508	netpolldeadlineimpl(arg.(*pollDesc), seq, true, true)
509}
510
511func netpollReadDeadline(arg interface{}, seq uintptr) {
512	netpolldeadlineimpl(arg.(*pollDesc), seq, true, false)
513}
514
515func netpollWriteDeadline(arg interface{}, seq uintptr) {
516	netpolldeadlineimpl(arg.(*pollDesc), seq, false, true)
517}
518
519func (c *pollCache) alloc() *pollDesc {
520	lock(&c.lock)
521	if c.first == nil {
522		const pdSize = unsafe.Sizeof(pollDesc{})
523		n := pollBlockSize / pdSize
524		if n == 0 {
525			n = 1
526		}
527		// Must be in non-GC memory because can be referenced
528		// only from epoll/kqueue internals.
529		mem := persistentalloc(n*pdSize, 0, &memstats.other_sys)
530		for i := uintptr(0); i < n; i++ {
531			pd := (*pollDesc)(add(mem, i*pdSize))
532			pd.link = c.first
533			c.first = pd
534		}
535	}
536	pd := c.first
537	c.first = pd.link
538	unlock(&c.lock)
539	return pd
540}
541