1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Semaphore implementation exposed to Go.
6// Intended use is provide a sleep and wakeup
7// primitive that can be used in the contended case
8// of other synchronization primitives.
9// Thus it targets the same goal as Linux's futex,
10// but it has much simpler semantics.
11//
12// That is, don't think of these as semaphores.
13// Think of them as a way to implement sleep and wakeup
14// such that every sleep is paired with a single wakeup,
15// even if, due to races, the wakeup happens before the sleep.
16//
17// See Mullender and Cox, ``Semaphores in Plan 9,''
18// https://swtch.com/semaphore.pdf
19
20package runtime
21
22import (
23	"internal/cpu"
24	"runtime/internal/atomic"
25	"unsafe"
26)
27
28// Asynchronous semaphore for sync.Mutex.
29
30// A semaRoot holds a balanced tree of sudog with distinct addresses (s.elem).
31// Each of those sudog may in turn point (through s.waitlink) to a list
32// of other sudogs waiting on the same address.
33// The operations on the inner lists of sudogs with the same address
34// are all O(1). The scanning of the top-level semaRoot list is O(log n),
35// where n is the number of distinct addresses with goroutines blocked
36// on them that hash to the given semaRoot.
37// See golang.org/issue/17953 for a program that worked badly
38// before we introduced the second level of list, and test/locklinear.go
39// for a test that exercises this.
40type semaRoot struct {
41	lock  mutex
42	treap *sudog // root of balanced tree of unique waiters.
43	nwait uint32 // Number of waiters. Read w/o the lock.
44}
45
46// Prime to not correlate with any user patterns.
47const semTabSize = 251
48
49var semtable [semTabSize]struct {
50	root semaRoot
51	pad  [cpu.CacheLinePadSize - unsafe.Sizeof(semaRoot{})]byte
52}
53
54//go:linkname sync_runtime_Semacquire sync.runtime_Semacquire
55func sync_runtime_Semacquire(addr *uint32) {
56	semacquire1(addr, false, semaBlockProfile, 0)
57}
58
59//go:linkname poll_runtime_Semacquire internal..z2fpoll.runtime_Semacquire
60func poll_runtime_Semacquire(addr *uint32) {
61	semacquire1(addr, false, semaBlockProfile, 0)
62}
63
64//go:linkname sync_runtime_Semrelease sync.runtime_Semrelease
65func sync_runtime_Semrelease(addr *uint32, handoff bool, skipframes int) {
66	semrelease1(addr, handoff, skipframes)
67}
68
69//go:linkname sync_runtime_SemacquireMutex sync.runtime_SemacquireMutex
70func sync_runtime_SemacquireMutex(addr *uint32, lifo bool, skipframes int) {
71	semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes)
72}
73
74//go:linkname poll_runtime_Semrelease internal..z2fpoll.runtime_Semrelease
75func poll_runtime_Semrelease(addr *uint32) {
76	semrelease(addr)
77}
78
79func readyWithTime(s *sudog, traceskip int) {
80	if s.releasetime != 0 {
81		s.releasetime = cputicks()
82	}
83	goready(s.g, traceskip)
84}
85
86type semaProfileFlags int
87
88const (
89	semaBlockProfile semaProfileFlags = 1 << iota
90	semaMutexProfile
91)
92
93// Called from runtime.
94func semacquire(addr *uint32) {
95	semacquire1(addr, false, 0, 0)
96}
97
98func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes int) {
99	gp := getg()
100	if gp != gp.m.curg {
101		throw("semacquire not on the G stack")
102	}
103
104	// Easy case.
105	if cansemacquire(addr) {
106		return
107	}
108
109	// Harder case:
110	//	increment waiter count
111	//	try cansemacquire one more time, return if succeeded
112	//	enqueue itself as a waiter
113	//	sleep
114	//	(waiter descriptor is dequeued by signaler)
115	s := acquireSudog()
116	root := semroot(addr)
117	t0 := int64(0)
118	s.releasetime = 0
119	s.acquiretime = 0
120	s.ticket = 0
121	if profile&semaBlockProfile != 0 && blockprofilerate > 0 {
122		t0 = cputicks()
123		s.releasetime = -1
124	}
125	if profile&semaMutexProfile != 0 && mutexprofilerate > 0 {
126		if t0 == 0 {
127			t0 = cputicks()
128		}
129		s.acquiretime = t0
130	}
131	for {
132		lock(&root.lock)
133		// Add ourselves to nwait to disable "easy case" in semrelease.
134		atomic.Xadd(&root.nwait, 1)
135		// Check cansemacquire to avoid missed wakeup.
136		if cansemacquire(addr) {
137			atomic.Xadd(&root.nwait, -1)
138			unlock(&root.lock)
139			break
140		}
141		// Any semrelease after the cansemacquire knows we're waiting
142		// (we set nwait above), so go to sleep.
143		root.queue(addr, s, lifo)
144		goparkunlock(&root.lock, waitReasonSemacquire, traceEvGoBlockSync, 4+skipframes)
145		if s.ticket != 0 || cansemacquire(addr) {
146			break
147		}
148	}
149	if s.releasetime > 0 {
150		blockevent(s.releasetime-t0, 3+skipframes)
151	}
152	releaseSudog(s)
153}
154
155func semrelease(addr *uint32) {
156	semrelease1(addr, false, 0)
157}
158
159func semrelease1(addr *uint32, handoff bool, skipframes int) {
160	root := semroot(addr)
161	atomic.Xadd(addr, 1)
162
163	// Easy case: no waiters?
164	// This check must happen after the xadd, to avoid a missed wakeup
165	// (see loop in semacquire).
166	if atomic.Load(&root.nwait) == 0 {
167		return
168	}
169
170	// Harder case: search for a waiter and wake it.
171	lock(&root.lock)
172	if atomic.Load(&root.nwait) == 0 {
173		// The count is already consumed by another goroutine,
174		// so no need to wake up another goroutine.
175		unlock(&root.lock)
176		return
177	}
178	s, t0 := root.dequeue(addr)
179	if s != nil {
180		atomic.Xadd(&root.nwait, -1)
181	}
182	unlock(&root.lock)
183	if s != nil { // May be slow or even yield, so unlock first
184		acquiretime := s.acquiretime
185		if acquiretime != 0 {
186			mutexevent(t0-acquiretime, 3+skipframes)
187		}
188		if s.ticket != 0 {
189			throw("corrupted semaphore ticket")
190		}
191		if handoff && cansemacquire(addr) {
192			s.ticket = 1
193		}
194		readyWithTime(s, 5+skipframes)
195		if s.ticket == 1 && getg().m.locks == 0 {
196			// Direct G handoff
197			// readyWithTime has added the waiter G as runnext in the
198			// current P; we now call the scheduler so that we start running
199			// the waiter G immediately.
200			// Note that waiter inherits our time slice: this is desirable
201			// to avoid having a highly contended semaphore hog the P
202			// indefinitely. goyield is like Gosched, but it emits a
203			// "preempted" trace event instead and, more importantly, puts
204			// the current G on the local runq instead of the global one.
205			// We only do this in the starving regime (handoff=true), as in
206			// the non-starving case it is possible for a different waiter
207			// to acquire the semaphore while we are yielding/scheduling,
208			// and this would be wasteful. We wait instead to enter starving
209			// regime, and then we start to do direct handoffs of ticket and
210			// P.
211			// See issue 33747 for discussion.
212			goyield()
213		}
214	}
215}
216
217func semroot(addr *uint32) *semaRoot {
218	return &semtable[(uintptr(unsafe.Pointer(addr))>>3)%semTabSize].root
219}
220
221func cansemacquire(addr *uint32) bool {
222	for {
223		v := atomic.Load(addr)
224		if v == 0 {
225			return false
226		}
227		if atomic.Cas(addr, v, v-1) {
228			return true
229		}
230	}
231}
232
233// queue adds s to the blocked goroutines in semaRoot.
234func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) {
235	s.g = getg()
236	s.elem = unsafe.Pointer(addr)
237	s.next = nil
238	s.prev = nil
239
240	var last *sudog
241	pt := &root.treap
242	for t := *pt; t != nil; t = *pt {
243		if t.elem == unsafe.Pointer(addr) {
244			// Already have addr in list.
245			if lifo {
246				// Substitute s in t's place in treap.
247				*pt = s
248				s.ticket = t.ticket
249				s.acquiretime = t.acquiretime
250				s.parent = t.parent
251				s.prev = t.prev
252				s.next = t.next
253				if s.prev != nil {
254					s.prev.parent = s
255				}
256				if s.next != nil {
257					s.next.parent = s
258				}
259				// Add t first in s's wait list.
260				s.waitlink = t
261				s.waittail = t.waittail
262				if s.waittail == nil {
263					s.waittail = t
264				}
265				t.parent = nil
266				t.prev = nil
267				t.next = nil
268				t.waittail = nil
269			} else {
270				// Add s to end of t's wait list.
271				if t.waittail == nil {
272					t.waitlink = s
273				} else {
274					t.waittail.waitlink = s
275				}
276				t.waittail = s
277				s.waitlink = nil
278			}
279			return
280		}
281		last = t
282		if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) {
283			pt = &t.prev
284		} else {
285			pt = &t.next
286		}
287	}
288
289	// Add s as new leaf in tree of unique addrs.
290	// The balanced tree is a treap using ticket as the random heap priority.
291	// That is, it is a binary tree ordered according to the elem addresses,
292	// but then among the space of possible binary trees respecting those
293	// addresses, it is kept balanced on average by maintaining a heap ordering
294	// on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket.
295	// https://en.wikipedia.org/wiki/Treap
296	// https://faculty.washington.edu/aragon/pubs/rst89.pdf
297	//
298	// s.ticket compared with zero in couple of places, therefore set lowest bit.
299	// It will not affect treap's quality noticeably.
300	s.ticket = fastrand() | 1
301	s.parent = last
302	*pt = s
303
304	// Rotate up into tree according to ticket (priority).
305	for s.parent != nil && s.parent.ticket > s.ticket {
306		if s.parent.prev == s {
307			root.rotateRight(s.parent)
308		} else {
309			if s.parent.next != s {
310				panic("semaRoot queue")
311			}
312			root.rotateLeft(s.parent)
313		}
314	}
315}
316
317// dequeue searches for and finds the first goroutine
318// in semaRoot blocked on addr.
319// If the sudog was being profiled, dequeue returns the time
320// at which it was woken up as now. Otherwise now is 0.
321func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now int64) {
322	ps := &root.treap
323	s := *ps
324	for ; s != nil; s = *ps {
325		if s.elem == unsafe.Pointer(addr) {
326			goto Found
327		}
328		if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) {
329			ps = &s.prev
330		} else {
331			ps = &s.next
332		}
333	}
334	return nil, 0
335
336Found:
337	now = int64(0)
338	if s.acquiretime != 0 {
339		now = cputicks()
340	}
341	if t := s.waitlink; t != nil {
342		// Substitute t, also waiting on addr, for s in root tree of unique addrs.
343		*ps = t
344		t.ticket = s.ticket
345		t.parent = s.parent
346		t.prev = s.prev
347		if t.prev != nil {
348			t.prev.parent = t
349		}
350		t.next = s.next
351		if t.next != nil {
352			t.next.parent = t
353		}
354		if t.waitlink != nil {
355			t.waittail = s.waittail
356		} else {
357			t.waittail = nil
358		}
359		t.acquiretime = now
360		s.waitlink = nil
361		s.waittail = nil
362	} else {
363		// Rotate s down to be leaf of tree for removal, respecting priorities.
364		for s.next != nil || s.prev != nil {
365			if s.next == nil || s.prev != nil && s.prev.ticket < s.next.ticket {
366				root.rotateRight(s)
367			} else {
368				root.rotateLeft(s)
369			}
370		}
371		// Remove s, now a leaf.
372		if s.parent != nil {
373			if s.parent.prev == s {
374				s.parent.prev = nil
375			} else {
376				s.parent.next = nil
377			}
378		} else {
379			root.treap = nil
380		}
381	}
382	s.parent = nil
383	s.elem = nil
384	s.next = nil
385	s.prev = nil
386	s.ticket = 0
387	return s, now
388}
389
390// rotateLeft rotates the tree rooted at node x.
391// turning (x a (y b c)) into (y (x a b) c).
392func (root *semaRoot) rotateLeft(x *sudog) {
393	// p -> (x a (y b c))
394	p := x.parent
395	y := x.next
396	b := y.prev
397
398	y.prev = x
399	x.parent = y
400	x.next = b
401	if b != nil {
402		b.parent = x
403	}
404
405	y.parent = p
406	if p == nil {
407		root.treap = y
408	} else if p.prev == x {
409		p.prev = y
410	} else {
411		if p.next != x {
412			throw("semaRoot rotateLeft")
413		}
414		p.next = y
415	}
416}
417
418// rotateRight rotates the tree rooted at node y.
419// turning (y (x a b) c) into (x a (y b c)).
420func (root *semaRoot) rotateRight(y *sudog) {
421	// p -> (y (x a b) c)
422	p := y.parent
423	x := y.prev
424	b := x.next
425
426	x.next = y
427	y.parent = x
428	y.prev = b
429	if b != nil {
430		b.parent = y
431	}
432
433	x.parent = p
434	if p == nil {
435		root.treap = x
436	} else if p.prev == y {
437		p.prev = x
438	} else {
439		if p.next != y {
440			throw("semaRoot rotateRight")
441		}
442		p.next = x
443	}
444}
445
446// notifyList is a ticket-based notification list used to implement sync.Cond.
447//
448// It must be kept in sync with the sync package.
449type notifyList struct {
450	// wait is the ticket number of the next waiter. It is atomically
451	// incremented outside the lock.
452	wait uint32
453
454	// notify is the ticket number of the next waiter to be notified. It can
455	// be read outside the lock, but is only written to with lock held.
456	//
457	// Both wait & notify can wrap around, and such cases will be correctly
458	// handled as long as their "unwrapped" difference is bounded by 2^31.
459	// For this not to be the case, we'd need to have 2^31+ goroutines
460	// blocked on the same condvar, which is currently not possible.
461	notify uint32
462
463	// List of parked waiters.
464	lock mutex
465	head *sudog
466	tail *sudog
467}
468
469// less checks if a < b, considering a & b running counts that may overflow the
470// 32-bit range, and that their "unwrapped" difference is always less than 2^31.
471func less(a, b uint32) bool {
472	return int32(a-b) < 0
473}
474
475// notifyListAdd adds the caller to a notify list such that it can receive
476// notifications. The caller must eventually call notifyListWait to wait for
477// such a notification, passing the returned ticket number.
478//go:linkname notifyListAdd sync.runtime_notifyListAdd
479func notifyListAdd(l *notifyList) uint32 {
480	// This may be called concurrently, for example, when called from
481	// sync.Cond.Wait while holding a RWMutex in read mode.
482	return atomic.Xadd(&l.wait, 1) - 1
483}
484
485// notifyListWait waits for a notification. If one has been sent since
486// notifyListAdd was called, it returns immediately. Otherwise, it blocks.
487//go:linkname notifyListWait sync.runtime_notifyListWait
488func notifyListWait(l *notifyList, t uint32) {
489	lock(&l.lock)
490
491	// Return right away if this ticket has already been notified.
492	if less(t, l.notify) {
493		unlock(&l.lock)
494		return
495	}
496
497	// Enqueue itself.
498	s := acquireSudog()
499	s.g = getg()
500	s.ticket = t
501	s.releasetime = 0
502	t0 := int64(0)
503	if blockprofilerate > 0 {
504		t0 = cputicks()
505		s.releasetime = -1
506	}
507	if l.tail == nil {
508		l.head = s
509	} else {
510		l.tail.next = s
511	}
512	l.tail = s
513	goparkunlock(&l.lock, waitReasonSyncCondWait, traceEvGoBlockCond, 3)
514	if t0 != 0 {
515		blockevent(s.releasetime-t0, 2)
516	}
517	releaseSudog(s)
518}
519
520// notifyListNotifyAll notifies all entries in the list.
521//go:linkname notifyListNotifyAll sync.runtime_notifyListNotifyAll
522func notifyListNotifyAll(l *notifyList) {
523	// Fast-path: if there are no new waiters since the last notification
524	// we don't need to acquire the lock.
525	if atomic.Load(&l.wait) == atomic.Load(&l.notify) {
526		return
527	}
528
529	// Pull the list out into a local variable, waiters will be readied
530	// outside the lock.
531	lock(&l.lock)
532	s := l.head
533	l.head = nil
534	l.tail = nil
535
536	// Update the next ticket to be notified. We can set it to the current
537	// value of wait because any previous waiters are already in the list
538	// or will notice that they have already been notified when trying to
539	// add themselves to the list.
540	atomic.Store(&l.notify, atomic.Load(&l.wait))
541	unlock(&l.lock)
542
543	// Go through the local list and ready all waiters.
544	for s != nil {
545		next := s.next
546		s.next = nil
547		readyWithTime(s, 4)
548		s = next
549	}
550}
551
552// notifyListNotifyOne notifies one entry in the list.
553//go:linkname notifyListNotifyOne sync.runtime_notifyListNotifyOne
554func notifyListNotifyOne(l *notifyList) {
555	// Fast-path: if there are no new waiters since the last notification
556	// we don't need to acquire the lock at all.
557	if atomic.Load(&l.wait) == atomic.Load(&l.notify) {
558		return
559	}
560
561	lock(&l.lock)
562
563	// Re-check under the lock if we need to do anything.
564	t := l.notify
565	if t == atomic.Load(&l.wait) {
566		unlock(&l.lock)
567		return
568	}
569
570	// Update the next notify ticket number.
571	atomic.Store(&l.notify, t+1)
572
573	// Try to find the g that needs to be notified.
574	// If it hasn't made it to the list yet we won't find it,
575	// but it won't park itself once it sees the new notify number.
576	//
577	// This scan looks linear but essentially always stops quickly.
578	// Because g's queue separately from taking numbers,
579	// there may be minor reorderings in the list, but we
580	// expect the g we're looking for to be near the front.
581	// The g has others in front of it on the list only to the
582	// extent that it lost the race, so the iteration will not
583	// be too long. This applies even when the g is missing:
584	// it hasn't yet gotten to sleep and has lost the race to
585	// the (few) other g's that we find on the list.
586	for p, s := (*sudog)(nil), l.head; s != nil; p, s = s, s.next {
587		if s.ticket == t {
588			n := s.next
589			if p != nil {
590				p.next = n
591			} else {
592				l.head = n
593			}
594			if n == nil {
595				l.tail = p
596			}
597			unlock(&l.lock)
598			s.next = nil
599			readyWithTime(s, 4)
600			return
601		}
602	}
603	unlock(&l.lock)
604}
605
606//go:linkname notifyListCheck sync.runtime_notifyListCheck
607func notifyListCheck(sz uintptr) {
608	if sz != unsafe.Sizeof(notifyList{}) {
609		print("runtime: bad notifyList size - sync=", sz, " runtime=", unsafe.Sizeof(notifyList{}), "\n")
610		throw("bad notifyList size")
611	}
612}
613
614//go:linkname sync_nanotime sync.runtime_nanotime
615func sync_nanotime() int64 {
616	return nanotime()
617}
618