1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Garbage collector (GC).
6//
// The GC runs concurrently with mutator threads, is type accurate (aka precise), and allows multiple
// GC threads to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
9// non-generational and non-compacting. Allocation is done using size segregated per P allocation
10// areas to minimize fragmentation while eliminating locks in the common case.
11//
12// The algorithm decomposes into several steps.
13// This is a high level description of the algorithm being used. For an overview of GC a good
14// place to start is Richard Jones' gchandbook.org.
15//
16// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
17// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
18// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
19// 966-975.
20// For journal quality proofs that these steps are complete, correct, and terminate see
21// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
22// Concurrency and Computation: Practice and Experience 15(3-5), 2003.
23//
24// 1. GC performs sweep termination.
25//
26//    a. Stop the world. This causes all Ps to reach a GC safe-point.
27//
28//    b. Sweep any unswept spans. There will only be unswept spans if
29//    this GC cycle was forced before the expected time.
30//
31// 2. GC performs the mark phase.
32//
33//    a. Prepare for the mark phase by setting gcphase to _GCmark
34//    (from _GCoff), enabling the write barrier, enabling mutator
35//    assists, and enqueueing root mark jobs. No objects may be
36//    scanned until all Ps have enabled the write barrier, which is
37//    accomplished using STW.
38//
39//    b. Start the world. From this point, GC work is done by mark
40//    workers started by the scheduler and by assists performed as
41//    part of allocation. The write barrier shades both the
42//    overwritten pointer and the new pointer value for any pointer
43//    writes (see mbarrier.go for details). Newly allocated objects
44//    are immediately marked black.
45//
46//    c. GC performs root marking jobs. This includes scanning all
47//    stacks, shading all globals, and shading any heap pointers in
48//    off-heap runtime data structures. Scanning a stack stops a
49//    goroutine, shades any pointers found on its stack, and then
50//    resumes the goroutine.
51//
52//    d. GC drains the work queue of grey objects, scanning each grey
53//    object to black and shading all pointers found in the object
54//    (which in turn may add those pointers to the work queue).
55//
56//    e. Because GC work is spread across local caches, GC uses a
57//    distributed termination algorithm to detect when there are no
58//    more root marking jobs or grey objects (see gcMarkDone). At this
59//    point, GC transitions to mark termination.
60//
61// 3. GC performs mark termination.
62//
63//    a. Stop the world.
64//
65//    b. Set gcphase to _GCmarktermination, and disable workers and
66//    assists.
67//
68//    c. Perform housekeeping like flushing mcaches.
69//
70// 4. GC performs the sweep phase.
71//
72//    a. Prepare for the sweep phase by setting gcphase to _GCoff,
73//    setting up sweep state and disabling the write barrier.
74//
75//    b. Start the world. From this point on, newly allocated objects
76//    are white, and allocating sweeps spans before use if necessary.
77//
78//    c. GC does concurrent sweeping in the background and in response
79//    to allocation. See description below.
80//
81// 5. When sufficient allocation has taken place, replay the sequence
82// starting with 1 above. See discussion of GC rate below.
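//
// As a reading aid only, one full cycle can be compressed into the
// following sketch in terms of the functions defined below (the real
// control flow is split across gcStart, gcMarkDone and gcMarkTermination,
// so the details differ):
//
//	gcStart:           stop the world, finishsweep_m, setGCPhase(_GCmark), start the world
//	concurrent mark:   mark workers and assists drain grey objects
//	gcMarkDone:        ragged barrier, stop the world, then gcMarkTermination
//	gcMarkTermination: gcMark, setGCPhase(_GCoff), gcSweep, start the world
//	concurrent sweep:  bgsweep and allocation-time sweeping until the next trigger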
83
84// Concurrent sweep.
85//
86// The sweep phase proceeds concurrently with normal program execution.
87// The heap is swept span-by-span both lazily (when a goroutine needs another span)
88// and concurrently in a background goroutine (this helps programs that are not CPU bound).
89// At the end of STW mark termination all spans are marked as "needs sweeping".
90//
91// The background sweeper goroutine simply sweeps spans one-by-one.
92//
93// To avoid requesting more OS memory while there are unswept spans, when a
94// goroutine needs another span, it first attempts to reclaim that much memory
95// by sweeping. When a goroutine needs to allocate a new small-object span, it
96// sweeps small-object spans for the same object size until it frees at least
// one object. When a goroutine needs to allocate a large-object span from the heap,
// it sweeps spans until it frees at least that many pages into the heap. There is
99// one case where this may not suffice: if a goroutine sweeps and frees two
100// nonadjacent one-page spans to the heap, it will allocate a new two-page
101// span, but there can still be other one-page unswept spans which could be
102// combined into a two-page span.
103//
104// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
105// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
106// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
107// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
108// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
109// The finalizer goroutine is kicked off only when all spans are swept.
110// When the next GC starts, it sweeps all not-yet-swept spans (if any).
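//
// As a rough sketch only (the real allocation paths go through mcentral
// and mheap and deduct sweep credit, so the details differ), the
// "reclaim before growing" idea above, expressed in terms of sweepone, is:
//
//	// Want npages more pages: try to sweep at least that much first.
//	var reclaimed uintptr
//	for reclaimed < npages {
//		n := sweepone() // pages returned to the heap, or ^uintptr(0) if nothing left
//		if n == ^uintptr(0) {
//			break
//		}
//		reclaimed += n
//	}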
111
112// GC rate.
113// Next GC is after we've allocated an extra amount of memory proportional to
114// the amount already in use. The proportion is controlled by GOGC environment variable
115// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
116// (this mark is tracked in gcController.heapGoal variable). This keeps the GC cost in
117// linear proportion to the allocation cost. Adjusting GOGC just changes the linear constant
118// (and also the amount of extra memory used).
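//
// Ignoring the finer points of pacing (see gcController), the goal is
// roughly:
//
//	heapGoal = heapMarked + heapMarked*GOGC/100
//
// so GOGC=100 lets the heap double over the live data before the next
// cycle, GOGC=200 lets it triple, and so on.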
119
120// Oblets
121//
122// In order to prevent long pauses while scanning large objects and to
123// improve parallelism, the garbage collector breaks up scan jobs for
124// objects larger than maxObletBytes into "oblets" of at most
125// maxObletBytes. When scanning encounters the beginning of a large
126// object, it scans only the first oblet and enqueues the remaining
127// oblets as new scan jobs.
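//
// Conceptually (the real splitting lives in scanobject in mgcmark.go;
// "enqueue" here stands in for putting a pointer into the gcWork):
//
//	if size > maxObletBytes {
//		for oblet := base + maxObletBytes; oblet < base+size; oblet += maxObletBytes {
//			enqueue(oblet) // each queued job scans at most maxObletBytes
//		}
//		size = maxObletBytes // scan only the first oblet now
//	}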
128
129package runtime
130
131import (
132	"internal/cpu"
133	"runtime/internal/atomic"
134	"unsafe"
135)
136
137const (
138	_DebugGC         = 0
139	_ConcurrentSweep = true
140	_FinBlockSize    = 4 * 1024
141
142	// debugScanConservative enables debug logging for stack
143	// frames that are scanned conservatively.
144	debugScanConservative = false
145
146	// sweepMinHeapDistance is a lower bound on the heap distance
147	// (in bytes) reserved for concurrent sweeping between GC
148	// cycles.
149	sweepMinHeapDistance = 1024 * 1024
150)
151
152func gcinit() {
153	if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
154		throw("size of Workbuf is suboptimal")
155	}
156	// No sweep on the first cycle.
157	mheap_.sweepDrained = 1
158
159	// Initialize GC pacer state.
160	// Use the environment variable GOGC for the initial gcPercent value.
161	gcController.init(readGOGC())
162
163	work.startSema = 1
164	work.markDoneSema = 1
165	lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters)
166	lockInit(&work.assistQueue.lock, lockRankAssistQueue)
167	lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
168}
169
170// Temporary in order to enable register ABI work.
// TODO(register args): convert back to a local chan in gcenable, passed to "go" stmts.
172var gcenable_setup chan int
173
174// gcenable is called after the bulk of the runtime initialization,
175// just before we're about to start letting user code run.
176// It kicks off the background sweeper goroutine, the background
177// scavenger goroutine, and enables GC.
178func gcenable() {
179	// Kick off sweeping and scavenging.
180	gcenable_setup = make(chan int, 2)
181	expectSystemGoroutine()
182	go bgsweep()
183	expectSystemGoroutine()
184	go bgscavenge()
185	<-gcenable_setup
186	<-gcenable_setup
187	gcenable_setup = nil
188	memstats.enablegc = true // now that runtime is initialized, GC is okay
189}
190
191// Garbage collector phase.
// Indicates to the write barrier and synchronization code which task to perform.
193var gcphase uint32
194
195// The compiler knows about this variable.
196// If you change it, you must change gofrontend/wb.cc, too.
197// If you change the first four bytes, you must also change the write
198// barrier insertion code.
199var writeBarrier struct {
200	enabled bool    // compiler emits a check of this before calling write barrier
201	pad     [3]byte // compiler uses 32-bit load for "enabled" field
202	needed  bool    // whether we need a write barrier for current GC phase
203	cgo     bool    // whether we need a write barrier for a cgo check
204	alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
205}
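
// Conceptually, the check the compiler inserts around a pointer store
// *dst = src looks like the following, where barrierStore is a
// placeholder for the real write-barrier routine (see mbarrier.go for
// what it actually does and how the store is performed):
//
//	if writeBarrier.enabled {
//		barrierStore(dst, src) // shades the old *dst and the new src
//	} else {
//		*dst = src
//	}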
206
207// gcBlackenEnabled is 1 if mutator assists and background mark
208// workers are allowed to blacken objects. This must only be set when
209// gcphase == _GCmark.
210var gcBlackenEnabled uint32
211
212const (
213	_GCoff             = iota // GC not running; sweeping in background, write barrier disabled
214	_GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
215	_GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
216)
217
218//go:nosplit
219func setGCPhase(x uint32) {
220	atomic.Store(&gcphase, x)
221	writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
222	writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
223}
224
225// gcMarkWorkerMode represents the mode that a concurrent mark worker
226// should operate in.
227//
228// Concurrent marking happens through four different mechanisms. One
229// is mutator assists, which happen in response to allocations and are
230// not scheduled. The other three are variations in the per-P mark
231// workers and are distinguished by gcMarkWorkerMode.
232type gcMarkWorkerMode int
233
234const (
235	// gcMarkWorkerNotWorker indicates that the next scheduled G is not
236	// starting work and the mode should be ignored.
237	gcMarkWorkerNotWorker gcMarkWorkerMode = iota
238
239	// gcMarkWorkerDedicatedMode indicates that the P of a mark
240	// worker is dedicated to running that mark worker. The mark
241	// worker should run without preemption.
242	gcMarkWorkerDedicatedMode
243
244	// gcMarkWorkerFractionalMode indicates that a P is currently
245	// running the "fractional" mark worker. The fractional worker
246	// is necessary when GOMAXPROCS*gcBackgroundUtilization is not
247	// an integer and using only dedicated workers would result in
248	// utilization too far from the target of gcBackgroundUtilization.
249	// The fractional worker should run until it is preempted and
250	// will be scheduled to pick up the fractional part of
251	// GOMAXPROCS*gcBackgroundUtilization.
252	gcMarkWorkerFractionalMode
253
254	// gcMarkWorkerIdleMode indicates that a P is running the mark
255	// worker because it has nothing else to do. The idle worker
256	// should run until it is preempted and account its time
257	// against gcController.idleMarkTime.
258	gcMarkWorkerIdleMode
259)
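
// For example, with gcBackgroundUtilization = 0.25 (its value in the
// pacer) and GOMAXPROCS = 5, the target is 5*0.25 = 1.25 Ps of mark work:
// one P runs a dedicated worker, and the fractional worker picks up the
// remaining 0.25 of a P, self-preempting via pollFractionalWorkerExit.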
260
// gcMarkWorkerModeStrings are the string labels of gcMarkWorkerModes
262// to use in execution traces.
263var gcMarkWorkerModeStrings = [...]string{
264	"Not worker",
265	"GC (dedicated)",
266	"GC (fractional)",
267	"GC (idle)",
268}
269
270// pollFractionalWorkerExit reports whether a fractional mark worker
271// should self-preempt. It assumes it is called from the fractional
272// worker.
273func pollFractionalWorkerExit() bool {
274	// This should be kept in sync with the fractional worker
275	// scheduler logic in findRunnableGCWorker.
276	now := nanotime()
277	delta := now - gcController.markStartTime
278	if delta <= 0 {
279		return true
280	}
281	p := getg().m.p.ptr()
282	selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime)
283	// Add some slack to the utilization goal so that the
284	// fractional worker isn't behind again the instant it exits.
285	return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
286}
287
288var work struct {
	full  lfstack          // lock-free list of full workbufs
	empty lfstack          // lock-free list of empty workbufs
291	pad0  cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait
292
293	wbufSpans struct {
294		lock mutex
295		// free is a list of spans dedicated to workbufs, but
296		// that don't currently contain any workbufs.
297		free mSpanList
298		// busy is a list of all spans containing workbufs on
299		// one of the workbuf lists.
300		busy mSpanList
301	}
302
303	// Restore 64-bit alignment on 32-bit.
304	_ uint32
305
306	// bytesMarked is the number of bytes marked this cycle. This
307	// includes bytes blackened in scanned objects, noscan objects
308	// that go straight to black, and permagrey objects scanned by
309	// markroot during the concurrent scan phase. This is updated
310	// atomically during the cycle. Updates may be batched
311	// arbitrarily, since the value is only read at the end of the
312	// cycle.
313	//
314	// Because of benign races during marking, this number may not
315	// be the exact number of marked bytes, but it should be very
316	// close.
317	//
318	// Put this field here because it needs 64-bit atomic access
319	// (and thus 8-byte alignment even on 32-bit architectures).
320	bytesMarked uint64
321
322	markrootNext uint32 // next markroot job
323	markrootJobs uint32 // number of markroot jobs
324
325	nproc  uint32
326	tstart int64
327	nwait  uint32
328
329	// Number of roots of various root types. Set by gcMarkRootPrepare.
330	nDataRoots, nSpanRoots, nStackRoots int
331
332	// Base indexes of each root type. Set by gcMarkRootPrepare.
333	baseData, baseSpans, baseStacks, baseEnd uint32
334
335	// Each type of GC state transition is protected by a lock.
336	// Since multiple threads can simultaneously detect the state
337	// transition condition, any thread that detects a transition
338	// condition must acquire the appropriate transition lock,
339	// re-check the transition condition and return if it no
340	// longer holds or perform the transition if it does.
341	// Likewise, any transition must invalidate the transition
342	// condition before releasing the lock. This ensures that each
343	// transition is performed by exactly one thread and threads
344	// that need the transition to happen block until it has
345	// happened.
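	//
	// In code, each transition therefore follows the pattern used by
	// gcStart and gcMarkDone below (transitionLock and
	// transitionCondition are generic placeholders):
	//
	//	semacquire(&transitionLock)
	//	if !transitionCondition() { // re-check under the lock
	//		semrelease(&transitionLock)
	//		return
	//	}
	//	// ... perform the transition, invalidating the condition ...
	//	semrelease(&transitionLock)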
346	//
347	// startSema protects the transition from "off" to mark or
348	// mark termination.
349	startSema uint32
350	// markDoneSema protects transitions from mark to mark termination.
351	markDoneSema uint32
352
	// Background mark completion signaling.
	bgMarkReady note   // signal background mark worker has started
	bgMarkDone  uint32 // cas to 1 when at a background mark completion point
356
357	// mode is the concurrency mode of the current GC cycle.
358	mode gcMode
359
360	// userForced indicates the current GC cycle was forced by an
361	// explicit user call.
362	userForced bool
363
364	// totaltime is the CPU nanoseconds spent in GC since the
365	// program started if debug.gctrace > 0.
366	totaltime int64
367
368	// initialHeapLive is the value of gcController.heapLive at the
369	// beginning of this GC cycle.
370	initialHeapLive uint64
371
372	// assistQueue is a queue of assists that are blocked because
	// there was neither enough credit to steal nor enough work to
374	// do.
375	assistQueue struct {
376		lock mutex
377		q    gQueue
378	}
379
380	// sweepWaiters is a list of blocked goroutines to wake when
381	// we transition from mark termination to sweep.
382	sweepWaiters struct {
383		lock mutex
384		list gList
385	}
386
387	// cycles is the number of completed GC cycles, where a GC
388	// cycle is sweep termination, mark, mark termination, and
389	// sweep. This differs from memstats.numgc, which is
390	// incremented at mark termination.
391	cycles uint32
392
393	// Timing/utilization stats for this cycle.
394	stwprocs, maxprocs                 int32
395	tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
396
397	pauseNS    int64 // total STW time this cycle
398	pauseStart int64 // nanotime() of last STW
399
400	// debug.gctrace heap sizes for this cycle.
401	heap0, heap1, heap2, heapGoal uint64
402}
403
404// GC runs a garbage collection and blocks the caller until the
405// garbage collection is complete. It may also block the entire
406// program.
407func GC() {
408	// We consider a cycle to be: sweep termination, mark, mark
409	// termination, and sweep. This function shouldn't return
410	// until a full cycle has been completed, from beginning to
411	// end. Hence, we always want to finish up the current cycle
412	// and start a new one. That means:
413	//
414	// 1. In sweep termination, mark, or mark termination of cycle
415	// N, wait until mark termination N completes and transitions
416	// to sweep N.
417	//
418	// 2. In sweep N, help with sweep N.
419	//
420	// At this point we can begin a full cycle N+1.
421	//
422	// 3. Trigger cycle N+1 by starting sweep termination N+1.
423	//
424	// 4. Wait for mark termination N+1 to complete.
425	//
426	// 5. Help with sweep N+1 until it's done.
427	//
428	// This all has to be written to deal with the fact that the
429	// GC may move ahead on its own. For example, when we block
430	// until mark termination N, we may wake up in cycle N+2.
431
432	// Wait until the current sweep termination, mark, and mark
433	// termination complete.
434	n := atomic.Load(&work.cycles)
435	gcWaitOnMark(n)
436
437	// We're now in sweep N or later. Trigger GC cycle N+1, which
438	// will first finish sweep N if necessary and then enter sweep
439	// termination N+1.
440	gcStart(gcTrigger{kind: gcTriggerCycle, n: n + 1})
441
442	// Wait for mark termination N+1 to complete.
443	gcWaitOnMark(n + 1)
444
445	// Finish sweep N+1 before returning. We do this both to
446	// complete the cycle and because runtime.GC() is often used
447	// as part of tests and benchmarks to get the system into a
448	// relatively stable and isolated state.
449	for atomic.Load(&work.cycles) == n+1 && sweepone() != ^uintptr(0) {
450		sweep.nbgsweep++
451		Gosched()
452	}
453
454	// Callers may assume that the heap profile reflects the
455	// just-completed cycle when this returns (historically this
456	// happened because this was a STW GC), but right now the
457	// profile still reflects mark termination N, not N+1.
458	//
459	// As soon as all of the sweep frees from cycle N+1 are done,
460	// we can go ahead and publish the heap profile.
461	//
462	// First, wait for sweeping to finish. (We know there are no
463	// more spans on the sweep queue, but we may be concurrently
464	// sweeping spans, so we have to wait.)
465	for atomic.Load(&work.cycles) == n+1 && !isSweepDone() {
466		Gosched()
467	}
468
469	// Now we're really done with sweeping, so we can publish the
470	// stable heap profile. Only do this if we haven't already hit
471	// another mark termination.
472	mp := acquirem()
473	cycle := atomic.Load(&work.cycles)
474	if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
475		mProf_PostSweep()
476	}
477	releasem(mp)
478}
479
480// gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
481// already completed this mark phase, it returns immediately.
482func gcWaitOnMark(n uint32) {
483	for {
484		// Disable phase transitions.
485		lock(&work.sweepWaiters.lock)
486		nMarks := atomic.Load(&work.cycles)
487		if gcphase != _GCmark {
488			// We've already completed this cycle's mark.
489			nMarks++
490		}
491		if nMarks > n {
492			// We're done.
493			unlock(&work.sweepWaiters.lock)
494			return
495		}
496
497		// Wait until sweep termination, mark, and mark
498		// termination of cycle N complete.
499		work.sweepWaiters.list.push(getg())
500		goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1)
501	}
502}
503
504// gcMode indicates how concurrent a GC cycle should be.
505type gcMode int
506
507const (
508	gcBackgroundMode gcMode = iota // concurrent GC and sweep
509	gcForceMode                    // stop-the-world GC now, concurrent sweep
510	gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
511)
512
513// A gcTrigger is a predicate for starting a GC cycle. Specifically,
514// it is an exit condition for the _GCoff phase.
515type gcTrigger struct {
516	kind gcTriggerKind
517	now  int64  // gcTriggerTime: current time
518	n    uint32 // gcTriggerCycle: cycle number to start
519}
520
521type gcTriggerKind int
522
523const (
524	// gcTriggerHeap indicates that a cycle should be started when
525	// the heap size reaches the trigger heap size computed by the
526	// controller.
527	gcTriggerHeap gcTriggerKind = iota
528
529	// gcTriggerTime indicates that a cycle should be started when
530	// it's been more than forcegcperiod nanoseconds since the
531	// previous GC cycle.
532	gcTriggerTime
533
534	// gcTriggerCycle indicates that a cycle should be started if
535	// we have not yet started cycle number gcTrigger.n (relative
536	// to work.cycles).
537	gcTriggerCycle
538)
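
// For example, mallocgc tests gcTrigger{kind: gcTriggerHeap} after
// allocating, the forced-GC goroutine in proc.go uses
// gcTrigger{kind: gcTriggerTime, now: nanotime()}, and runtime.GC (below)
// uses gcTrigger{kind: gcTriggerCycle, n: n + 1}.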
539
540// test reports whether the trigger condition is satisfied, meaning
541// that the exit condition for the _GCoff phase has been met. The exit
542// condition should be tested when allocating.
543func (t gcTrigger) test() bool {
544	if !memstats.enablegc || panicking != 0 || gcphase != _GCoff {
545		return false
546	}
547	switch t.kind {
548	case gcTriggerHeap:
549		// Non-atomic access to gcController.heapLive for performance. If
550		// we are going to trigger on this, this thread just
551		// atomically wrote gcController.heapLive anyway and we'll see our
552		// own write.
553		return gcController.heapLive >= gcController.trigger
554	case gcTriggerTime:
555		if gcController.gcPercent < 0 {
556			return false
557		}
558		lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
559		return lastgc != 0 && t.now-lastgc > forcegcperiod
560	case gcTriggerCycle:
561		// t.n > work.cycles, but accounting for wraparound.
562		return int32(t.n-work.cycles) > 0
563	}
564	return true
565}
566
567// gcStart starts the GC. It transitions from _GCoff to _GCmark (if
568// debug.gcstoptheworld == 0) or performs all of GC (if
569// debug.gcstoptheworld != 0).
570//
571// This may return without performing this transition in some cases,
572// such as when called on a system stack or with locks held.
573func gcStart(trigger gcTrigger) {
574	// Since this is called from malloc and malloc is called in
575	// the guts of a number of libraries that might be holding
576	// locks, don't attempt to start GC in non-preemptible or
577	// potentially unstable situations.
578	mp := acquirem()
579	if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
580		releasem(mp)
581		return
582	}
583	releasem(mp)
584	mp = nil
585
586	// Pick up the remaining unswept/not being swept spans concurrently
587	//
588	// This shouldn't happen if we're being invoked in background
589	// mode since proportional sweep should have just finished
590	// sweeping everything, but rounding errors, etc, may leave a
591	// few spans unswept. In forced mode, this is necessary since
592	// GC can be forced at any point in the sweeping cycle.
593	//
594	// We check the transition condition continuously here in case
	// this G gets delayed into the next GC cycle.
596	for trigger.test() && sweepone() != ^uintptr(0) {
597		sweep.nbgsweep++
598	}
599
600	// Perform GC initialization and the sweep termination
601	// transition.
602	semacquire(&work.startSema)
603	// Re-check transition condition under transition lock.
604	if !trigger.test() {
605		semrelease(&work.startSema)
606		return
607	}
608
609	// For stats, check if this GC was forced by the user.
610	work.userForced = trigger.kind == gcTriggerCycle
611
612	// In gcstoptheworld debug mode, upgrade the mode accordingly.
613	// We do this after re-checking the transition condition so
614	// that multiple goroutines that detect the heap trigger don't
615	// start multiple STW GCs.
616	mode := gcBackgroundMode
617	if debug.gcstoptheworld == 1 {
618		mode = gcForceMode
619	} else if debug.gcstoptheworld == 2 {
620		mode = gcForceBlockMode
621	}
622
623	// Ok, we're doing it! Stop everybody else
624	semacquire(&gcsema)
625	semacquire(&worldsema)
626
627	if trace.enabled {
628		traceGCStart()
629	}
630
631	// Check that all Ps have finished deferred mcache flushes.
632	for _, p := range allp {
633		if fg := atomic.Load(&p.mcache.flushGen); fg != mheap_.sweepgen {
634			println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen)
635			throw("p mcache not flushed")
636		}
637	}
638
639	gcBgMarkStartWorkers()
640
641	systemstack(gcResetMarkState)
642
643	work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs
644	if work.stwprocs > ncpu {
645		// This is used to compute CPU time of the STW phases,
646		// so it can't be more than ncpu, even if GOMAXPROCS is.
647		work.stwprocs = ncpu
648	}
649	work.heap0 = atomic.Load64(&gcController.heapLive)
650	work.pauseNS = 0
651	work.mode = mode
652
653	now := nanotime()
654	work.tSweepTerm = now
655	work.pauseStart = now
656	if trace.enabled {
657		traceGCSTWStart(1)
658	}
659	systemstack(stopTheWorldWithSema)
660	// Finish sweep before we start concurrent scan.
661	systemstack(func() {
662		finishsweep_m()
663	})
664
	// clearpools before we start the GC. If we wait, the memory will not be
666	// reclaimed until the next GC cycle.
667	clearpools()
668
669	work.cycles++
670
671	gcController.startCycle()
672	work.heapGoal = gcController.heapGoal
673
674	// In STW mode, disable scheduling of user Gs. This may also
675	// disable scheduling of this goroutine, so it may block as
676	// soon as we start the world again.
677	if mode != gcBackgroundMode {
678		schedEnableUser(false)
679	}
680
681	// Enter concurrent mark phase and enable
682	// write barriers.
683	//
684	// Because the world is stopped, all Ps will
685	// observe that write barriers are enabled by
686	// the time we start the world and begin
687	// scanning.
688	//
689	// Write barriers must be enabled before assists are
690	// enabled because they must be enabled before
691	// any non-leaf heap objects are marked. Since
692	// allocations are blocked until assists can
	// happen, we want to enable assists as early as
694	// possible.
695	setGCPhase(_GCmark)
696
697	gcBgMarkPrepare() // Must happen before assist enable.
698	gcMarkRootPrepare()
699
700	// Mark all active tinyalloc blocks. Since we're
701	// allocating from these, they need to be black like
702	// other allocations. The alternative is to blacken
703	// the tiny block on every allocation from it, which
704	// would slow down the tiny allocator.
705	gcMarkTinyAllocs()
706
707	// At this point all Ps have enabled the write
708	// barrier, thus maintaining the no white to
709	// black invariant. Enable mutator assists to
710	// put back-pressure on fast allocating
711	// mutators.
712	atomic.Store(&gcBlackenEnabled, 1)
713
714	// Assists and workers can start the moment we start
715	// the world.
716	gcController.markStartTime = now
717
718	// In STW mode, we could block the instant systemstack
719	// returns, so make sure we're not preemptible.
720	mp = acquirem()
721
722	// Concurrent mark.
723	systemstack(func() {
724		now = startTheWorldWithSema(trace.enabled)
725		work.pauseNS += now - work.pauseStart
726		work.tMark = now
727		memstats.gcPauseDist.record(now - work.pauseStart)
728	})
729
730	// Release the world sema before Gosched() in STW mode
731	// because we will need to reacquire it later but before
732	// this goroutine becomes runnable again, and we could
733	// self-deadlock otherwise.
734	semrelease(&worldsema)
735	releasem(mp)
736
737	// Make sure we block instead of returning to user code
738	// in STW mode.
739	if mode != gcBackgroundMode {
740		Gosched()
741	}
742
743	semrelease(&work.startSema)
744}
745
746// gcMarkDoneFlushed counts the number of P's with flushed work.
747//
748// Ideally this would be a captured local in gcMarkDone, but forEachP
749// escapes its callback closure, so it can't capture anything.
750//
751// This is protected by markDoneSema.
752var gcMarkDoneFlushed uint32
753
754// gcMarkDone transitions the GC from mark to mark termination if all
755// reachable objects have been marked (that is, there are no grey
// objects and there can be no more in the future). Otherwise, it flushes
757// all local work to the global queues where it can be discovered by
758// other workers.
759//
760// This should be called when all local mark work has been drained and
761// there are no remaining workers. Specifically, when
762//
763//   work.nwait == work.nproc && !gcMarkWorkAvailable(p)
764//
765// The calling context must be preemptible.
766//
767// Flushing local work is important because idle Ps may have local
768// work queued. This is the only way to make that work visible and
769// drive GC to completion.
770//
771// It is explicitly okay to have write barriers in this function. If
772// it does transition to mark termination, then all reachable objects
773// have been marked, so the write barrier cannot shade any more
774// objects.
775func gcMarkDone() {
776	// Ensure only one thread is running the ragged barrier at a
777	// time.
778	semacquire(&work.markDoneSema)
779
780top:
781	// Re-check transition condition under transition lock.
782	//
783	// It's critical that this checks the global work queues are
784	// empty before performing the ragged barrier. Otherwise,
785	// there could be global work that a P could take after the P
786	// has passed the ragged barrier.
787	if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
788		semrelease(&work.markDoneSema)
789		return
790	}
791
792	// forEachP needs worldsema to execute, and we'll need it to
793	// stop the world later, so acquire worldsema now.
794	semacquire(&worldsema)
795
796	// Flush all local buffers and collect flushedWork flags.
797	gcMarkDoneFlushed = 0
798	systemstack(func() {
799		gp := getg().m.curg
800		// Mark the user stack as preemptible so that it may be scanned.
801		// Otherwise, our attempt to force all P's to a safepoint could
802		// result in a deadlock as we attempt to preempt a worker that's
803		// trying to preempt us (e.g. for a stack scan).
804		casgstatus(gp, _Grunning, _Gwaiting)
805		forEachP(func(_p_ *p) {
806			// Flush the write barrier buffer, since this may add
807			// work to the gcWork.
808			wbBufFlush1(_p_)
809
810			// Flush the gcWork, since this may create global work
811			// and set the flushedWork flag.
812			//
813			// TODO(austin): Break up these workbufs to
814			// better distribute work.
815			_p_.gcw.dispose()
816			// Collect the flushedWork flag.
817			if _p_.gcw.flushedWork {
818				atomic.Xadd(&gcMarkDoneFlushed, 1)
819				_p_.gcw.flushedWork = false
820			}
821		})
822		casgstatus(gp, _Gwaiting, _Grunning)
823	})
824
825	if gcMarkDoneFlushed != 0 {
826		// More grey objects were discovered since the
827		// previous termination check, so there may be more
828		// work to do. Keep going. It's possible the
829		// transition condition became true again during the
830		// ragged barrier, so re-check it.
831		semrelease(&worldsema)
832		goto top
833	}
834
835	// There was no global work, no local work, and no Ps
836	// communicated work since we took markDoneSema. Therefore
837	// there are no grey objects and no more objects can be
838	// shaded. Transition to mark termination.
839	now := nanotime()
840	work.tMarkTerm = now
841	work.pauseStart = now
842	getg().m.preemptoff = "gcing"
843	if trace.enabled {
844		traceGCSTWStart(0)
845	}
846	systemstack(stopTheWorldWithSema)
847	// The gcphase is _GCmark, it will transition to _GCmarktermination
848	// below. The important thing is that the wb remains active until
849	// all marking is complete. This includes writes made by the GC.
850
851	// There is sometimes work left over when we enter mark termination due
852	// to write barriers performed after the completion barrier above.
853	// Detect this and resume concurrent mark. This is obviously
854	// unfortunate.
855	//
856	// See issue #27993 for details.
857	//
858	// Switch to the system stack to call wbBufFlush1, though in this case
859	// it doesn't matter because we're non-preemptible anyway.
860	restart := false
861	systemstack(func() {
862		for _, p := range allp {
863			wbBufFlush1(p)
864			if !p.gcw.empty() {
865				restart = true
866				break
867			}
868		}
869	})
870	if restart {
871		getg().m.preemptoff = ""
872		systemstack(func() {
873			now := startTheWorldWithSema(true)
874			work.pauseNS += now - work.pauseStart
875			memstats.gcPauseDist.record(now - work.pauseStart)
876		})
877		semrelease(&worldsema)
878		goto top
879	}
880
881	// Disable assists and background workers. We must do
882	// this before waking blocked assists.
883	atomic.Store(&gcBlackenEnabled, 0)
884
885	// Wake all blocked assists. These will run when we
886	// start the world again.
887	gcWakeAllAssists()
888
889	// Likewise, release the transition lock. Blocked
890	// workers and assists will run when we start the
891	// world again.
892	semrelease(&work.markDoneSema)
893
894	// In STW mode, re-enable user goroutines. These will be
895	// queued to run after we start the world.
896	schedEnableUser(true)
897
898	// endCycle depends on all gcWork cache stats being flushed.
	// The termination algorithm above ensured that they were, up to
	// any allocations made since the ragged barrier.
901	nextTriggerRatio := gcController.endCycle(work.userForced)
902
903	// Perform mark termination. This will restart the world.
904	gcMarkTermination(nextTriggerRatio)
905}
906
907// World must be stopped and mark assists and background workers must be
908// disabled.
909func gcMarkTermination(nextTriggerRatio float64) {
910	// Start marktermination (write barrier remains enabled for now).
911	setGCPhase(_GCmarktermination)
912
913	work.heap1 = gcController.heapLive
914	startTime := nanotime()
915
916	mp := acquirem()
917	mp.preemptoff = "gcing"
918	_g_ := getg()
919	_g_.m.traceback = 2
920	gp := _g_.m.curg
921	casgstatus(gp, _Grunning, _Gwaiting)
922	gp.waitreason = waitReasonGarbageCollection
923
924	// Run gc on the g0 stack. We do this so that the g stack
925	// we're currently running on will no longer change. Cuts
926	// the root set down a bit (g0 stacks are not scanned, and
927	// we don't need to scan gc's internal state).  We also
928	// need to switch to g0 so we can shrink the stack.
929	systemstack(func() {
930		gcMark(startTime)
931		// Must return immediately.
932		// The outer function's stack may have moved
933		// during gcMark (it shrinks stacks, including the
934		// outer function's stack), so we must not refer
935		// to any of its variables. Return back to the
936		// non-system stack to pick up the new addresses
937		// before continuing.
938	})
939
940	systemstack(func() {
941		work.heap2 = work.bytesMarked
942		if debug.gccheckmark > 0 {
943			// Run a full non-parallel, stop-the-world
944			// mark using checkmark bits, to check that we
945			// didn't forget to mark anything during the
946			// concurrent mark process.
947			startCheckmarks()
948			gcResetMarkState()
949			gcw := &getg().m.p.ptr().gcw
950			gcDrain(gcw, 0)
951			wbBufFlush1(getg().m.p.ptr())
952			gcw.dispose()
953			endCheckmarks()
954		}
955
956		// marking is complete so we can turn the write barrier off
957		setGCPhase(_GCoff)
958		gcSweep(work.mode)
959	})
960
961	_g_.m.traceback = 0
962	casgstatus(gp, _Gwaiting, _Grunning)
963
964	if trace.enabled {
965		traceGCDone()
966	}
967
968	// all done
969	mp.preemptoff = ""
970
971	if gcphase != _GCoff {
972		throw("gc done but gcphase != _GCoff")
973	}
974
975	// Record heapGoal and heap_inuse for scavenger.
976	gcController.lastHeapGoal = gcController.heapGoal
977	memstats.last_heap_inuse = memstats.heap_inuse
978
979	// Update GC trigger and pacing for the next cycle.
980	gcController.commit(nextTriggerRatio)
981
982	// Update timing memstats
983	now := nanotime()
984	sec, nsec, _ := time_now()
985	unixNow := sec*1e9 + int64(nsec)
986	work.pauseNS += now - work.pauseStart
987	work.tEnd = now
988	memstats.gcPauseDist.record(now - work.pauseStart)
989	atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
990	atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
991	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
992	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
993	memstats.pause_total_ns += uint64(work.pauseNS)
994
995	// Update work.totaltime.
996	sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
997	// We report idle marking time below, but omit it from the
998	// overall utilization here since it's "free".
999	markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
1000	markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
1001	cycleCpu := sweepTermCpu + markCpu + markTermCpu
1002	work.totaltime += cycleCpu
1003
1004	// Compute overall GC CPU utilization.
1005	totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
1006	memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
1007
1008	// Reset sweep state.
1009	sweep.nbgsweep = 0
1010	sweep.npausesweep = 0
1011
1012	if work.userForced {
1013		memstats.numforcedgc++
1014	}
1015
1016	// Bump GC cycle count and wake goroutines waiting on sweep.
1017	lock(&work.sweepWaiters.lock)
1018	memstats.numgc++
1019	injectglist(&work.sweepWaiters.list)
1020	unlock(&work.sweepWaiters.lock)
1021
1022	// Finish the current heap profiling cycle and start a new
1023	// heap profiling cycle. We do this before starting the world
1024	// so events don't leak into the wrong cycle.
1025	mProf_NextCycle()
1026
1027	// There may be stale spans in mcaches that need to be swept.
1028	// Those aren't tracked in any sweep lists, so we need to
1029	// count them against sweep completion until we ensure all
1030	// those spans have been forced out.
1031	sl := newSweepLocker()
1032	sl.blockCompletion()
1033
1034	systemstack(func() { startTheWorldWithSema(true) })
1035
1036	// Flush the heap profile so we can start a new cycle next GC.
1037	// This is relatively expensive, so we don't do it with the
1038	// world stopped.
1039	mProf_Flush()
1040
1041	// Prepare workbufs for freeing by the sweeper. We do this
1042	// asynchronously because it can take non-trivial time.
1043	prepareFreeWorkbufs()
1044
1045	// Ensure all mcaches are flushed. Each P will flush its own
1046	// mcache before allocating, but idle Ps may not. Since this
1047	// is necessary to sweep all spans, we need to ensure all
1048	// mcaches are flushed before we start the next GC cycle.
1049	systemstack(func() {
1050		forEachP(func(_p_ *p) {
1051			_p_.mcache.prepareForSweep()
1052		})
1053	})
1054	// Now that we've swept stale spans in mcaches, they don't
1055	// count against unswept spans.
1056	sl.dispose()
1057
1058	// Print gctrace before dropping worldsema. As soon as we drop
1059	// worldsema another cycle could start and smash the stats
1060	// we're trying to print.
1061	if debug.gctrace > 0 {
1062		util := int(memstats.gc_cpu_fraction * 100)
1063
1064		var sbuf [24]byte
1065		printlock()
1066		print("gc ", memstats.numgc,
1067			" @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
1068			util, "%: ")
1069		prev := work.tSweepTerm
1070		for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
1071			if i != 0 {
1072				print("+")
1073			}
1074			print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
1075			prev = ns
1076		}
1077		print(" ms clock, ")
1078		for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
1079			if i == 2 || i == 3 {
1080				// Separate mark time components with /.
1081				print("/")
1082			} else if i != 0 {
1083				print("+")
1084			}
1085			print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
1086		}
1087		print(" ms cpu, ",
1088			work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
1089			work.heapGoal>>20, " MB goal, ",
1090			work.maxprocs, " P")
1091		if work.userForced {
1092			print(" (forced)")
1093		}
1094		print("\n")
1095		printunlock()
1096	}
1097
1098	semrelease(&worldsema)
1099	semrelease(&gcsema)
1100	// Careful: another GC cycle may start now.
1101
1102	releasem(mp)
1103	mp = nil
1104
1105	// now that gc is done, kick off finalizer thread if needed
	if !_ConcurrentSweep {
1107		// give the queued finalizers, if any, a chance to run
1108		Gosched()
1109	}
1110}
1111
1112// gcBgMarkStartWorkers prepares background mark worker goroutines. These
1113// goroutines will not run until the mark phase, but they must be started while
1114// the work is not stopped and from a regular G stack. The caller must hold
1115// worldsema.
1116func gcBgMarkStartWorkers() {
1117	// Background marking is performed by per-P G's. Ensure that each P has
1118	// a background GC G.
1119	//
1120	// Worker Gs don't exit if gomaxprocs is reduced. If it is raised
1121	// again, we can reuse the old workers; no need to create new workers.
1122	for gcBgMarkWorkerCount < gomaxprocs {
1123		expectSystemGoroutine()
1124		go gcBgMarkWorker()
1125
1126		notetsleepg(&work.bgMarkReady, -1)
1127		noteclear(&work.bgMarkReady)
1128		// The worker is now guaranteed to be added to the pool before
1129		// its P's next findRunnableGCWorker.
1130
1131		gcBgMarkWorkerCount++
1132	}
1133}
1134
1135// gcBgMarkPrepare sets up state for background marking.
1136// Mutator assists must not yet be enabled.
1137func gcBgMarkPrepare() {
1138	// Background marking will stop when the work queues are empty
1139	// and there are no more workers (note that, since this is
1140	// concurrent, this may be a transient state, but mark
1141	// termination will clean it up). Between background workers
1142	// and assists, we don't really know how many workers there
1143	// will be, so we pretend to have an arbitrarily large number
1144	// of workers, almost all of which are "waiting". While a
1145	// worker is working it decrements nwait. If nproc == nwait,
1146	// there are no workers.
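	//
	// Concretely, each worker brackets its work like this (see
	// gcBgMarkWorker below; assists do the same in mgcmark.go):
	//
	//	atomic.Xadd(&work.nwait, -1) // start working
	//	// ... drain mark work ...
	//	if atomic.Xadd(&work.nwait, +1) == work.nproc && !gcMarkWorkAvailable(nil) {
	//		gcMarkDone() // last worker out and no work left
	//	}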
1147	work.nproc = ^uint32(0)
1148	work.nwait = ^uint32(0)
1149}
1150
1151// gcBgMarkWorker is an entry in the gcBgMarkWorkerPool. It points to a single
1152// gcBgMarkWorker goroutine.
1153type gcBgMarkWorkerNode struct {
1154	// Unused workers are managed in a lock-free stack. This field must be first.
1155	node lfnode
1156
1157	// The g of this worker.
1158	gp guintptr
1159
1160	// Release this m on park. This is used to communicate with the unlock
1161	// function, which cannot access the G's stack. It is unused outside of
1162	// gcBgMarkWorker().
1163	m muintptr
1164}
1165
1166func gcBgMarkWorker() {
1167	setSystemGoroutine()
1168
1169	gp := getg()
1170
1171	// We pass node to a gopark unlock function, so it can't be on
1172	// the stack (see gopark). Prevent deadlock from recursively
1173	// starting GC by disabling preemption.
1174	gp.m.preemptoff = "GC worker init"
1175	node := new(gcBgMarkWorkerNode)
1176	gp.m.preemptoff = ""
1177
1178	node.gp.set(gp)
1179
1180	node.m.set(acquirem())
1181	notewakeup(&work.bgMarkReady)
1182	// After this point, the background mark worker is generally scheduled
1183	// cooperatively by gcController.findRunnableGCWorker. While performing
1184	// work on the P, preemption is disabled because we are working on
1185	// P-local work buffers. When the preempt flag is set, this puts itself
1186	// into _Gwaiting to be woken up by gcController.findRunnableGCWorker
1187	// at the appropriate time.
1188	//
1189	// When preemption is enabled (e.g., while in gcMarkDone), this worker
1190	// may be preempted and schedule as a _Grunnable G from a runq. That is
1191	// fine; it will eventually gopark again for further scheduling via
1192	// findRunnableGCWorker.
1193	//
1194	// Since we disable preemption before notifying bgMarkReady, we
1195	// guarantee that this G will be in the worker pool for the next
1196	// findRunnableGCWorker. This isn't strictly necessary, but it reduces
1197	// latency between _GCmark starting and the workers starting.
1198
1199	for {
1200		// Go to sleep until woken by
1201		// gcController.findRunnableGCWorker.
1202		gopark(func(g *g, nodep unsafe.Pointer) bool {
1203			node := (*gcBgMarkWorkerNode)(nodep)
1204
1205			if mp := node.m.ptr(); mp != nil {
1206				// The worker G is no longer running; release
1207				// the M.
1208				//
1209				// N.B. it is _safe_ to release the M as soon
1210				// as we are no longer performing P-local mark
1211				// work.
1212				//
1213				// However, since we cooperatively stop work
1214				// when gp.preempt is set, if we releasem in
1215				// the loop then the following call to gopark
1216				// would immediately preempt the G. This is
1217				// also safe, but inefficient: the G must
1218				// schedule again only to enter gopark and park
1219				// again. Thus, we defer the release until
1220				// after parking the G.
1221				releasem(mp)
1222			}
1223
1224			// Release this G to the pool.
1225			gcBgMarkWorkerPool.push(&node.node)
1226			// Note that at this point, the G may immediately be
1227			// rescheduled and may be running.
1228			return true
1229		}, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceEvGoBlock, 0)
1230
1231		// Preemption must not occur here, or another G might see
1232		// p.gcMarkWorkerMode.
1233
1234		// Disable preemption so we can use the gcw. If the
1235		// scheduler wants to preempt us, we'll stop draining,
1236		// dispose the gcw, and then preempt.
1237		node.m.set(acquirem())
1238		pp := gp.m.p.ptr() // P can't change with preemption disabled.
1239
1240		if gcBlackenEnabled == 0 {
1241			println("worker mode", pp.gcMarkWorkerMode)
1242			throw("gcBgMarkWorker: blackening not enabled")
1243		}
1244
1245		if pp.gcMarkWorkerMode == gcMarkWorkerNotWorker {
1246			throw("gcBgMarkWorker: mode not set")
1247		}
1248
1249		startTime := nanotime()
1250		pp.gcMarkWorkerStartTime = startTime
1251
1252		decnwait := atomic.Xadd(&work.nwait, -1)
1253		if decnwait == work.nproc {
1254			println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
1255			throw("work.nwait was > work.nproc")
1256		}
1257
1258		systemstack(func() {
1259			// Mark our goroutine preemptible so its stack
1260			// can be scanned. This lets two mark workers
1261			// scan each other (otherwise, they would
1262			// deadlock). We must not modify anything on
1263			// the G stack. However, stack shrinking is
1264			// disabled for mark workers, so it is safe to
1265			// read from the G stack.
1266			casgstatus(gp, _Grunning, _Gwaiting)
1267			switch pp.gcMarkWorkerMode {
1268			default:
1269				throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
1270			case gcMarkWorkerDedicatedMode:
1271				gcDrain(&pp.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
1272				if gp.preempt {
1273					// We were preempted. This is
1274					// a useful signal to kick
1275					// everything out of the run
1276					// queue so it can run
1277					// somewhere else.
1278					if drainQ, n := runqdrain(pp); n > 0 {
1279						lock(&sched.lock)
1280						globrunqputbatch(&drainQ, int32(n))
1281						unlock(&sched.lock)
1282					}
1283				}
1284				// Go back to draining, this time
1285				// without preemption.
1286				gcDrain(&pp.gcw, gcDrainFlushBgCredit)
1287			case gcMarkWorkerFractionalMode:
1288				gcDrain(&pp.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1289			case gcMarkWorkerIdleMode:
1290				gcDrain(&pp.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1291			}
1292			casgstatus(gp, _Gwaiting, _Grunning)
1293		})
1294
1295		// Account for time.
1296		duration := nanotime() - startTime
1297		switch pp.gcMarkWorkerMode {
1298		case gcMarkWorkerDedicatedMode:
1299			atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
1300			atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
1301		case gcMarkWorkerFractionalMode:
1302			atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
1303			atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
1304		case gcMarkWorkerIdleMode:
1305			atomic.Xaddint64(&gcController.idleMarkTime, duration)
1306		}
1307
1308		// Was this the last worker and did we run out
1309		// of work?
1310		incnwait := atomic.Xadd(&work.nwait, +1)
1311		if incnwait > work.nproc {
1312			println("runtime: p.gcMarkWorkerMode=", pp.gcMarkWorkerMode,
1313				"work.nwait=", incnwait, "work.nproc=", work.nproc)
1314			throw("work.nwait > work.nproc")
1315		}
1316
1317		// We'll releasem after this point and thus this P may run
1318		// something else. We must clear the worker mode to avoid
1319		// attributing the mode to a different (non-worker) G in
1320		// traceGoStart.
1321		pp.gcMarkWorkerMode = gcMarkWorkerNotWorker
1322
1323		// If this worker reached a background mark completion
1324		// point, signal the main GC goroutine.
1325		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1326			// We don't need the P-local buffers here, allow
			// preemption because we may schedule like a regular
1328			// goroutine in gcMarkDone (block on locks, etc).
1329			releasem(node.m.ptr())
1330			node.m.set(nil)
1331
1332			gcMarkDone()
1333		}
1334	}
1335}
1336
1337// gcMarkWorkAvailable reports whether executing a mark worker
1338// on p is potentially useful. p may be nil, in which case it only
1339// checks the global sources of work.
1340func gcMarkWorkAvailable(p *p) bool {
1341	if p != nil && !p.gcw.empty() {
1342		return true
1343	}
1344	if !work.full.empty() {
1345		return true // global work available
1346	}
1347	if work.markrootNext < work.markrootJobs {
1348		return true // root scan work available
1349	}
1350	return false
1351}
1352
1353// gcMark runs the mark (or, for concurrent GC, mark termination)
1354// All gcWork caches must be empty.
1355// STW is in effect at this point.
1356func gcMark(startTime int64) {
1357	if debug.allocfreetrace > 0 {
1358		tracegc()
1359	}
1360
1361	if gcphase != _GCmarktermination {
1362		throw("in gcMark expecting to see gcphase as _GCmarktermination")
1363	}
1364	work.tstart = startTime
1365
1366	// Check that there's no marking work remaining.
1367	if work.full != 0 || work.markrootNext < work.markrootJobs {
1368		print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n")
1369		panic("non-empty mark queue after concurrent mark")
1370	}
1371
1372	if debug.gccheckmark > 0 {
1373		// This is expensive when there's a large number of
1374		// Gs, so only do it if checkmark is also enabled.
1375		gcMarkRootCheck()
1376	}
1377	if work.full != 0 {
1378		throw("work.full != 0")
1379	}
1380
1381	// Clear out buffers and double-check that all gcWork caches
1382	// are empty. This should be ensured by gcMarkDone before we
1383	// enter mark termination.
1384	//
1385	// TODO: We could clear out buffers just before mark if this
1386	// has a non-negligible impact on STW time.
1387	for _, p := range allp {
1388		// The write barrier may have buffered pointers since
1389		// the gcMarkDone barrier. However, since the barrier
1390		// ensured all reachable objects were marked, all of
1391		// these must be pointers to black objects. Hence we
1392		// can just discard the write barrier buffer.
1393		if debug.gccheckmark > 0 {
1394			// For debugging, flush the buffer and make
1395			// sure it really was all marked.
1396			wbBufFlush1(p)
1397		} else {
1398			p.wbBuf.reset()
1399		}
1400
1401		gcw := &p.gcw
1402		if !gcw.empty() {
1403			printlock()
1404			print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
1405			if gcw.wbuf1 == nil {
1406				print(" wbuf1=<nil>")
1407			} else {
1408				print(" wbuf1.n=", gcw.wbuf1.nobj)
1409			}
1410			if gcw.wbuf2 == nil {
1411				print(" wbuf2=<nil>")
1412			} else {
1413				print(" wbuf2.n=", gcw.wbuf2.nobj)
1414			}
1415			print("\n")
1416			throw("P has cached GC work at end of mark termination")
1417		}
1418		// There may still be cached empty buffers, which we
1419		// need to flush since we're going to free them. Also,
1420		// there may be non-zero stats because we allocated
1421		// black after the gcMarkDone barrier.
1422		gcw.dispose()
1423	}
1424
1425	// Update the marked heap stat.
1426	gcController.heapMarked = work.bytesMarked
1427
1428	// Flush scanAlloc from each mcache since we're about to modify
1429	// heapScan directly. If we were to flush this later, then scanAlloc
1430	// might have incorrect information.
1431	for _, p := range allp {
1432		c := p.mcache
1433		if c == nil {
1434			continue
1435		}
1436		gcController.heapScan += uint64(c.scanAlloc)
1437		c.scanAlloc = 0
1438	}
1439
1440	// Update other GC heap size stats. This must happen after
1441	// cachestats (which flushes local statistics to these) and
1442	// flushallmcaches (which modifies gcController.heapLive).
1443	gcController.heapLive = work.bytesMarked
1444	gcController.heapScan = uint64(gcController.scanWork)
1445
1446	if trace.enabled {
1447		traceHeapAlloc()
1448	}
1449}
1450
1451// gcSweep must be called on the system stack because it acquires the heap
1452// lock. See mheap for details.
1453//
1454// The world must be stopped.
1455//
1456//go:systemstack
1457func gcSweep(mode gcMode) {
1458	assertWorldStopped()
1459
1460	if gcphase != _GCoff {
1461		throw("gcSweep being done but phase is not GCoff")
1462	}
1463
1464	lock(&mheap_.lock)
1465	mheap_.sweepgen += 2
1466	mheap_.sweepDrained = 0
1467	mheap_.pagesSwept = 0
1468	mheap_.sweepArenas = mheap_.allArenas
1469	mheap_.reclaimIndex = 0
1470	mheap_.reclaimCredit = 0
1471	unlock(&mheap_.lock)
1472
1473	sweep.centralIndex.clear()
1474
1475	if !_ConcurrentSweep || mode == gcForceBlockMode {
1476		// Special case synchronous sweep.
1477		// Record that no proportional sweeping has to happen.
1478		lock(&mheap_.lock)
1479		mheap_.sweepPagesPerByte = 0
1480		unlock(&mheap_.lock)
1481		// Sweep all spans eagerly.
1482		for sweepone() != ^uintptr(0) {
1483			sweep.npausesweep++
1484		}
1485		// Free workbufs eagerly.
1486		prepareFreeWorkbufs()
1487		for freeSomeWbufs(false) {
1488		}
1489		// All "free" events for this mark/sweep cycle have
1490		// now happened, so we can make this profile cycle
1491		// available immediately.
1492		mProf_NextCycle()
1493		mProf_Flush()
1494		return
1495	}
1496
1497	// Background sweep.
1498	lock(&sweep.lock)
1499	if sweep.parked {
1500		sweep.parked = false
1501		ready(sweep.g, 0, true)
1502	}
1503	unlock(&sweep.lock)
1504}
1505
1506// gcResetMarkState resets global state prior to marking (concurrent
1507// or STW) and resets the stack scan state of all Gs.
1508//
1509// This is safe to do without the world stopped because any Gs created
1510// during or after this will start out in the reset state.
1511//
1512// gcResetMarkState must be called on the system stack because it acquires
1513// the heap lock. See mheap for details.
1514//
1515//go:systemstack
1516func gcResetMarkState() {
1517	// This may be called during a concurrent phase, so lock to make sure
1518	// allgs doesn't change.
1519	forEachG(func(gp *g) {
1520		gp.gcscandone = false // set to true in gcphasework
1521		gp.gcAssistBytes = 0
1522	})
1523
1524	// Clear page marks. This is just 1MB per 64GB of heap, so the
1525	// time here is pretty trivial.
1526	lock(&mheap_.lock)
1527	arenas := mheap_.allArenas
1528	unlock(&mheap_.lock)
1529	for _, ai := range arenas {
1530		ha := mheap_.arenas[ai.l1()][ai.l2()]
1531		for i := range ha.pageMarks {
1532			ha.pageMarks[i] = 0
1533		}
1534	}
1535
1536	work.bytesMarked = 0
1537	work.initialHeapLive = atomic.Load64(&gcController.heapLive)
1538}
1539
1540// Hooks for other packages
1541
1542var poolcleanup func()
1543
//go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
1545func sync_runtime_registerPoolCleanup(f func()) {
1546	poolcleanup = f
1547}
1548
1549func clearpools() {
1550	// clear sync.Pools
1551	if poolcleanup != nil {
1552		poolcleanup()
1553	}
1554
1555	// Clear central sudog cache.
1556	// Leave per-P caches alone, they have strictly bounded size.
1557	// Disconnect cached list before dropping it on the floor,
1558	// so that a dangling ref to one entry does not pin all of them.
1559	lock(&sched.sudoglock)
1560	var sg, sgnext *sudog
1561	for sg = sched.sudogcache; sg != nil; sg = sgnext {
1562		sgnext = sg.next
1563		sg.next = nil
1564	}
1565	sched.sudogcache = nil
1566	unlock(&sched.sudoglock)
1567
1568	// Clear central defer pools.
1569	// Leave per-P pools alone, they have strictly bounded size.
1570	lock(&sched.deferlock)
1571	// disconnect cached list before dropping it on the floor,
1572	// so that a dangling ref to one entry does not pin all of them.
1573	var d, dlink *_defer
1574	for d = sched.deferpool; d != nil; d = dlink {
1575		dlink = d.link
1576		d.link = nil
1577	}
1578	sched.deferpool = nil
1579	unlock(&sched.deferlock)
1580}
1581
1582// Timing
1583
1584// itoaDiv formats val/(10**dec) into buf.
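// For example, itoaDiv(buf, 12345, 3) renders "12.345" into the tail of buf.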
1585func itoaDiv(buf []byte, val uint64, dec int) []byte {
1586	i := len(buf) - 1
1587	idec := i - dec
1588	for val >= 10 || i >= idec {
1589		buf[i] = byte(val%10 + '0')
1590		i--
1591		if i == idec {
1592			buf[i] = '.'
1593			i--
1594		}
1595		val /= 10
1596	}
1597	buf[i] = byte(val + '0')
1598	return buf[i:]
1599}
1600
1601// fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
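// For example, 2500000ns formats as "2.5" and 12345678ns as "12"
// (whole milliseconds once ns >= 10e6).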
1602func fmtNSAsMS(buf []byte, ns uint64) []byte {
1603	if ns >= 10e6 {
1604		// Format as whole milliseconds.
1605		return itoaDiv(buf, ns/1e6, 0)
1606	}
1607	// Format two digits of precision, with at most three decimal places.
1608	x := ns / 1e3
1609	if x == 0 {
1610		buf[0] = '0'
1611		return buf[:1]
1612	}
1613	dec := 3
1614	for x >= 100 {
1615		x /= 10
1616		dec--
1617	}
1618	return itoaDiv(buf, x, dec)
1619}
1620
1621// Helpers for testing GC.
1622
1623// gcTestIsReachable performs a GC and returns a bit set where bit i
1624// is set if ptrs[i] is reachable.
1625func gcTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
1626	// This takes the pointers as unsafe.Pointers in order to keep
1627	// them live long enough for us to attach specials. After
1628	// that, we drop our references to them.
1629
1630	if len(ptrs) > 64 {
1631		panic("too many pointers for uint64 mask")
1632	}
1633
1634	// Block GC while we attach specials and drop our references
1635	// to ptrs. Otherwise, if a GC is in progress, it could mark
1636	// them reachable via this function before we have a chance to
1637	// drop them.
1638	semacquire(&gcsema)
1639
1640	// Create reachability specials for ptrs.
1641	specials := make([]*specialReachable, len(ptrs))
1642	for i, p := range ptrs {
1643		lock(&mheap_.speciallock)
1644		s := (*specialReachable)(mheap_.specialReachableAlloc.alloc())
1645		unlock(&mheap_.speciallock)
1646		s.special.kind = _KindSpecialReachable
1647		if !addspecial(p, &s.special) {
1648			throw("already have a reachable special (duplicate pointer?)")
1649		}
1650		specials[i] = s
1651		// Make sure we don't retain ptrs.
1652		ptrs[i] = nil
1653	}
1654
1655	semrelease(&gcsema)
1656
1657	// Force a full GC and sweep.
1658	GC()
1659
1660	// Process specials.
1661	for i, s := range specials {
1662		if !s.done {
1663			printlock()
1664			println("runtime: object", i, "was not swept")
1665			throw("IsReachable failed")
1666		}
1667		if s.reachable {
1668			mask |= 1 << i
1669		}
1670		lock(&mheap_.speciallock)
1671		mheap_.specialReachableAlloc.free(unsafe.Pointer(s))
1672		unlock(&mheap_.speciallock)
1673	}
1674
1675	return mask
1676}
1677
1678// onCurrentStack reports whether the argument is on the current stack.
1679// It is implemented in C.
1680func onCurrentStack(uintptr) bool
1681
1682// getBSS returns the start of the BSS section.
1683// It is implemented in C.
1684func getBSS() uintptr
1685
1686// gcTestPointerClass returns the category of what p points to, one of:
1687// "heap", "stack", "data", "bss", "other". This is useful for checking
1688// that a test is doing what it's intended to do.
1689//
1690// This is nosplit simply to avoid extra pointer shuffling that may
1691// complicate a test.
1692//
1693//go:nosplit
1694func gcTestPointerClass(p unsafe.Pointer) string {
1695	p2 := uintptr(noescape(p))
1696	if onCurrentStack(p2) {
1697		return "stack"
1698	}
1699	if base, _, _ := findObject(p2, 0, 0, false); base != 0 {
1700		return "heap"
1701	}
1702	bss := getBSS()
1703	if p2 >= getText() && p2 < bss {
1704		return "data"
1705	}
1706	if p2 >= bss && p2 < getEnd() {
1707		return "bss"
1708	}
1709	KeepAlive(p)
1710	return "other"
1711}
1712